aboutsummaryrefslogtreecommitdiff
path: root/internal
diff options
context:
space:
mode:
Diffstat (limited to 'internal')
-rw-r--r--internal/headers.go4
-rw-r--r--internal/ogtags/cache.go51
-rw-r--r--internal/ogtags/cache_test.go122
-rw-r--r--internal/ogtags/fetch.go69
-rw-r--r--internal/ogtags/fetch_test.go119
-rw-r--r--internal/ogtags/integration_test.go155
-rw-r--r--internal/ogtags/ogtags.go51
-rw-r--r--internal/ogtags/ogtags_test.go100
-rw-r--r--internal/ogtags/parse.go81
-rw-r--r--internal/ogtags/parse_test.go295
-rw-r--r--internal/test/playwright_test.go24
11 files changed, 1062 insertions, 9 deletions
diff --git a/internal/headers.go b/internal/headers.go
index 5c6a218..bdb5e9e 100644
--- a/internal/headers.go
+++ b/internal/headers.go
@@ -13,6 +13,7 @@ import (
// UnchangingCache sets the Cache-Control header to cache a response for 1 year if
// and only if the application is compiled in "release" mode by Docker.
func UnchangingCache(next http.Handler) http.Handler {
+ //goland:noinspection GoBoolExpressions
if anubis.Version == "devel" {
return next
}
@@ -68,11 +69,10 @@ func XForwardedForToXRealIP(next http.Handler) http.Handler {
func NoStoreCache(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Cache-Control", "no-store")
- next.ServeHTTP(w, r)
+ next.ServeHTTP(w, r)
})
}
-
// Do not allow browsing directory listings in paths that end with /
func NoBrowsing(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
diff --git a/internal/ogtags/cache.go b/internal/ogtags/cache.go
new file mode 100644
index 0000000..0d1a615
--- /dev/null
+++ b/internal/ogtags/cache.go
@@ -0,0 +1,51 @@
+package ogtags
+
+import (
+ "errors"
+ "log/slog"
+ "net/url"
+ "syscall"
+)
+
+// GetOGTags returns the approved meta tags for the upstream page that url maps
+// to. A cached result is returned when one exists; otherwise the page is
+// fetched, parsed, and the extracted tags are cached for c.ogTimeToLive.
+//
+// A refused connection or an ErrNotFound from the fetch is not treated as a
+// failure: both yield (nil, nil). Any other fetch error is returned unchanged.
+func (c *OGTagCache) GetOGTags(url *url.URL) (map[string]string, error) {
+	if url == nil {
+		return nil, errors.New("nil URL provided, cannot fetch OG tags")
+	}
+
+	target := c.getTarget(url)
+
+	// Serve from the cache whenever possible.
+	if hit := c.checkCache(target); hit != nil {
+		return hit, nil
+	}
+
+	// Cache miss: fetch and parse the upstream document.
+	doc, err := c.fetchHTMLDocument(target)
+	switch {
+	case errors.Is(err, syscall.ECONNREFUSED):
+		slog.Debug("Connection refused, returning empty tags")
+		return nil, nil
+	case errors.Is(err, ErrNotFound):
+		// Not even worth a debug log.
+		return nil, nil
+	case err != nil:
+		return nil, err
+	}
+
+	// Extract the tags and remember them for later requests.
+	tags := c.extractOGTags(doc)
+	c.cache.Set(target, tags, c.ogTimeToLive)
+
+	return tags, nil
+}
+
+// checkCache looks urlStr up in the cache and returns the stored tags, or nil
+// when there is no entry. Both outcomes are logged at debug level.
+func (c *OGTagCache) checkCache(urlStr string) map[string]string {
+	cachedTags, found := c.cache.Get(urlStr)
+	if !found {
+		slog.Debug("cache miss", "url", urlStr)
+		return nil
+	}
+
+	slog.Debug("cache hit", "tags", cachedTags)
+	return cachedTags
+}
diff --git a/internal/ogtags/cache_test.go b/internal/ogtags/cache_test.go
new file mode 100644
index 0000000..cd32414
--- /dev/null
+++ b/internal/ogtags/cache_test.go
@@ -0,0 +1,122 @@
+package ogtags
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "net/url"
+ "testing"
+ "time"
+)
+
+func TestCheckCache(t *testing.T) {
+ cache := NewOGTagCache("http://example.com", true, time.Minute)
+
+ // Set up test data
+ urlStr := "http://example.com/page"
+ expectedTags := map[string]string{
+ "og:title": "Test Title",
+ "og:description": "Test Description",
+ }
+
+ // Test cache miss
+ tags := cache.checkCache(urlStr)
+ if tags != nil {
+ t.Errorf("expected nil tags on cache miss, got %v", tags)
+ }
+
+ // Manually add to cache
+ cache.cache.Set(urlStr, expectedTags, time.Minute)
+
+ // Test cache hit
+ tags = cache.checkCache(urlStr)
+ if tags == nil {
+ t.Fatal("expected non-nil tags on cache hit, got nil")
+ }
+
+ for key, expectedValue := range expectedTags {
+ if value, ok := tags[key]; !ok || value != expectedValue {
+ t.Errorf("expected %s: %s, got: %s", key, expectedValue, value)
+ }
+ }
+}
+
+// TestGetOGTags fetches tags from a test server once and then checks that two
+// further lookups are answered from the cache without hitting the server again.
+func TestGetOGTags(t *testing.T) {
+	var loadCount int // number of times the test route has been served
+
+	// Test server that serves a sample HTML page with OG tags.
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		loadCount++
+		if loadCount > 1 {
+			// The handler runs on a server goroutine, where FailNow/Fatalf must
+			// not be called; record the failure with Errorf instead.
+			t.Errorf("Test route loaded more than once, cache failed")
+		}
+		w.Header().Set("Content-Type", "text/html")
+		w.Write([]byte(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="og:title" content="Test Title" />
+ <meta property="og:description" content="Test Description" />
+ <meta property="og:image" content="http://example.com/image.jpg" />
+ </head>
+ <body>
+ <p>Hello, world!</p>
+ </body>
+ </html>
+ `))
+	}))
+	defer ts.Close()
+
+	cache := NewOGTagCache(ts.URL, true, 1*time.Minute)
+
+	parsedURL, err := url.Parse(ts.URL)
+	if err != nil {
+		t.Fatalf("failed to parse test server URL: %v", err)
+	}
+
+	expectedTags := map[string]string{
+		"og:title":       "Test Title",
+		"og:description": "Test Description",
+		"og:image":       "http://example.com/image.jpg",
+	}
+
+	// First lookup: goes to the test server.
+	firstTags, err := cache.GetOGTags(parsedURL)
+	if err != nil {
+		t.Fatalf("failed to get OG tags: %v", err)
+	}
+	for key, expectedValue := range expectedTags {
+		if value, ok := firstTags[key]; !ok || value != expectedValue {
+			t.Errorf("expected %s: %s, got: %s", key, expectedValue, value)
+		}
+	}
+
+	// Second and third lookups: must be served from the cache.
+	secondTags, err := cache.GetOGTags(parsedURL)
+	if err != nil {
+		t.Fatalf("failed to get OG tags from cache: %v", err)
+	}
+	thirdTags, err := cache.GetOGTags(parsedURL)
+	if err != nil {
+		t.Fatalf("failed to get OG tags from cache: %v", err)
+	}
+
+	for key, expectedValue := range expectedTags {
+		secondValue, ok := secondTags[key]
+		if !ok || secondValue != expectedValue {
+			t.Errorf("expected %s: %s, got: %s", key, expectedValue, secondValue)
+		}
+
+		// Successive cache reads must agree with each other.
+		thirdValue, ok := thirdTags[key]
+		if !ok || secondValue != thirdValue {
+			t.Errorf("Cache does not line up: expected %s: %s, got: %s", key, secondValue, thirdValue)
+		}
+	}
+}
diff --git a/internal/ogtags/fetch.go b/internal/ogtags/fetch.go
new file mode 100644
index 0000000..3ea9aac
--- /dev/null
+++ b/internal/ogtags/fetch.go
@@ -0,0 +1,69 @@
+package ogtags
+
+import (
+ "errors"
+ "fmt"
+ "golang.org/x/net/html"
+ "log/slog"
+ "mime"
+ "net"
+ "net/http"
+)
+
+var (
+ // ErrNotFound is returned by fetchHTMLDocument for any non-200 response.
+ // TODO: consider moving this into a shared errors package.
+ ErrNotFound = errors.New("page not found") /*todo: refactor into common errors lib? */
+ // emptyMap is the value cached for failed fetches. It must be non-nil,
+ // because checkCache reports a nil map as a cache miss.
+ emptyMap = map[string]string{} // used to indicate an empty result in the cache. Can't use nil as it would be a cache miss.
+)
+
+// fetchHTMLDocument GETs urlStr with c.client and parses the response body as
+// HTML, reading at most c.maxContentLength bytes.
+//
+// Errors:
+//   - transport failures are wrapped and returned; when the failure is a
+//     timeout, an empty result is cached for half the TTL first
+//   - any non-200 status caches an empty result for the full TTL and returns
+//     ErrNotFound
+//   - a missing, malformed, or non-HTML Content-Type is rejected
+//   - a body over the size limit or unparsable HTML is rejected
+func (c *OGTagCache) fetchHTMLDocument(urlStr string) (*html.Node, error) {
+	resp, err := c.client.Get(urlStr)
+	if err != nil {
+		var netErr net.Error
+		if errors.As(err, &netErr) && netErr.Timeout() {
+			slog.Debug("og: request timed out", "url", urlStr)
+			c.cache.Set(urlStr, emptyMap, c.ogTimeToLive/2) // Cache empty result for half the TTL to not spam the server
+		}
+		return nil, fmt.Errorf("http get failed: %w", err)
+	}
+	// The receiver of a deferred method call is evaluated here, so this closes
+	// the original response body; the size-limiting reader created below only
+	// wraps it for reading.
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		slog.Debug("og: received non-OK status code", "url", urlStr, "status", resp.StatusCode)
+		c.cache.Set(urlStr, emptyMap, c.ogTimeToLive) // Cache empty result for non-successful status codes
+		return nil, ErrNotFound
+	}
+
+	// Only HTML-like media types are parsed.
+	ct := resp.Header.Get("Content-Type")
+	if ct == "" {
+		// Assume a non-HTML body.
+		return nil, errors.New("missing Content-Type header")
+	}
+	mediaType, _, err := mime.ParseMediaType(ct)
+	if err != nil {
+		// Malformed Content-Type header.
+		return nil, fmt.Errorf("invalid Content-Type '%s': %w", ct, err)
+	}
+	if mediaType != "text/html" && mediaType != "application/xhtml+xml" {
+		return nil, fmt.Errorf("unsupported Content-Type: %s", mediaType)
+	}
+
+	// Bound the number of bytes the parser may consume.
+	body := http.MaxBytesReader(nil, resp.Body, c.maxContentLength)
+
+	doc, err := html.Parse(body)
+	if err != nil {
+		// Distinguish "limit exceeded" from a genuine parse failure.
+		var maxBytesErr *http.MaxBytesError
+		if errors.As(err, &maxBytesErr) {
+			slog.Debug("og: content exceeded max length", "url", urlStr, "limit", c.maxContentLength)
+			return nil, fmt.Errorf("content too large: exceeded %d bytes", c.maxContentLength)
+		}
+		// Parsing error (e.g. malformed HTML).
+		return nil, fmt.Errorf("failed to parse HTML: %w", err)
+	}
+
+	return doc, nil
+}
diff --git a/internal/ogtags/fetch_test.go b/internal/ogtags/fetch_test.go
new file mode 100644
index 0000000..60af957
--- /dev/null
+++ b/internal/ogtags/fetch_test.go
@@ -0,0 +1,119 @@
+package ogtags
+
+import (
+ "fmt"
+ "io"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "strings"
+ "testing"
+ "time"
+)
+
+// TestFetchHTMLDocument drives fetchHTMLDocument against a local test server
+// for valid, empty, missing, wrongly-typed, and oversized responses.
+func TestFetchHTMLDocument(t *testing.T) {
+	tests := []struct {
+		name          string
+		htmlContent   string
+		contentType   string
+		statusCode    int
+		contentLength int64 // when > 0, the server streams this many filler bytes
+		expectError   bool
+	}{
+		{
+			name: "Valid HTML",
+			htmlContent: `<!DOCTYPE html>
+ <html>
+ <head><title>Test</title></head>
+ <body><p>Test content</p></body>
+ </html>`,
+			contentType: "text/html",
+			statusCode:  http.StatusOK,
+			expectError: false,
+		},
+		{
+			name:        "Empty HTML",
+			htmlContent: "",
+			contentType: "text/html",
+			statusCode:  http.StatusOK,
+			expectError: false,
+		},
+		{
+			name:        "Not found error",
+			htmlContent: "",
+			contentType: "text/html",
+			statusCode:  http.StatusNotFound,
+			expectError: true,
+		},
+		{
+			name:        "Unsupported Content-Type",
+			htmlContent: "*Insert rick roll here*",
+			contentType: "video/mp4",
+			statusCode:  http.StatusOK,
+			expectError: true,
+		},
+		{
+			name:        "Too large content",
+			contentType: "text/html",
+			statusCode:  http.StatusOK,
+			expectError: true,
+			// 17 MiB: one MiB over the 16 MiB limit set by NewOGTagCache.
+			contentLength: 17 << 20,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if tt.contentType != "" {
+					w.Header().Set("Content-Type", tt.contentType)
+				}
+				if tt.contentLength <= 0 {
+					w.WriteHeader(tt.statusCode)
+					w.Write([]byte(tt.htmlContent))
+					return
+				}
+
+				// Actually send contentLength bytes so the client-side limit is
+				// what triggers the error, not a truncated body.
+				w.Header().Set("Content-Length", fmt.Sprintf("%d", tt.contentLength))
+				w.WriteHeader(tt.statusCode)
+				chunk := strings.Repeat("X", 64<<10)
+				for remaining := tt.contentLength; remaining > 0; {
+					n := int64(len(chunk))
+					if remaining < n {
+						n = remaining
+					}
+					if _, err := io.WriteString(w, chunk[:n]); err != nil {
+						// The client stops reading once its limit is hit.
+						return
+					}
+					remaining -= n
+				}
+			}))
+			defer ts.Close()
+
+			cache := NewOGTagCache("", true, time.Minute)
+			doc, err := cache.fetchHTMLDocument(ts.URL)
+
+			if tt.expectError {
+				if err == nil {
+					t.Error("expected error, got nil")
+				}
+				if doc != nil {
+					t.Error("expected nil document on error, got non-nil")
+				}
+				return
+			}
+
+			if err != nil {
+				t.Errorf("unexpected error: %v", err)
+			}
+			if doc == nil {
+				t.Error("expected non-nil document, got nil")
+			}
+		})
+	}
+}
+
+func TestFetchHTMLDocumentInvalidURL(t *testing.T) {
+ if os.Getenv("DONT_USE_NETWORK") != "" {
+ t.Skip("test requires theoretical network egress")
+ }
+
+ cache := NewOGTagCache("", true, time.Minute)
+
+ doc, err := cache.fetchHTMLDocument("http://invalid.url.that.doesnt.exist.example")
+
+ if err == nil {
+ t.Error("expected error for invalid URL, got nil")
+ }
+
+ if doc != nil {
+ t.Error("expected nil document for invalid URL, got non-nil")
+ }
+}
diff --git a/internal/ogtags/integration_test.go b/internal/ogtags/integration_test.go
new file mode 100644
index 0000000..9eaaa3a
--- /dev/null
+++ b/internal/ogtags/integration_test.go
@@ -0,0 +1,155 @@
+package ogtags
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "net/url"
+ "testing"
+ "time"
+)
+
+// TestIntegrationGetOGTags runs GetOGTags end to end against a test server that
+// serves pages with some, many, or no OG tags, plus a 404 route, and then
+// checks that a second lookup for the same URL returns the same result.
+func TestIntegrationGetOGTags(t *testing.T) {
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+
+		switch r.URL.Path {
+		case "/simple":
+			w.Write([]byte(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="og:title" content="Simple Page" />
+ <meta property="og:type" content="website" />
+ </head>
+ <body><p>Simple page content</p></body>
+ </html>
+ `))
+		case "/complete":
+			w.Write([]byte(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="og:title" content="Complete Page" />
+ <meta property="og:description" content="A page with many OG tags" />
+ <meta property="og:image" content="http://example.com/image.jpg" />
+ <meta property="og:url" content="http://example.com/complete" />
+ <meta property="og:type" content="article" />
+ </head>
+ <body><p>Complete page content</p></body>
+ </html>
+ `))
+		case "/no-og":
+			w.Write([]byte(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>No OG Tags</title>
+ </head>
+ <body><p>No OG tags here</p></body>
+ </html>
+ `))
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer ts.Close()
+
+	testCases := []struct {
+		name         string
+		path         string
+		query        string
+		expectedTags map[string]string
+		expectError  bool
+	}{
+		{
+			name:  "Simple page",
+			path:  "/simple",
+			query: "",
+			expectedTags: map[string]string{
+				"og:title": "Simple Page",
+				"og:type":  "website",
+			},
+			expectError: false,
+		},
+		{
+			name:  "Complete page",
+			path:  "/complete",
+			query: "ref=test",
+			expectedTags: map[string]string{
+				"og:title":       "Complete Page",
+				"og:description": "A page with many OG tags",
+				"og:image":       "http://example.com/image.jpg",
+				"og:url":         "http://example.com/complete",
+				"og:type":        "article",
+			},
+			expectError: false,
+		},
+		{
+			name:         "Page with no OG tags",
+			path:         "/no-og",
+			query:        "",
+			expectedTags: map[string]string{},
+			expectError:  false,
+		},
+		{
+			// A 404 is reported as "no tags", not as an error.
+			name:         "Non-existent page",
+			path:         "/not-found",
+			query:        "",
+			expectedTags: nil,
+			expectError:  false,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Fresh cache per case so cases cannot influence each other.
+			cache := NewOGTagCache(ts.URL, true, 1*time.Minute)
+
+			testURL, err := url.Parse(ts.URL)
+			if err != nil {
+				t.Fatalf("failed to parse test server URL: %v", err)
+			}
+			testURL.Path = tc.path
+			testURL.RawQuery = tc.query
+
+			ogTags, err := cache.GetOGTags(testURL)
+
+			if tc.expectError {
+				if err == nil {
+					t.Error("expected error, got nil")
+				}
+				return
+			}
+
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			// Every expected tag is present with the right value...
+			for key, expectedValue := range tc.expectedTags {
+				if value, ok := ogTags[key]; !ok || value != expectedValue {
+					t.Errorf("expected %s: %s, got: %s", key, expectedValue, value)
+				}
+			}
+
+			// ...and nothing else is.
+			if len(ogTags) != len(tc.expectedTags) {
+				t.Errorf("expected %d tags, got %d", len(tc.expectedTags), len(ogTags))
+			}
+
+			// A second lookup must give the same answer.
+			cachedOGTags, err := cache.GetOGTags(testURL)
+			if err != nil {
+				t.Fatalf("failed to get OG tags from cache: %v", err)
+			}
+
+			for key, expectedValue := range tc.expectedTags {
+				if value, ok := cachedOGTags[key]; !ok || value != expectedValue {
+					t.Errorf("cached value - expected %s: %s, got: %s", key, expectedValue, value)
+				}
+			}
+			if len(cachedOGTags) != len(tc.expectedTags) {
+				t.Errorf("cached value - expected %d tags, got %d", len(tc.expectedTags), len(cachedOGTags))
+			}
+		})
+	}
+}
diff --git a/internal/ogtags/ogtags.go b/internal/ogtags/ogtags.go
new file mode 100644
index 0000000..72185bb
--- /dev/null
+++ b/internal/ogtags/ogtags.go
@@ -0,0 +1,51 @@
+package ogtags
+
+import (
+ "net/http"
+ "net/url"
+ "time"
+
+ "github.com/TecharoHQ/anubis/decaymap"
+)
+
+// OGTagCache fetches pages from a target origin, extracts the approved meta
+// tags from their HTML, and caches the result keyed by the target URL string.
+type OGTagCache struct {
+ // cache maps a target URL string to the tags extracted for it.
+ cache *decaymap.Impl[string, map[string]string]
+ // target is the prefix that getTarget prepends to the request path.
+ target string
+ // ogPassthrough is stored by NewOGTagCache.
+ // NOTE(review): nothing in the code visible here reads it — confirm where it is used.
+ ogPassthrough bool
+ // ogTimeToLive is the TTL handed to cache.Set when a result is stored.
+ ogTimeToLive time.Duration
+ // approvedTags lists meta names/properties accepted on an exact match.
+ approvedTags []string
+ // approvedPrefixes lists prefixes that make a meta name/property accepted.
+ approvedPrefixes []string
+ // client performs the upstream GET requests.
+ client *http.Client
+ // maxContentLength is the byte limit passed to http.MaxBytesReader for response bodies.
+ maxContentLength int64
+}
+
+// NewOGTagCache builds an OGTagCache for the given target with an empty cache,
+// the built-in approved tag names and prefixes, an HTTP client with a 5 second
+// timeout, and a 16 MiB response body limit.
+func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration) *OGTagCache {
+	const (
+		fetchTimeout     = 5 * time.Second // make this configurable?
+		maxContentLength = 16 << 20        // 16 MiB in bytes
+	)
+
+	return &OGTagCache{
+		cache:         decaymap.New[string, map[string]string](),
+		target:        target,
+		ogPassthrough: ogPassthrough,
+		ogTimeToLive:  ogTimeToLive,
+		// Predefined approved tags and prefixes; in the future these could
+		// come from configuration.
+		approvedTags:     []string{"description", "keywords", "author"},
+		approvedPrefixes: []string{"og:", "twitter:", "fediverse:"},
+		client:           &http.Client{Timeout: fetchTimeout},
+		maxContentLength: maxContentLength,
+	}
+}
+
+// getTarget returns the upstream URL string for u: the configured target
+// followed by u.Path. No other component of u (such as the query) is used.
+func (c *OGTagCache) getTarget(u *url.URL) string {
+	base, path := c.target, u.Path
+	return base + path
+}
+
+// Cleanup delegates to the Cleanup method of the underlying decaymap.
+// NOTE(review): presumably this evicts expired entries — confirm against decaymap.
+func (c *OGTagCache) Cleanup() {
+ c.cache.Cleanup()
+}
diff --git a/internal/ogtags/ogtags_test.go b/internal/ogtags/ogtags_test.go
new file mode 100644
index 0000000..8cd5b0d
--- /dev/null
+++ b/internal/ogtags/ogtags_test.go
@@ -0,0 +1,100 @@
+package ogtags
+
+import (
+ "net/url"
+ "testing"
+ "time"
+)
+
+func TestNewOGTagCache(t *testing.T) {
+ tests := []struct {
+ name string
+ target string
+ ogPassthrough bool
+ ogTimeToLive time.Duration
+ }{
+ {
+ name: "Basic initialization",
+ target: "http://example.com",
+ ogPassthrough: true,
+ ogTimeToLive: 5 * time.Minute,
+ },
+ {
+ name: "Empty target",
+ target: "",
+ ogPassthrough: false,
+ ogTimeToLive: 10 * time.Minute,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ cache := NewOGTagCache(tt.target, tt.ogPassthrough, tt.ogTimeToLive)
+
+ if cache == nil {
+ t.Fatal("expected non-nil cache, got nil")
+ }
+
+ if cache.target != tt.target {
+ t.Errorf("expected target %s, got %s", tt.target, cache.target)
+ }
+
+ if cache.ogPassthrough != tt.ogPassthrough {
+ t.Errorf("expected ogPassthrough %v, got %v", tt.ogPassthrough, cache.ogPassthrough)
+ }
+
+ if cache.ogTimeToLive != tt.ogTimeToLive {
+ t.Errorf("expected ogTimeToLive %v, got %v", tt.ogTimeToLive, cache.ogTimeToLive)
+ }
+ })
+ }
+}
+
+// TestGetTarget checks that getTarget joins the configured target with the URL
+// path and leaves the query string out of the result.
+func TestGetTarget(t *testing.T) {
+	cases := []struct {
+		name     string
+		target   string
+		path     string
+		query    string
+		expected string
+	}{
+		{
+			name:     "No path or query",
+			target:   "http://example.com",
+			path:     "",
+			query:    "",
+			expected: "http://example.com",
+		},
+		{
+			name:     "With complex path",
+			target:   "http://example.com",
+			path:     "/pag(#*((#@)ΓΓΓΓe/Γ",
+			query:    "id=123",
+			expected: "http://example.com/pag(#*((#@)ΓΓΓΓe/Γ",
+		},
+		{
+			name:     "With query and path",
+			target:   "http://example.com",
+			path:     "/page",
+			query:    "id=123",
+			expected: "http://example.com/page",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			input := &url.URL{Path: tc.path, RawQuery: tc.query}
+
+			got := NewOGTagCache(tc.target, false, time.Minute).getTarget(input)
+			if got != tc.expected {
+				t.Errorf("expected %s, got %s", tc.expected, got)
+			}
+		})
+	}
+}
diff --git a/internal/ogtags/parse.go b/internal/ogtags/parse.go
new file mode 100644
index 0000000..8828e59
--- /dev/null
+++ b/internal/ogtags/parse.go
@@ -0,0 +1,81 @@
+package ogtags
+
+import (
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
+// extractOGTags walks the document depth-first and collects every approved
+// meta tag into a property -> content map. When a property occurs more than
+// once, the occurrence visited last wins.
+func (c *OGTagCache) extractOGTags(doc *html.Node) map[string]string {
+	found := map[string]string{}
+
+	var walk func(node *html.Node)
+	walk = func(node *html.Node) {
+		// isOGMetaTag only says "this is a <meta> element"; the approval
+		// filtering is done by extractMetaTagInfo.
+		if isOGMetaTag(node) {
+			if name, value := c.extractMetaTagInfo(node); name != "" {
+				found[name] = value
+			}
+		}
+
+		child := node.FirstChild
+		for child != nil {
+			walk(child)
+			child = child.NextSibling
+		}
+	}
+	walk(doc)
+
+	return found
+}
+
+// isOGMetaTag reports whether n is a non-nil <meta> element. Despite the name
+// it accepts any meta tag, not only Open Graph ones.
+func isOGMetaTag(n *html.Node) bool {
+	return n != nil && n.Type == html.ElementNode && n.Data == "meta"
+}
+
+// extractMetaTagInfo reads the "property"/"name" and "content" attributes of a
+// meta tag. The property is returned only when it starts with one of
+// c.approvedPrefixes or equals one of c.approvedTags; otherwise property is "".
+// content is returned either way. If both "property" and "name" are present,
+// whichever attribute comes last supplies the property.
+func (c *OGTagCache) extractMetaTagInfo(n *html.Node) (property, content string) {
+	var candidate string // property as written in the tag, before approval
+
+	for _, attr := range n.Attr {
+		switch attr.Key {
+		case "property", "name":
+			candidate = attr.Val
+		case "content":
+			content = attr.Val
+		}
+	}
+
+	// approved checks the prefix list first, then the exact-match list.
+	approved := func() bool {
+		for _, prefix := range c.approvedPrefixes {
+			if strings.HasPrefix(candidate, prefix) {
+				return true
+			}
+		}
+		for _, tag := range c.approvedTags {
+			if candidate == tag {
+				return true
+			}
+		}
+		return false
+	}
+
+	if approved() {
+		property = candidate
+	}
+
+	return property, content
+}
diff --git a/internal/ogtags/parse_test.go b/internal/ogtags/parse_test.go
new file mode 100644
index 0000000..54815b3
--- /dev/null
+++ b/internal/ogtags/parse_test.go
@@ -0,0 +1,295 @@
+package ogtags
+
+import (
+ "reflect"
+ "strings"
+ "testing"
+ "time"
+
+ "golang.org/x/net/html"
+)
+
+// TestExtractOGTags checks that extractOGTags keeps only the meta tags whose
+// name or property matches the approved tags or prefixes set on the cache.
+func TestExtractOGTags(t *testing.T) {
+ // Start from a default cache, then narrow the approved lists below so the
+ // effect of the filtering is visible in the expectations.
+ testCache := NewOGTagCache("", false, time.Minute)
+ // In this test only "description" (exact match) and the "og:" prefix are approved.
+ testCache.approvedTags = []string{"description"}
+ testCache.approvedPrefixes = []string{"og:"}
+
+ tests := []struct {
+ name string
+ htmlStr string
+ expected map[string]string
+ }{
+ {
+ name: "Basic OG tags", // Includes standard 'description' meta tag
+ htmlStr: `<!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="og:title" content="Test Title" />
+ <meta property="og:description" content="Test Description" />
+ <meta name="description" content="Regular Description" />
+ <meta name="keywords" content="test, keyword" />
+ </head>
+ <body></body>
+ </html>`,
+ expected: map[string]string{
+ "og:title": "Test Title",
+ "og:description": "Test Description",
+ "description": "Regular Description",
+ },
+ },
+ {
+ name: "OG tags with name attribute",
+ htmlStr: `<!DOCTYPE html>
+ <html>
+ <head>
+ <meta name="og:title" content="Test Title" />
+ <meta property="og:description" content="Test Description" />
+ <meta name="twitter:card" content="summary" />
+ </head>
+ <body></body>
+ </html>`,
+ expected: map[string]string{
+ "og:title": "Test Title",
+ "og:description": "Test Description",
+ // twitter:card is filtered out: "twitter:" is not in this test's approvedPrefixes
+ },
+ },
+ {
+ name: "No approved OG tags", // Contains only standard 'description'
+ htmlStr: `<!DOCTYPE html>
+ <html>
+ <head>
+ <meta name="description" content="Test Description" />
+ <meta name="keywords" content="Test" />
+ </head>
+ <body></body>
+ </html>`,
+ expected: map[string]string{
+ "description": "Test Description",
+ },
+ },
+ {
+ name: "Empty content",
+ htmlStr: `<!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="og:title" content="" />
+ <meta property="og:description" content="Test Description" />
+ </head>
+ <body></body>
+ </html>`,
+ expected: map[string]string{
+ "og:title": "",
+ "og:description": "Test Description",
+ },
+ },
+ {
+ name: "Explicitly approved tag",
+ htmlStr: `<!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="description" content="Approved Description Tag" />
+ </head>
+ <body></body>
+ </html>`,
+ expected: map[string]string{
+ // Approved because "description" is in testCache.approvedTags
+ "description": "Approved Description Tag",
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ doc, err := html.Parse(strings.NewReader(tt.htmlStr))
+ if err != nil {
+ t.Fatalf("failed to parse HTML: %v", err)
+ }
+
+ ogTags := testCache.extractOGTags(doc)
+
+ // DeepEqual also catches unexpected extra tags, not just missing ones.
+ if !reflect.DeepEqual(ogTags, tt.expected) {
+ t.Errorf("expected %v, got %v", tt.expected, ogTags)
+ }
+ })
+ }
+}
+
+// TestIsOGMetaTag checks that isOGMetaTag accepts any <meta> element and
+// rejects other elements.
+func TestIsOGMetaTag(t *testing.T) {
+	// findElement returns the first element named tag in a depth-first,
+	// document-order walk starting at n, or nil when there is none.
+	var findElement func(n *html.Node, tag string) *html.Node
+	findElement = func(n *html.Node, tag string) *html.Node {
+		if n.Type == html.ElementNode && n.Data == tag {
+			return n
+		}
+		for child := n.FirstChild; child != nil; child = child.NextSibling {
+			if hit := findElement(child, tag); hit != nil {
+				return hit
+			}
+		}
+		return nil
+	}
+
+	cases := []struct {
+		name       string
+		nodeHTML   string
+		targetNode string // element name to locate in the parsed document
+		expected   bool
+	}{
+		{name: "Meta OG tag", nodeHTML: `<meta property="og:title" content="Test">`, targetNode: "meta", expected: true},
+		{name: "Regular meta tag", nodeHTML: `<meta name="description" content="Test">`, targetNode: "meta", expected: true},
+		{name: "Not a meta tag", nodeHTML: `<div>Test</div>`, targetNode: "div", expected: false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Wrap the fragment in a minimal document so it can be parsed.
+			fullHTML := "<html><head>" + tc.nodeHTML + "</head><body></body></html>"
+			doc, err := html.Parse(strings.NewReader(fullHTML))
+			if err != nil {
+				t.Fatalf("failed to parse HTML: %v", err)
+			}
+
+			node := findElement(doc, tc.targetNode)
+			if node == nil {
+				t.Fatalf("Could not find target node '%s' in test HTML", tc.targetNode)
+			}
+
+			if got := isOGMetaTag(node); got != tc.expected {
+				t.Errorf("expected %v, got %v", tc.expected, got)
+			}
+		})
+	}
+}
+
+func TestExtractMetaTagInfo(t *testing.T) {
+ // Use a cache instance that reflects the default approved lists
+ testCache := NewOGTagCache("", false, time.Minute)
+ testCache.approvedTags = []string{"description"}
+ testCache.approvedPrefixes = []string{"og:"}
+
+ tests := []struct {
+ name string
+ nodeHTML string
+ expectedProperty string
+ expectedContent string
+ }{
+ {
+ name: "OG title with property (approved by prefix)",
+ nodeHTML: `<meta property="og:title" content="Test Title">`,
+ expectedProperty: "og:title",
+ expectedContent: "Test Title",
+ },
+ {
+ name: "OG description with name (approved by prefix)",
+ nodeHTML: `<meta name="og:description" content="Test Description">`,
+ expectedProperty: "og:description",
+ expectedContent: "Test Description",
+ },
+ {
+ name: "Regular meta tag (name=description, approved by exact match)", // Updated name for clarity
+ nodeHTML: `<meta name="description" content="Test Description">`,
+ expectedProperty: "description",
+ expectedContent: "Test Description",
+ },
+ {
+ name: "Regular meta tag (name=keywords, not approved)",
+ nodeHTML: `<meta name="keywords" content="Test Keywords">`,
+ expectedProperty: "",
+ expectedContent: "Test Keywords",
+ },
+ {
+ name: "Twitter tag (not approved by default)",
+ nodeHTML: `<meta name="twitter:card" content="summary">`,
+ expectedProperty: "",
+ expectedContent: "summary",
+ },
+ {
+ name: "No content (but approved property)",
+ nodeHTML: `<meta property="og:title"&g