diff options
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/headers.go | 4 | ||||
| -rw-r--r-- | internal/ogtags/cache.go | 51 | ||||
| -rw-r--r-- | internal/ogtags/cache_test.go | 122 | ||||
| -rw-r--r-- | internal/ogtags/fetch.go | 69 | ||||
| -rw-r--r-- | internal/ogtags/fetch_test.go | 119 | ||||
| -rw-r--r-- | internal/ogtags/integration_test.go | 155 | ||||
| -rw-r--r-- | internal/ogtags/ogtags.go | 51 | ||||
| -rw-r--r-- | internal/ogtags/ogtags_test.go | 100 | ||||
| -rw-r--r-- | internal/ogtags/parse.go | 81 | ||||
| -rw-r--r-- | internal/ogtags/parse_test.go | 295 | ||||
| -rw-r--r-- | internal/test/playwright_test.go | 24 |
11 files changed, 1062 insertions, 9 deletions
diff --git a/internal/headers.go b/internal/headers.go index 5c6a218..bdb5e9e 100644 --- a/internal/headers.go +++ b/internal/headers.go @@ -13,6 +13,7 @@ import ( // UnchangingCache sets the Cache-Control header to cache a response for 1 year if // and only if the application is compiled in "release" mode by Docker. func UnchangingCache(next http.Handler) http.Handler { + //goland:noinspection GoBoolExpressions if anubis.Version == "devel" { return next } @@ -68,11 +69,10 @@ func XForwardedForToXRealIP(next http.Handler) http.Handler { func NoStoreCache(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Cache-Control", "no-store") - next.ServeHTTP(w, r) + next.ServeHTTP(w, r) }) } - // Do not allow browsing directory listings in paths that end with / func NoBrowsing(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/internal/ogtags/cache.go b/internal/ogtags/cache.go new file mode 100644 index 0000000..0d1a615 --- /dev/null +++ b/internal/ogtags/cache.go @@ -0,0 +1,51 @@ +package ogtags + +import ( + "errors" + "log/slog" + "net/url" + "syscall" +) + +// GetOGTags is the main function that retrieves Open Graph tags for a URL +func (c *OGTagCache) GetOGTags(url *url.URL) (map[string]string, error) { + if url == nil { + return nil, errors.New("nil URL provided, cannot fetch OG tags") + } + urlStr := c.getTarget(url) + // Check cache first + if cachedTags := c.checkCache(urlStr); cachedTags != nil { + return cachedTags, nil + } + + // Fetch HTML content + doc, err := c.fetchHTMLDocument(urlStr) + if errors.Is(err, syscall.ECONNREFUSED) { + slog.Debug("Connection refused, returning empty tags") + return nil, nil + } else if errors.Is(err, ErrNotFound) { + // not even worth a debug log... + return nil, nil + } + if err != nil { + return nil, err + } + + // Extract OG tags + ogTags := c.extractOGTags(doc) + + // Store in cache + c.cache.Set(urlStr, ogTags, c.ogTimeToLive) + + return ogTags, nil +} + +// checkCache checks if we have the tags cached and returns them if so +func (c *OGTagCache) checkCache(urlStr string) map[string]string { + if cachedTags, ok := c.cache.Get(urlStr); ok { + slog.Debug("cache hit", "tags", cachedTags) + return cachedTags + } + slog.Debug("cache miss", "url", urlStr) + return nil +} diff --git a/internal/ogtags/cache_test.go b/internal/ogtags/cache_test.go new file mode 100644 index 0000000..cd32414 --- /dev/null +++ b/internal/ogtags/cache_test.go @@ -0,0 +1,122 @@ +package ogtags + +import ( + "net/http" + "net/http/httptest" + "net/url" + "testing" + "time" +) + +func TestCheckCache(t *testing.T) { + cache := NewOGTagCache("http://example.com", true, time.Minute) + + // Set up test data + urlStr := "http://example.com/page" + expectedTags := map[string]string{ + "og:title": "Test Title", + "og:description": "Test Description", + } + + // Test cache miss + tags := cache.checkCache(urlStr) + if tags != nil { + t.Errorf("expected nil tags on cache miss, got %v", tags) + } + + // Manually add to cache + cache.cache.Set(urlStr, expectedTags, time.Minute) + + // Test cache hit + tags = cache.checkCache(urlStr) + if tags == nil { + t.Fatal("expected non-nil tags on cache hit, got nil") + } + + for key, expectedValue := range expectedTags { + if value, ok := tags[key]; !ok || value != expectedValue { + t.Errorf("expected %s: %s, got: %s", key, expectedValue, value) + } + } +} + +func TestGetOGTags(t *testing.T) { + var loadCount int // Counter to track how many times the test route is loaded + + // Create a test server to serve a sample HTML page with OG tags + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + loadCount++ + if loadCount > 1 { + t.Fatalf("Test route loaded more than once, cache failed") + } + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + <!DOCTYPE html> + <html> + <head> + <meta property="og:title" content="Test Title" /> + <meta property="og:description" content="Test Description" /> + <meta property="og:image" content="http://example.com/image.jpg" /> + </head> + <body> + <p>Hello, world!</p> + </body> + </html> + `)) + })) + defer ts.Close() + + // Create an instance of OGTagCache with a short TTL for testing + cache := NewOGTagCache(ts.URL, true, 1*time.Minute) + + // Parse the test server URL + parsedURL, err := url.Parse(ts.URL) + if err != nil { + t.Fatalf("failed to parse test server URL: %v", err) + } + + // Test fetching OG tags from the test server + ogTags, err := cache.GetOGTags(parsedURL) + if err != nil { + t.Fatalf("failed to get OG tags: %v", err) + } + + // Verify the fetched OG tags + expectedTags := map[string]string{ + "og:title": "Test Title", + "og:description": "Test Description", + "og:image": "http://example.com/image.jpg", + } + + for key, expectedValue := range expectedTags { + if value, ok := ogTags[key]; !ok || value != expectedValue { + t.Errorf("expected %s: %s, got: %s", key, expectedValue, value) + } + } + + // Test fetching OG tags from the cache + ogTags, err = cache.GetOGTags(parsedURL) + if err != nil { + t.Fatalf("failed to get OG tags from cache: %v", err) + } + + // Test fetching OG tags from the cache (3rd time) + newOgTags, err := cache.GetOGTags(parsedURL) + if err != nil { + t.Fatalf("failed to get OG tags from cache: %v", err) + } + + // Verify the cached OG tags + for key, expectedValue := range expectedTags { + if value, ok := ogTags[key]; !ok || value != expectedValue { + t.Errorf("expected %s: %s, got: %s", key, expectedValue, value) + } + + initialValue := ogTags[key] + cachedValue, ok := newOgTags[key] + if !ok || initialValue != cachedValue { + t.Errorf("Cache does not line up: expected %s: %s, got: %s", key, initialValue, cachedValue) + } + + } +} diff --git a/internal/ogtags/fetch.go b/internal/ogtags/fetch.go new file mode 100644 index 0000000..3ea9aac --- /dev/null +++ b/internal/ogtags/fetch.go @@ -0,0 +1,69 @@ +package ogtags + +import ( + "errors" + "fmt" + "golang.org/x/net/html" + "log/slog" + "mime" + "net" + "net/http" +) + +var ( + ErrNotFound = errors.New("page not found") /*todo: refactor into common errors lib? */ + emptyMap = map[string]string{} // used to indicate an empty result in the cache. Can't use nil as it would be a cache miss. +) + +func (c *OGTagCache) fetchHTMLDocument(urlStr string) (*html.Node, error) { + resp, err := c.client.Get(urlStr) + if err != nil { + var netErr net.Error + if errors.As(err, &netErr) && netErr.Timeout() { + slog.Debug("og: request timed out", "url", urlStr) + c.cache.Set(urlStr, emptyMap, c.ogTimeToLive/2) // Cache empty result for half the TTL to not spam the server + } + return nil, fmt.Errorf("http get failed: %w", err) + } + // this defer will call MaxBytesReader's Close, which closes the original body. + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + slog.Debug("og: received non-OK status code", "url", urlStr, "status", resp.StatusCode) + c.cache.Set(urlStr, emptyMap, c.ogTimeToLive) // Cache empty result for non-successful status codes + return nil, ErrNotFound + } + + // Check content type + ct := resp.Header.Get("Content-Type") + if ct == "" { + // assume non html body + return nil, fmt.Errorf("missing Content-Type header") + } else { + mediaType, _, err := mime.ParseMediaType(ct) + if err != nil { + // Malformed Content-Type header + return nil, fmt.Errorf("invalid Content-Type '%s': %w", ct, err) + } + + if mediaType != "text/html" && mediaType != "application/xhtml+xml" { + return nil, fmt.Errorf("unsupported Content-Type: %s", mediaType) + } + } + + resp.Body = http.MaxBytesReader(nil, resp.Body, c.maxContentLength) + + doc, err := html.Parse(resp.Body) + if err != nil { + // Check if the error is specifically because the limit was exceeded + var maxBytesErr *http.MaxBytesError + if errors.As(err, &maxBytesErr) { + slog.Debug("og: content exceeded max length", "url", urlStr, "limit", c.maxContentLength) + return nil, fmt.Errorf("content too large: exceeded %d bytes", c.maxContentLength) + } + // parsing error (e.g., malformed HTML) + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + return doc, nil +} diff --git a/internal/ogtags/fetch_test.go b/internal/ogtags/fetch_test.go new file mode 100644 index 0000000..60af957 --- /dev/null +++ b/internal/ogtags/fetch_test.go @@ -0,0 +1,119 @@ +package ogtags + +import ( + "fmt" + "io" + "net/http" + "net/http/httptest" + "os" + "strings" + "testing" + "time" +) + +func TestFetchHTMLDocument(t *testing.T) { + tests := []struct { + name string + htmlContent string + contentType string + statusCode int + contentLength int64 + expectError bool + }{ + { + name: "Valid HTML", + htmlContent: `<!DOCTYPE html> + <html> + <head><title>Test</title></head> + <body><p>Test content</p></body> + </html>`, + contentType: "text/html", + statusCode: http.StatusOK, + expectError: false, + }, + { + name: "Empty HTML", + htmlContent: "", + contentType: "text/html", + statusCode: http.StatusOK, + expectError: false, + }, + { + name: "Not found error", + htmlContent: "", + contentType: "text/html", + statusCode: http.StatusNotFound, + expectError: true, + }, + { + name: "Unsupported Content-Type", + htmlContent: "*Insert rick roll here*", + contentType: "video/mp4", + statusCode: http.StatusOK, + expectError: true, + }, + { + name: "Too large content", + contentType: "text/html", + statusCode: http.StatusOK, + expectError: true, + contentLength: 5 * 1024 * 1024, // 5MB (over 2MB limit) + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if tt.contentType != "" { + w.Header().Set("Content-Type", tt.contentType) + } + if tt.contentLength > 0 { + // Simulate content length but avoid sending too much actual data + w.Header().Set("Content-Length", fmt.Sprintf("%d", tt.contentLength)) + io.CopyN(w, strings.NewReader("X"), tt.contentLength) + } else { + w.WriteHeader(tt.statusCode) + w.Write([]byte(tt.htmlContent)) + } + })) + defer ts.Close() + + cache := NewOGTagCache("", true, time.Minute) + doc, err := cache.fetchHTMLDocument(ts.URL) + + if tt.expectError { + if err == nil { + t.Error("expected error, got nil") + } + if doc != nil { + t.Error("expected nil document on error, got non-nil") + } + } else { + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if doc == nil { + t.Error("expected non-nil document, got nil") + } + } + }) + } +} + +func TestFetchHTMLDocumentInvalidURL(t *testing.T) { + if os.Getenv("DONT_USE_NETWORK") != "" { + t.Skip("test requires theoretical network egress") + } + + cache := NewOGTagCache("", true, time.Minute) + + doc, err := cache.fetchHTMLDocument("http://invalid.url.that.doesnt.exist.example") + + if err == nil { + t.Error("expected error for invalid URL, got nil") + } + + if doc != nil { + t.Error("expected nil document for invalid URL, got non-nil") + } +} diff --git a/internal/ogtags/integration_test.go b/internal/ogtags/integration_test.go new file mode 100644 index 0000000..9eaaa3a --- /dev/null +++ b/internal/ogtags/integration_test.go @@ -0,0 +1,155 @@ +package ogtags + +import ( + "net/http" + "net/http/httptest" + "net/url" + "testing" + "time" +) + +func TestIntegrationGetOGTags(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + + switch r.URL.Path { + case "/simple": + w.Write([]byte(` + <!DOCTYPE html> + <html> + <head> + <meta property="og:title" content="Simple Page" /> + <meta property="og:type" content="website" /> + </head> + <body><p>Simple page content</p></body> + </html> + `)) + case "/complete": + w.Write([]byte(` + <!DOCTYPE html> + <html> + <head> + <meta property="og:title" content="Complete Page" /> + <meta property="og:description" content="A page with many OG tags" /> + <meta property="og:image" content="http://example.com/image.jpg" /> + <meta property="og:url" content="http://example.com/complete" /> + <meta property="og:type" content="article" /> + </head> + <body><p>Complete page content</p></body> + </html> + `)) + case "/no-og": + w.Write([]byte(` + <!DOCTYPE html> + <html> + <head> + <title>No OG Tags</title> + </head> + <body><p>No OG tags here</p></body> + </html> + `)) + default: + w.WriteHeader(http.StatusNotFound) + } + })) + defer ts.Close() + + // Test with different configurations + testCases := []struct { + name string + path string + query string + expectedTags map[string]string + expectError bool + }{ + { + name: "Simple page", + path: "/simple", + query: "", + expectedTags: map[string]string{ + "og:title": "Simple Page", + "og:type": "website", + }, + expectError: false, + }, + { + name: "Complete page", + path: "/complete", + query: "ref=test", + expectedTags: map[string]string{ + "og:title": "Complete Page", + "og:description": "A page with many OG tags", + "og:image": "http://example.com/image.jpg", + "og:url": "http://example.com/complete", + "og:type": "article", + }, + expectError: false, + }, + { + name: "Page with no OG tags", + path: "/no-og", + query: "", + expectedTags: map[string]string{}, + expectError: false, + }, + { + name: "Non-existent page", + path: "/not-found", + query: "", + expectedTags: nil, + expectError: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Create cache instance + cache := NewOGTagCache(ts.URL, true, 1*time.Minute) + + // Create URL for test + testURL, _ := url.Parse(ts.URL) + testURL.Path = tc.path + testURL.RawQuery = tc.query + + // Get OG tags + ogTags, err := cache.GetOGTags(testURL) + + // Check error expectation + if tc.expectError { + if err == nil { + t.Error("expected error, got nil") + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify all expected tags are present + for key, expectedValue := range tc.expectedTags { + if value, ok := ogTags[key]; !ok || value != expectedValue { + t.Errorf("expected %s: %s, got: %s", key, expectedValue, value) + } + } + + // Verify no extra tags are present + if len(ogTags) != len(tc.expectedTags) { + t.Errorf("expected %d tags, got %d", len(tc.expectedTags), len(ogTags)) + } + + // Test cache retrieval + cachedOGTags, err := cache.GetOGTags(testURL) + if err != nil { + t.Fatalf("failed to get OG tags from cache: %v", err) + } + + // Verify cached tags match + for key, expectedValue := range tc.expectedTags { + if value, ok := cachedOGTags[key]; !ok || value != expectedValue { + t.Errorf("cached value - expected %s: %s, got: %s", key, expectedValue, value) + } + } + }) + } +} diff --git a/internal/ogtags/ogtags.go b/internal/ogtags/ogtags.go new file mode 100644 index 0000000..72185bb --- /dev/null +++ b/internal/ogtags/ogtags.go @@ -0,0 +1,51 @@ +package ogtags + +import ( + "net/http" + "net/url" + "time" + + "github.com/TecharoHQ/anubis/decaymap" +) + +type OGTagCache struct { + cache *decaymap.Impl[string, map[string]string] + target string + ogPassthrough bool + ogTimeToLive time.Duration + approvedTags []string + approvedPrefixes []string + client *http.Client + maxContentLength int64 +} + +func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration) *OGTagCache { + // Predefined approved tags and prefixes + // In the future, these could come from configuration + defaultApprovedTags := []string{"description", "keywords", "author"} + defaultApprovedPrefixes := []string{"og:", "twitter:", "fediverse:"} + client := &http.Client{ + Timeout: 5 * time.Second, /*make this configurable?*/ + } + + const maxContentLength = 16 << 20 // 16 MiB in bytes + + return &OGTagCache{ + cache: decaymap.New[string, map[string]string](), + target: target, + ogPassthrough: ogPassthrough, + ogTimeToLive: ogTimeToLive, + approvedTags: defaultApprovedTags, + approvedPrefixes: defaultApprovedPrefixes, + client: client, + maxContentLength: maxContentLength, + } +} + +func (c *OGTagCache) getTarget(u *url.URL) string { + return c.target + u.Path +} + +func (c *OGTagCache) Cleanup() { + c.cache.Cleanup() +} diff --git a/internal/ogtags/ogtags_test.go b/internal/ogtags/ogtags_test.go new file mode 100644 index 0000000..8cd5b0d --- /dev/null +++ b/internal/ogtags/ogtags_test.go @@ -0,0 +1,100 @@ +package ogtags + +import ( + "net/url" + "testing" + "time" +) + +func TestNewOGTagCache(t *testing.T) { + tests := []struct { + name string + target string + ogPassthrough bool + ogTimeToLive time.Duration + }{ + { + name: "Basic initialization", + target: "http://example.com", + ogPassthrough: true, + ogTimeToLive: 5 * time.Minute, + }, + { + name: "Empty target", + target: "", + ogPassthrough: false, + ogTimeToLive: 10 * time.Minute, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cache := NewOGTagCache(tt.target, tt.ogPassthrough, tt.ogTimeToLive) + + if cache == nil { + t.Fatal("expected non-nil cache, got nil") + } + + if cache.target != tt.target { + t.Errorf("expected target %s, got %s", tt.target, cache.target) + } + + if cache.ogPassthrough != tt.ogPassthrough { + t.Errorf("expected ogPassthrough %v, got %v", tt.ogPassthrough, cache.ogPassthrough) + } + + if cache.ogTimeToLive != tt.ogTimeToLive { + t.Errorf("expected ogTimeToLive %v, got %v", tt.ogTimeToLive, cache.ogTimeToLive) + } + }) + } +} + +func TestGetTarget(t *testing.T) { + tests := []struct { + name string + target string + path string + query string + expected string + }{ + { + name: "No path or query", + target: "http://example.com", + path: "", + query: "", + expected: "http://example.com", + }, + { + name: "With complex path", + target: "http://example.com", + path: "/pag(#*((#@)ΓΓΓΓe/Γ", + query: "id=123", + expected: "http://example.com/pag(#*((#@)ΓΓΓΓe/Γ", + }, + { + name: "With query and path", + target: "http://example.com", + path: "/page", + query: "id=123", + expected: "http://example.com/page", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cache := NewOGTagCache(tt.target, false, time.Minute) + + u := &url.URL{ + Path: tt.path, + RawQuery: tt.query, + } + + result := cache.getTarget(u) + + if result != tt.expected { + t.Errorf("expected %s, got %s", tt.expected, result) + } + }) + } +} diff --git a/internal/ogtags/parse.go b/internal/ogtags/parse.go new file mode 100644 index 0000000..8828e59 --- /dev/null +++ b/internal/ogtags/parse.go @@ -0,0 +1,81 @@ +package ogtags + +import ( + "strings" + + "golang.org/x/net/html" +) + +// extractOGTags traverses the HTML document and extracts approved Open Graph tags +func (c *OGTagCache) extractOGTags(doc *html.Node) map[string]string { + ogTags := make(map[string]string) + + var traverseNodes func(*html.Node) + traverseNodes = func(n *html.Node) { + // isOGMetaTag only checks if it's a <meta> tag. + // The actual filtering happens in extractMetaTagInfo now. + if isOGMetaTag(n) { + property, content := c.extractMetaTagInfo(n) + if property != "" { + ogTags[property] = content + } + } + + for child := n.FirstChild; child != nil; child = child.NextSibling { + traverseNodes(child) + } + } + + traverseNodes(doc) + return ogTags +} + +// isOGMetaTag checks if a node is *any* meta tag +func isOGMetaTag(n *html.Node) bool { + if n == nil { + return false + } + return n.Type == html.ElementNode && n.Data == "meta" +} + +// extractMetaTagInfo extracts property and content from a meta tag +// *and* checks if the property is approved. +// Returns empty property string if the tag is not approved. +func (c *OGTagCache) extractMetaTagInfo(n *html.Node) (property, content string) { + var rawProperty string // Store the property found before approval check + + for _, attr := range n.Attr { + if attr.Key == "property" || attr.Key == "name" { + rawProperty = attr.Val + } + if attr.Key == "content" { + content = attr.Val + } + } + + // Check if the rawProperty is approved + isApproved := false + for _, prefix := range c.approvedPrefixes { + if strings.HasPrefix(rawProperty, prefix) { + isApproved = true + break + } + } + // Check exact approved tags if not already approved by prefix + if !isApproved { + for _, tag := range c.approvedTags { + if rawProperty == tag { + isApproved = true + break + } + } + } + + // Only return the property if it's approved + if isApproved { + property = rawProperty + } + + // Content is returned regardless, but property will be "" if not approved + return property, content +} diff --git a/internal/ogtags/parse_test.go b/internal/ogtags/parse_test.go new file mode 100644 index 0000000..54815b3 --- /dev/null +++ b/internal/ogtags/parse_test.go @@ -0,0 +1,295 @@ +package ogtags + +import ( + "reflect" + "strings" + "testing" + "time" + + "golang.org/x/net/html" +) + +// TestExtractOGTags updated with correct expectations based on filtering logic +func TestExtractOGTags(t *testing.T) { + // Use a cache instance that reflects the default approved lists + testCache := NewOGTagCache("", false, time.Minute) + // Manually set approved tags/prefixes based on the user request for clarity + testCache.approvedTags = []string{"description"} + testCache.approvedPrefixes = []string{"og:"} + + tests := []struct { + name string + htmlStr string + expected map[string]string + }{ + { + name: "Basic OG tags", // Includes standard 'description' meta tag + htmlStr: `<!DOCTYPE html> + <html> + <head> + <meta property="og:title" content="Test Title" /> + <meta property="og:description" content="Test Description" /> + <meta name="description" content="Regular Description" /> + <meta name="keywords" content="test, keyword" /> + </head> + <body></body> + </html>`, + expected: map[string]string{ + "og:title": "Test Title", + "og:description": "Test Description", + "description": "Regular Description", + }, + }, + { + name: "OG tags with name attribute", + htmlStr: `<!DOCTYPE html> + <html> + <head> + <meta name="og:title" content="Test Title" /> + <meta property="og:description" content="Test Description" /> + <meta name="twitter:card" content="summary" /> + </head> + <body></body> + </html>`, + expected: map[string]string{ + "og:title": "Test Title", + "og:description": "Test Description", + // twitter:card is still not approved + }, + }, + { + name: "No approved OG tags", // Contains only standard 'description' + htmlStr: `<!DOCTYPE html> + <html> + <head> + <meta name="description" content="Test Description" /> + <meta name="keywords" content="Test" /> + </head> + <body></body> + </html>`, + expected: map[string]string{ + "description": "Test Description", + }, + }, + { + name: "Empty content", + htmlStr: `<!DOCTYPE html> + <html> + <head> + <meta property="og:title" content="" /> + <meta property="og:description" content="Test Description" /> + </head> + <body></body> + </html>`, + expected: map[string]string{ + "og:title": "", + "og:description": "Test Description", + }, + }, + { + name: "Explicitly approved tag", + htmlStr: `<!DOCTYPE html> + <html> + <head> + <meta property="description" content="Approved Description Tag" /> + </head> + <body></body> + </html>`, + expected: map[string]string{ + // This is approved because "description" is in cache.approvedTags + "description": "Approved Description Tag", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + doc, err := html.Parse(strings.NewReader(tt.htmlStr)) + if err != nil { + t.Fatalf("failed to parse HTML: %v", err) + } + + ogTags := testCache.extractOGTags(doc) + + if !reflect.DeepEqual(ogTags, tt.expected) { + t.Errorf("expected %v, got %v", tt.expected, ogTags) + } + }) + } +} + +func TestIsOGMetaTag(t *testing.T) { + tests := []struct { + name string + nodeHTML string + targetNode string // Helper to find the right node in parsed fragment + expected bool + }{ + { + name: "Meta OG tag", + nodeHTML: `<meta property="og:title" content="Test">`, + targetNode: "meta", + expected: true, + }, + { + name: "Regular meta tag", + nodeHTML: `<meta name="description" content="Test">`, + targetNode: "meta", + expected: true, + }, + { + name: "Not a meta tag", + nodeHTML: `<div>Test</div>`, + targetNode: "div", + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Wrap the partial HTML in basic structure for parsing + fullHTML := "<html><head>" + tt.nodeHTML + "</head><body></body></html>" + doc, err := html.Parse(strings.NewReader(fullHTML)) + if err != nil { + t.Fatalf("failed to parse HTML: %v", err) + } + + // Find the target element node (meta or div based on targetNode) + var node *html.Node + var findNode func(*html.Node) + findNode = func(n *html.Node) { + // Skip finding if already found + if node != nil { + return + } + // Check if current node matches type and tag data + if n.Type == html.ElementNode && n.Data == tt.targetNode { + node = n + return + } + // Recursively check children + for c := n.FirstChild; c != nil; c = c.NextSibling { + findNode(c) + } + } + findNode(doc) // Start search from root + + if node == nil { + t.Fatalf("Could not find target node '%s' in test HTML", tt.targetNode) + } + + // Call the function under test + result := isOGMetaTag(node) + if result != tt.expected { + t.Errorf("expected %v, got %v", tt.expected, result) + } + }) + } +} + +func TestExtractMetaTagInfo(t *testing.T) { + // Use a cache instance that reflects the default approved lists + testCache := NewOGTagCache("", false, time.Minute) + testCache.approvedTags = []string{"description"} + testCache.approvedPrefixes = []string{"og:"} + + tests := []struct { + name string + nodeHTML string + expectedProperty string + expectedContent string + }{ + { + name: "OG title with property (approved by prefix)", + nodeHTML: `<meta property="og:title" content="Test Title">`, + expectedProperty: "og:title", + expectedContent: "Test Title", + }, + { + name: "OG description with name (approved by prefix)", + nodeHTML: `<meta name="og:description" content="Test Description">`, + expectedProperty: "og:description", + expectedContent: "Test Description", + }, + { + name: "Regular meta tag (name=description, approved by exact match)", // Updated name for clarity + nodeHTML: `<meta name="description" content="Test Description">`, + expectedProperty: "description", + expectedContent: "Test Description", + }, + { + name: "Regular meta tag (name=keywords, not approved)", + nodeHTML: `<meta name="keywords" content="Test Keywords">`, + expectedProperty: "", + expectedContent: "Test Keywords", + }, + { + name: "Twitter tag (not approved by default)", + nodeHTML: `<meta name="twitter:card" content="summary">`, + expectedProperty: "", + expectedContent: "summary", + }, + { + name: "No content (but approved property)", + nodeHTML: `<meta property="og:title"&g |
