aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.air.toml12
-rw-r--r--.gitignore9
-rw-r--r--cmd/anubis/main.go29
-rw-r--r--data/botPolicies.json6
-rw-r--r--docs/docs/CHANGELOG.md3
-rw-r--r--docs/docs/admin/configuration/open-graph.mdx47
-rw-r--r--docs/docs/admin/installation.mdx12
-rw-r--r--go.mod2
-rw-r--r--internal/headers.go4
-rw-r--r--internal/ogtags/cache.go51
-rw-r--r--internal/ogtags/cache_test.go122
-rw-r--r--internal/ogtags/fetch.go69
-rw-r--r--internal/ogtags/fetch_test.go119
-rw-r--r--internal/ogtags/integration_test.go155
-rw-r--r--internal/ogtags/ogtags.go51
-rw-r--r--internal/ogtags/ogtags_test.go100
-rw-r--r--internal/ogtags/parse.go81
-rw-r--r--internal/ogtags/parse_test.go295
-rw-r--r--internal/test/playwright_test.go24
-rw-r--r--lib/anubis.go23
-rw-r--r--lib/anubis_test.go12
-rw-r--r--web/index.go10
-rw-r--r--web/index.templ327
-rw-r--r--web/index_templ.go156
24 files changed, 1466 insertions, 253 deletions
diff --git a/.air.toml b/.air.toml
new file mode 100644
index 0000000..9fd7e9a
--- /dev/null
+++ b/.air.toml
@@ -0,0 +1,12 @@
+root = "."
+tmp_dir = "var"
+
+[build]
+cmd = "go build -o ./var/main ./cmd/anubis"
+bin = "./var/main"
+args = ["--use-remote-address"]
+exclude_dir = ["var", "vendor", "docs", "node_modules"]
+
+[logger]
+time = true
+# to change flags at runtime, prepend with -- e.g. $ air -- --target http://localhost:3000 --difficulty 20 --use-remote-address \ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 9c21c75..4f2dc2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,10 @@
*.deb
*.rpm
+# Additional package locks
+pnpm-lock.yaml
+yarn.lock
+
# Go binaries and test artifacts
main
*.test
@@ -9,4 +13,7 @@ main
node_modules
# MacOS
-.DS_store \ No newline at end of file
+.DS_store
+
+# Intellij
+.idea \ No newline at end of file
diff --git a/cmd/anubis/main.go b/cmd/anubis/main.go
index 59adc67..9fca3e2 100644
--- a/cmd/anubis/main.go
+++ b/cmd/anubis/main.go
@@ -7,6 +7,7 @@ import (
"crypto/rand"
"embed"
"encoding/hex"
+ "errors"
"flag"
"fmt"
"io/fs"
@@ -54,8 +55,9 @@ var (
healthcheck = flag.Bool("healthcheck", false, "run a health check against Anubis")
useRemoteAddress = flag.Bool("use-remote-address", false, "read the client's IP address from the network request, useful for debugging and running Anubis on bare metal")
debugBenchmarkJS = flag.Bool("debug-benchmark-js", false, "respond to every request with a challenge for benchmarking hashrate")
-
- extractResources = flag.String("extract-resources", "", "if set, extract the static resources to the specified folder")
+ ogPassthrough = flag.Bool("og-passthrough", false, "enable Open Graph tag passthrough")
+ ogTimeToLive = flag.Duration("og-expiry-time", 24*time.Hour, "Open Graph tag cache expiration time")
+ extractResources = flag.String("extract-resources", "", "if set, extract the static resources to the specified folder")
)
func keyFromHex(value string) (ed25519.PrivateKey, error) {
@@ -124,7 +126,7 @@ func setupListener(network string, address string) (net.Listener, string) {
}
func makeReverseProxy(target string) (http.Handler, error) {
- u, err := url.Parse(target)
+ targetUri, err := url.Parse(target)
if err != nil {
return nil, fmt.Errorf("failed to parse target URL: %w", err)
}
@@ -132,10 +134,10 @@ func makeReverseProxy(target string) (http.Handler, error) {
transport := http.DefaultTransport.(*http.Transport).Clone()
// https://github.com/oauth2-proxy/oauth2-proxy/blob/4e2100a2879ef06aea1411790327019c1a09217c/pkg/upstream/http.go#L124
- if u.Scheme == "unix" {
+ if targetUri.Scheme == "unix" {
// clean path up so we don't use the socket path in proxied requests
- addr := u.Path
- u.Path = ""
+ addr := targetUri.Path
+ targetUri.Path = ""
// tell transport how to dial unix sockets
transport.DialContext = func(ctx context.Context, _, _ string) (net.Conn, error) {
dialer := net.Dialer{}
@@ -145,7 +147,7 @@ func makeReverseProxy(target string) (http.Handler, error) {
transport.RegisterProtocol("unix", libanubis.UnixRoundTripper{Transport: transport})
}
- rp := httputil.NewSingleHostReverseProxy(u)
+ rp := httputil.NewSingleHostReverseProxy(targetUri)
rp.Transport = transport
return rp, nil
@@ -255,6 +257,9 @@ func main() {
PrivateKey: priv,
CookieDomain: *cookieDomain,
CookiePartitioned: *cookiePartitioned,
+ OGPassthrough: *ogPassthrough,
+ OGTimeToLive: *ogTimeToLive,
+ Target: *target,
})
if err != nil {
log.Fatalf("can't construct libanubis.Server: %v", err)
@@ -288,6 +293,8 @@ func main() {
"version", anubis.Version,
"use-remote-address", *useRemoteAddress,
"debug-benchmark-js", *debugBenchmarkJS,
+ "og-passthrough", *ogPassthrough,
+ "og-expiry-time", *ogTimeToLive,
)
go func() {
@@ -299,7 +306,7 @@ func main() {
}
}()
- if err := srv.Serve(listener); err != http.ErrServerClosed {
+ if err := srv.Serve(listener); !errors.Is(err, http.ErrServerClosed) {
log.Fatal(err)
}
wg.Wait()
@@ -312,8 +319,8 @@ func metricsServer(ctx context.Context, done func()) {
mux.Handle("/metrics", promhttp.Handler())
srv := http.Server{Handler: mux}
- listener, url := setupListener(*metricsBindNetwork, *metricsBind)
- slog.Debug("listening for metrics", "url", url)
+ listener, metricsUrl := setupListener(*metricsBindNetwork, *metricsBind)
+ slog.Debug("listening for metrics", "url", metricsUrl)
go func() {
<-ctx.Done()
@@ -324,7 +331,7 @@ func metricsServer(ctx context.Context, done func()) {
}
}()
- if err := srv.Serve(listener); err != http.ErrServerClosed {
+ if err := srv.Serve(listener); !errors.Is(err, http.ErrServerClosed) {
log.Fatal(err)
}
}
diff --git a/data/botPolicies.json b/data/botPolicies.json
index 25a7b77..d0e27a2 100644
--- a/data/botPolicies.json
+++ b/data/botPolicies.json
@@ -344,12 +344,6 @@
]
},
{
- "_comment": "This has been reverse-engineered through making iMessage's preview function hit a URL that prints the user-agent in the server logs.",
- "name": "iMessage preview",
- "user_agent_regex": ".*facebookexternalhit/1\\.1 Facebot Twitterbot/1\\.0$",
- "action": "ALLOW"
- },
- {
"name": "us-artificial-intelligence-scraper",
"user_agent_regex": "\\+https\\://github\\.com/US-Artificial-Intelligence/scraper",
"action": "DENY"
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
index 0937cc3..d7c6acb 100644
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@@ -14,7 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added support for native Debian, Red Hat, and tarball packaging strategies including installation and use directions.
- A prebaked tarball has been added, allowing distros to build Anubis like they could in v1.15.x.
- The placeholder Anubis mascot has been replaced with a design by [CELPHASE](https://bsky.app/profile/celphase.bsky.social).
-- Allow iMessage's link preview fetcher through Anubis by default.
- Added a periodic cleanup routine for the decaymap that removes expired entries, ensuring stale data is properly pruned.
- Added a no-store Cache-Control header to the challenge page
- Hide the directory listings for Anubis' internal static content
@@ -38,6 +37,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `zizmor` for GitHub Actions static analysis
- Fixed most `zizmor` findings
- Enabled Dependabot
+- Added an air config for autoreload support in development ([#195](https://github.com/TecharoHQ/anubis/pull/195))
+- Added support for [OpenGraph tags](https://ogp.me/) when rendering the challenge page. This allows for social previews to be generated when sharing the challenge page on social media platforms ([#195](https://github.com/TecharoHQ/anubis/pull/195))
- Added an `--extract-resources` flag to extract static resources to a local folder.
- Add noindex flag to all Anubis pages ([#227](https://github.com/TecharoHQ/anubis/issues/227)).
diff --git a/docs/docs/admin/configuration/open-graph.mdx b/docs/docs/admin/configuration/open-graph.mdx
new file mode 100644
index 0000000..87dd404
--- /dev/null
+++ b/docs/docs/admin/configuration/open-graph.mdx
@@ -0,0 +1,47 @@
+---
+id: open-graph
+title: Open Graph Configuration
+---
+
+# Open Graph Configuration
+
+This page provides detailed information on how to configure [OpenGraph tag](https://ogp.me/) passthrough in Anubis. This enables social previews of resources protected by Anubis without having to exempt each scraper individually.
+
+## Configuration Options
+
+| Name | Description | Type | Default | Example |
+|------------------|-----------------------------------------------------------|----------|---------|-------------------------|
+| `OG_PASSTHROUGH` | Enables or disables the Open Graph tag passthrough system | Boolean | `false` | `OG_PASSTHROUGH=true` |
+| `OG_EXPIRY_TIME` | Configurable cache expiration time for Open Graph tags | Duration | `24h` | `OG_EXPIRY_TIME=1h` |
+
+## Usage
+
+To configure Open Graph tags, you can set the following options as environment variables, in an environment file, or as flags in your Anubis configuration:
+
+```sh
+export OG_PASSTHROUGH=true
+export OG_EXPIRY_TIME=1h
+```
+
+## Implementation Details
+
+When `OG_PASSTHROUGH` is enabled, Anubis will:
+
+1. Check a local cache for the requested URL's Open Graph tags.
+2. If a cached entry exists and is still valid, return the cached tags.
+3. If the cached entry is stale or not found, fetch the URL, parse the Open Graph tags, update the cache, and return the new tags.
+
+The cache expiration time is controlled by `OG_EXPIRY_TIME`.
+
+## Example
+
+Here is an example of how to configure Open Graph tags in your Anubis setup:
+
+```sh
+export OG_PASSTHROUGH=true
+export OG_EXPIRY_TIME=1h
+```
+
+With these settings, Anubis will cache Open Graph tags for 1 hour and pass them through to the challenge page.
+
+For more information, refer to the [installation guide](../installation).
diff --git a/docs/docs/admin/installation.mdx b/docs/docs/admin/installation.mdx
index adf5cc9..c819e09 100644
--- a/docs/docs/admin/installation.mdx
+++ b/docs/docs/admin/installation.mdx
@@ -53,12 +53,16 @@ Anubis uses these environment variables for configuration:
| `ED25519_PRIVATE_KEY_HEX_FILE` | unset | Path to a file containing the hex-encoded ed25519 private key. Only one of this or its sister option may be set. |
| `METRICS_BIND` | `:9090` | The network address that Anubis serves Prometheus metrics on. See `BIND` for more information. |
| `METRICS_BIND_NETWORK` | `tcp` | The address family that the Anubis metrics server listens on. See `BIND_NETWORK` for more information. |
-| `SOCKET_MODE` | `0770` | _Only used when at least one of the `*_BIND_NETWORK` variables are set to `unix`._ The socket mode (permissions) for Unix domain sockets. |
+| `OG_EXPIRY_TIME` | `24h` | The expiration time for the Open Graph tag cache. |
+| `OG_PASSTHROUGH` | `false` | If set to `true`, Anubis will enable Open Graph tag passthrough. |
| `POLICY_FNAME` | unset | The file containing [bot policy configuration](./policies.md). See the bot policy documentation for more details. If unset, the default bot policy configuration is used. |
| `SERVE_ROBOTS_TXT` | `false` | If set `true`, Anubis will serve a default `robots.txt` file that disallows all known AI scrapers by name and then additionally disallows every scraper. This is useful if facts and circumstances make it difficult to change the underlying service to serve such a `robots.txt` file. |
+| `SOCKET_MODE` | `0770` | _Only used when at least one of the `*_BIND_NETWORK` variables are set to `unix`._ The socket mode (permissions) for Unix domain sockets. |
| `TARGET` | `http://localhost:3923` | The URL of the service that Anubis should forward valid requests to. Supports Unix domain sockets, set this to a URI like so: `unix:///path/to/socket.sock`. |
| `USE_REMOTE_ADDRESS` | unset | If set to `true`, Anubis will take the client's IP from the network socket. For production deployments, it is expected that a reverse proxy is used in front of Anubis, which pass the IP using headers, instead. |
+For more detailed information on configuring Open Graph tags, please refer to the [Open Graph Configuration](./configuration/open-graph.mdx) page.
+
### Key generation
To generate an ed25519 private key, you can use this command:
@@ -86,6 +90,8 @@ services:
SERVE_ROBOTS_TXT: "true"
TARGET: "http://nginx"
POLICY_FNAME: "/data/cfg/botPolicy.json"
+ OG_PASSTHROUGH: "true"
+ OG_EXPIRY_TIME: "24h"
ports:
- 8080:8080
volumes:
@@ -122,6 +128,10 @@ containers:
value: "true"
- name: "TARGET"
value: "http://localhost:5000"
+ - name: "OG_PASSTHROUGH"
+ value: "true"
+ - name: "OG_EXPIRY_TIME"
+ value: "24h"
resources:
limits:
cpu: 500m
diff --git a/go.mod b/go.mod
index 91d2545..7513182 100644
--- a/go.mod
+++ b/go.mod
@@ -10,6 +10,7 @@ require (
github.com/prometheus/client_golang v1.21.1
github.com/sebest/xff v0.0.0-20210106013422-671bd2870b3a
github.com/yl2chen/cidranger v1.0.2
+ golang.org/x/net v0.37.0
)
require (
@@ -42,7 +43,6 @@ require (
github.com/prometheus/procfs v0.15.1 // indirect
golang.org/x/exp/typeparams v0.0.0-20231108232855-2478ac86f678 // indirect
golang.org/x/mod v0.24.0 // indirect
- golang.org/x/net v0.37.0 // indirect
golang.org/x/sync v0.12.0 // indirect
golang.org/x/sys v0.31.0 // indirect
golang.org/x/tools v0.31.0 // indirect
diff --git a/internal/headers.go b/internal/headers.go
index 5c6a218..bdb5e9e 100644
--- a/internal/headers.go
+++ b/internal/headers.go
@@ -13,6 +13,7 @@ import (
// UnchangingCache sets the Cache-Control header to cache a response for 1 year if
// and only if the application is compiled in "release" mode by Docker.
func UnchangingCache(next http.Handler) http.Handler {
+ //goland:noinspection GoBoolExpressions
if anubis.Version == "devel" {
return next
}
@@ -68,11 +69,10 @@ func XForwardedForToXRealIP(next http.Handler) http.Handler {
func NoStoreCache(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Cache-Control", "no-store")
- next.ServeHTTP(w, r)
+ next.ServeHTTP(w, r)
})
}
-
// Do not allow browsing directory listings in paths that end with /
func NoBrowsing(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
diff --git a/internal/ogtags/cache.go b/internal/ogtags/cache.go
new file mode 100644
index 0000000..0d1a615
--- /dev/null
+++ b/internal/ogtags/cache.go
@@ -0,0 +1,51 @@
+package ogtags
+
+import (
+ "errors"
+ "log/slog"
+ "net/url"
+ "syscall"
+)
+
+// GetOGTags is the main function that retrieves Open Graph tags for a URL
+func (c *OGTagCache) GetOGTags(url *url.URL) (map[string]string, error) {
+ if url == nil {
+ return nil, errors.New("nil URL provided, cannot fetch OG tags")
+ }
+ urlStr := c.getTarget(url)
+ // Check cache first
+ if cachedTags := c.checkCache(urlStr); cachedTags != nil {
+ return cachedTags, nil
+ }
+
+ // Fetch HTML content
+ doc, err := c.fetchHTMLDocument(urlStr)
+ if errors.Is(err, syscall.ECONNREFUSED) {
+ slog.Debug("Connection refused, returning empty tags")
+ return nil, nil
+ } else if errors.Is(err, ErrNotFound) {
+ // not even worth a debug log...
+ return nil, nil
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ // Extract OG tags
+ ogTags := c.extractOGTags(doc)
+
+ // Store in cache
+ c.cache.Set(urlStr, ogTags, c.ogTimeToLive)
+
+ return ogTags, nil
+}
+
+// checkCache checks if we have the tags cached and returns them if so
+func (c *OGTagCache) checkCache(urlStr string) map[string]string {
+ if cachedTags, ok := c.cache.Get(urlStr); ok {
+ slog.Debug("cache hit", "tags", cachedTags)
+ return cachedTags
+ }
+ slog.Debug("cache miss", "url", urlStr)
+ return nil
+}
diff --git a/internal/ogtags/cache_test.go b/internal/ogtags/cache_test.go
new file mode 100644
index 0000000..cd32414
--- /dev/null
+++ b/internal/ogtags/cache_test.go
@@ -0,0 +1,122 @@
+package ogtags
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "net/url"
+ "testing"
+ "time"
+)
+
+func TestCheckCache(t *testing.T) {
+ cache := NewOGTagCache("http://example.com", true, time.Minute)
+
+ // Set up test data
+ urlStr := "http://example.com/page"
+ expectedTags := map[string]string{
+ "og:title": "Test Title",
+ "og:description": "Test Description",
+ }
+
+ // Test cache miss
+ tags := cache.checkCache(urlStr)
+ if tags != nil {
+ t.Errorf("expected nil tags on cache miss, got %v", tags)
+ }
+
+ // Manually add to cache
+ cache.cache.Set(urlStr, expectedTags, time.Minute)
+
+ // Test cache hit
+ tags = cache.checkCache(urlStr)
+ if tags == nil {
+ t.Fatal("expected non-nil tags on cache hit, got nil")
+ }
+
+ for key, expectedValue := range expectedTags {
+ if value, ok := tags[key]; !ok || value != expectedValue {
+ t.Errorf("expected %s: %s, got: %s", key, expectedValue, value)
+ }
+ }
+}
+
+func TestGetOGTags(t *testing.T) {
+ var loadCount int // Counter to track how many times the test route is loaded
+
+ // Create a test server to serve a sample HTML page with OG tags
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ loadCount++
+ if loadCount > 1 {
+ t.Fatalf("Test route loaded more than once, cache failed")
+ }
+ w.Header().Set("Content-Type", "text/html")
+ w.Write([]byte(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="og:title" content="Test Title" />
+ <meta property="og:description" content="Test Description" />
+ <meta property="og:image" content="http://example.com/image.jpg" />
+ </head>
+ <body>
+ <p>Hello, world!</p>
+ </body>
+ </html>
+ `))
+ }))
+ defer ts.Close()
+
+ // Create an instance of OGTagCache with a short TTL for testing
+ cache := NewOGTagCache(ts.URL, true, 1*time.Minute)
+
+ // Parse the test server URL
+ parsedURL, err := url.Parse(ts.URL)
+ if err != nil {
+ t.Fatalf("failed to parse test server URL: %v", err)
+ }
+
+ // Test fetching OG tags from the test server
+ ogTags, err := cache.GetOGTags(parsedURL)
+ if err != nil {
+ t.Fatalf("failed to get OG tags: %v", err)
+ }
+
+ // Verify the fetched OG tags
+ expectedTags := map[string]string{
+ "og:title": "Test Title",
+ "og:description": "Test Description",
+ "og:image": "http://example.com/image.jpg",
+ }
+
+ for key, expectedValue := range expectedTags {
+ if value, ok := ogTags[key]; !ok || value != expectedValue {
+ t.Errorf("expected %s: %s, got: %s", key, expectedValue, value)
+ }
+ }
+
+ // Test fetching OG tags from the cache
+ ogTags, err = cache.GetOGTags(parsedURL)
+ if err != nil {
+ t.Fatalf("failed to get OG tags from cache: %v", err)
+ }
+
+ // Test fetching OG tags from the cache (3rd time)
+ newOgTags, err := cache.GetOGTags(parsedURL)
+ if err != nil {
+ t.Fatalf("failed to get OG tags from cache: %v", err)
+ }
+
+ // Verify the cached OG tags
+ for key, expectedValue := range expectedTags {
+ if value, ok := ogTags[key]; !ok || value != expectedValue {
+ t.Errorf("expected %s: %s, got: %s", key, expectedValue, value)
+ }
+
+ initialValue := ogTags[key]
+ cachedValue, ok := newOgTags[key]
+ if !ok || initialValue != cachedValue {
+ t.Errorf("Cache does not line up: expected %s: %s, got: %s", key, initialValue, cachedValue)
+ }
+
+ }
+}
diff --git a/internal/ogtags/fetch.go b/internal/ogtags/fetch.go
new file mode 100644
index 0000000..3ea9aac
--- /dev/null
+++ b/internal/ogtags/fetch.go
@@ -0,0 +1,69 @@
+package ogtags
+
+import (
+ "errors"
+ "fmt"
+ "golang.org/x/net/html"
+ "log/slog"
+ "mime"
+ "net"
+ "net/http"
+)
+
+var (
+ ErrNotFound = errors.New("page not found") /*todo: refactor into common errors lib? */
+ emptyMap = map[string]string{} // used to indicate an empty result in the cache. Can't use nil as it would be a cache miss.
+)
+
+func (c *OGTagCache) fetchHTMLDocument(urlStr string) (*html.Node, error) {
+ resp, err := c.client.Get(urlStr)
+ if err != nil {
+ var netErr net.Error
+ if errors.As(err, &netErr) && netErr.Timeout() {
+ slog.Debug("og: request timed out", "url", urlStr)
+ c.cache.Set(urlStr, emptyMap, c.ogTimeToLive/2) // Cache empty result for half the TTL to not spam the server
+ }
+ return nil, fmt.Errorf("http get failed: %w", err)
+ }
+ // this defer will call MaxBytesReader's Close, which closes the original body.
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ slog.Debug("og: received non-OK status code", "url", urlStr, "status", resp.StatusCode)
+ c.cache.Set(urlStr, emptyMap, c.ogTimeToLive) // Cache empty result for non-successful status codes
+ return nil, ErrNotFound
+ }
+
+ // Check content type
+ ct := resp.Header.Get("Content-Type")
+ if ct == "" {
+ // assume non html body
+ return nil, fmt.Errorf("missing Content-Type header")
+ } else {
+ mediaType, _, err := mime.ParseMediaType(ct)
+ if err != nil {
+ // Malformed Content-Type header
+ return nil, fmt.Errorf("invalid Content-Type '%s': %w", ct, err)
+ }
+
+ if mediaType != "text/html" && mediaType != "application/xhtml+xml" {
+ return nil, fmt.Errorf("unsupported Content-Type: %s", mediaType)
+ }
+ }
+
+ resp.Body = http.MaxBytesReader(nil, resp.Body, c.maxContentLength)
+
+ doc, err := html.Parse(resp.Body)
+ if err != nil {
+ // Check if the error is specifically because the limit was exceeded
+ var maxBytesErr *http.MaxBytesError
+ if errors.As(err, &maxBytesErr) {
+ slog.Debug("og: content exceeded max length", "url", urlStr, "limit", c.maxContentLength)
+ return nil, fmt.Errorf("content too large: exceeded %d bytes", c.maxContentLength)
+ }
+ // parsing error (e.g., malformed HTML)
+ return nil, fmt.Errorf("failed to parse HTML: %w", err)
+ }
+
+ return doc, nil
+}
diff --git a/internal/ogtags/fetch_test.go b/internal/ogtags/fetch_test.go
new file mode 100644
index 0000000..60af957
--- /dev/null
+++ b/internal/ogtags/fetch_test.go
@@ -0,0 +1,119 @@
+package ogtags
+
+import (
+ "fmt"
+ "io"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "strings"
+ "testing"
+ "time"
+)
+
+func TestFetchHTMLDocument(t *testing.T) {
+ tests := []struct {
+ name string
+ htmlContent string
+ contentType string
+ statusCode int
+ contentLength int64
+ expectError bool
+ }{
+ {
+ name: "Valid HTML",
+ htmlContent: `<!DOCTYPE html>
+ <html>
+ <head><title>Test</title></head>
+ <body><p>Test content</p></body>
+ </html>`,
+ contentType: "text/html",
+ statusCode: http.StatusOK,
+ expectError: false,
+ },
+ {
+ name: "Empty HTML",
+ htmlContent: "",
+ contentType: "text/html",
+ statusCode: http.StatusOK,
+ expectError: false,
+ },
+ {
+ name: "Not found error",
+ htmlContent: "",
+ contentType: "text/html",
+ statusCode: http.StatusNotFound,
+ expectError: true,
+ },
+ {
+ name: "Unsupported Content-Type",
+ htmlContent: "*Insert rick roll here*",
+ contentType: "video/mp4",
+ statusCode: http.StatusOK,
+ expectError: true,
+ },
+ {
+ name: "Too large content",
+ contentType: "text/html",
+ statusCode: http.StatusOK,
+ expectError: true,
+ contentLength: 5 * 1024 * 1024, // 5MB (over 2MB limit)
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if tt.contentType != "" {
+ w.Header().Set("Content-Type", tt.contentType)
+ }
+ if tt.contentLength > 0 {
+ // Simulate content length but avoid sending too much actual data
+ w.Header().Set("Content-Length", fmt.Sprintf("%d", tt.contentLength))
+ io.CopyN(w, strings.NewReader("X"), tt.contentLength)
+ } else {
+ w.WriteHeader(tt.statusCode)
+ w.Write([]byte(tt.htmlContent))
+ }
+ }))
+ defer ts.Close()
+
+ cache := NewOGTagCache("", true, time.Minute)
+ doc, err := cache.fetchHTMLDocument(ts.URL)
+
+ if tt.expectError {
+ if err == nil {
+ t.Error("expected error, got nil")
+ }
+ if doc != nil {
+ t.Error("expected nil document on error, got non-nil")
+ }
+ } else {
+ if err != nil {
+ t.Errorf("unexpected error: %v", err)
+ }
+ if doc == nil {
+ t.Error("expected non-nil document, got nil")
+ }
+ }
+ })
+ }
+}
+
+func TestFetchHTMLDocumentInvalidURL(t *testing.T) {
+ if os.Getenv("DONT_USE_NETWORK") != "" {
+ t.Skip("test requires theoretical network egress")
+ }
+
+ cache := NewOGTagCache("", true, time.Minute)
+
+ doc, err := cache.fetchHTMLDocument("http://invalid.url.that.doesnt.exist.example")
+
+ if err == nil {
+ t.Error("expected error for invalid URL, got nil")
+ }
+
+ if doc != nil {
+ t.Error("expected nil document for invalid URL, got non-nil")
+ }
+}
diff --git a/internal/ogtags/integration_test.go b/internal/ogtags/integration_test.go
new file mode 100644
index 0000000..9eaaa3a
--- /dev/null
+++ b/internal/ogtags/integration_test.go
@@ -0,0 +1,155 @@
+package ogtags
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "net/url"
+ "testing"
+ "time"
+)
+
+func TestIntegrationGetOGTags(t *testing.T) {
+ ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/html")
+
+ switch r.URL.Path {
+ case "/simple":
+ w.Write([]byte(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="og:title" content="Simple Page" />
+ <meta property="og:type" content="website" />
+ </head>
+ <body><p>Simple page content</p></body>
+ </html>
+ `))
+ case "/complete":
+ w.Write([]byte(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta property="og:title" content="Complete Page" />
+ <meta property="og:description" content="A page with many OG tags" />
+ <meta property="og:image" content="http://example.com/image.jpg" />
+ <meta property="og:url" content="http://example.com/complete" />
+ <meta property="og:type" content="article" />
+ </head>
+ <body><p>Complete page content</p></body>
+ </html>
+ `))
+ case "/no-og":
+ w.Write([]byte(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>No OG Tags</title>
+ </head>
+ <body><p>No OG tags here</p></body>
+ </html>
+ `))
+ default:
+ w.WriteHeader(http.StatusNotFound)
+ }
+ }))
+ defer ts.Close()
+
+ // Test with different configurations
+ testCases := []struct {
+ name string
+ path string
+ query string
+ expectedTags map[string]string
+ expectError bool
+ }{
+ {
+ name: "Simple page",
+ path: "/simple",
+ query: "",
+ expectedTags: map[string]string{
+ "og:title": "Simple Page",
+ "og:type": "website",
+ },
+ expectError: false,
+ },
+ {
+ name: "Complete page",
+ path: "/complete",
+ query: "ref=test",
+ expectedTags: map[string]string{
+ "og:title": "Complete Page",
+ "og:description": "A page with many OG tags",
+ "og:image": "http://example.com/image.jpg",
+ "og:url": "http://example.com/complete",
+ "og:type": "article",
+ },
+ expectError: false,
+ },
+ {
+ name: "Page with no OG tags",
+ path: "/no-og",
+ query: "",
+ expectedTags: map[string]string{},
+ expectError: false,
+ },
+ {
+ name: "Non-existent page",
+ path: "/not-found",