aboutsummaryrefslogtreecommitdiff
path: root/internal/ogtags/fetch.go
blob: 7e02ecaae0c2ceed531933d5e2d2f7d57083cd7c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package ogtags

import (
	"errors"
	"fmt"
	"golang.org/x/net/html"
	"io"
	"log/slog"
	"mime"
	"net"
	"net/http"
)

var (
	ErrOgHandled = errors.New("og: handled error") // sentinel wrapped into errors that were already handled, so callers should not log them again
	emptyMap     = map[string]string{}             // cached to record an empty result; nil can't be used because it would look like a cache miss
)

// fetchHTMLDocument retrieves urlStr with the cache's HTTP client and parses
// the response body into an HTML node tree.
//
// Failure handling:
//   - A request timeout is logged and an empty result is cached for half the
//     TTL; the transport error is returned wrapped.
//   - A non-200 status is logged and an empty result is cached for the full
//     TTL; the returned error wraps ErrOgHandled.
//   - A missing Content-Type header yields a plain error.
//   - A malformed or non-HTML Content-Type is logged; the returned error wraps
//     ErrOgHandled.
//   - The body is read through http.MaxBytesReader capped at
//     c.maxContentLength; exceeding the cap is logged and reported as
//     "content too large".
//
// On any error the returned node is nil.
func (c *OGTagCache) fetchHTMLDocument(urlStr string) (*html.Node, error) {
	resp, err := c.client.Get(urlStr)
	if err != nil {
		var netErr net.Error
		if errors.As(err, &netErr) && netErr.Timeout() {
			slog.Debug("og: request timed out", "url", urlStr)
			c.cache.Set(urlStr, emptyMap, c.ogTimeToLive/2) // Cache empty result for half the TTL to not spam the server
		}
		return nil, fmt.Errorf("http get failed: %w", err)
	}
	// The argument is evaluated now, before resp.Body is wrapped in a
	// MaxBytesReader further down, so this closes the original body directly.
	defer func(body io.ReadCloser) {
		if err := body.Close(); err != nil {
			slog.Debug("og: error closing response body", "url", urlStr, "error", err)
		}
	}(resp.Body)

	if resp.StatusCode != http.StatusOK {
		slog.Debug("og: received non-OK status code", "url", urlStr, "status", resp.StatusCode)
		c.cache.Set(urlStr, emptyMap, c.ogTimeToLive) // Cache empty result for non-successful status codes
		// Report the real status: this branch covers every non-200 response,
		// not only 404.
		return nil, fmt.Errorf("%w: unexpected status code %d", ErrOgHandled, resp.StatusCode)
	}

	// Check content type
	ct := resp.Header.Get("Content-Type")
	if ct == "" {
		// assume non html body
		return nil, fmt.Errorf("missing Content-Type header")
	}

	mediaType, _, err := mime.ParseMediaType(ct)
	if err != nil {
		// Malformed Content-Type header
		slog.Debug("og: malformed Content-Type header", "url", urlStr, "contentType", ct)
		return nil, fmt.Errorf("%w: malformed Content-Type header: %w", ErrOgHandled, err)
	}

	if mediaType != "text/html" && mediaType != "application/xhtml+xml" {
		slog.Debug("og: unsupported Content-Type", "url", urlStr, "contentType", mediaType)
		return nil, fmt.Errorf("%w: unsupported Content-Type: %s", ErrOgHandled, mediaType)
	}

	// Cap how much of the body the parser may consume. The ResponseWriter
	// argument is nil because this is a client-side read.
	resp.Body = http.MaxBytesReader(nil, resp.Body, c.maxContentLength)

	doc, err := html.Parse(resp.Body)
	if err != nil {
		// Check if the error is specifically because the limit was exceeded
		var maxBytesErr *http.MaxBytesError
		if errors.As(err, &maxBytesErr) {
			slog.Debug("og: content exceeded max length", "url", urlStr, "limit", c.maxContentLength)
			return nil, fmt.Errorf("content too large: exceeded %d bytes", c.maxContentLength)
		}
		// parsing error (e.g., malformed HTML)
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	return doc, nil
}