diff options
| author | Xe Iaso <me@xeiaso.net> | 2025-04-20 20:09:27 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-04-21 00:09:27 +0000 |
| commit | d40b5cfdab11c62dc2ed226bde32b19ea7107f21 (patch) | |
| tree | 2c997fd907c0de22e98b1ab6dc233141e10567e1 | |
| parent | 022eb59ff337427ae4fe151fd1f24ec411418479 (diff) | |
| download | anubis-d40b5cfdab11c62dc2ed226bde32b19ea7107f21.tar.xz anubis-d40b5cfdab11c62dc2ed226bde32b19ea7107f21.zip | |
lib: move config to yaml (#307)
* lib: move config to yaml
Signed-off-by: Xe Iaso <me@xeiaso.net>
* web: run go generate
Signed-off-by: Xe Iaso <me@xeiaso.net>
* Add Haiku to known instances (#304)
Signed-off-by: Asmodeus <46908100+AsmodeumX@users.noreply.github.com>
* Add headers bot rule (#300)
* Closes #291: add headers support to bot policy rules
* Fix config validator
* update docs for JSON -> YAML
Signed-off-by: Xe Iaso <me@xeiaso.net>
* docs: document http header based actions
Signed-off-by: Xe Iaso <me@xeiaso.net>
* lib: add missing test
Signed-off-by: Xe Iaso <me@xeiaso.net>
* Apply suggestions from code review
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Xe Iaso <me@xeiaso.net>
---------
Signed-off-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Asmodeus <46908100+AsmodeumX@users.noreply.github.com>
Co-authored-by: Asmodeus <46908100+AsmodeumX@users.noreply.github.com>
Co-authored-by: Neur0toxine <pashok9825@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
| -rw-r--r-- | data/botPolicies.json | 9 | ||||
| -rw-r--r-- | data/botPolicies.yaml | 662 | ||||
| -rw-r--r-- | data/embed.go | 2 | ||||
| -rw-r--r-- | docs/docs/CHANGELOG.md | 1 | ||||
| -rw-r--r-- | docs/docs/admin/environments/docker-compose.mdx | 4 | ||||
| -rw-r--r-- | docs/docs/admin/installation.mdx | 2 | ||||
| -rw-r--r-- | docs/docs/admin/native-install.mdx | 8 | ||||
| -rw-r--r-- | docs/docs/admin/policies.mdx (renamed from docs/docs/admin/policies.md) | 110 | ||||
| -rw-r--r-- | docs/docs/index.mdx | 2 | ||||
| -rw-r--r-- | go.mod | 3 | ||||
| -rw-r--r-- | go.sum | 6 | ||||
| -rw-r--r-- | lib/anubis.go | 4 | ||||
| -rw-r--r-- | lib/anubis_test.go | 19 | ||||
| -rw-r--r-- | lib/policy/config/config_test.go | 7 | ||||
| -rw-r--r-- | lib/policy/config/testdata/bad/badregexes.yaml | 7 | ||||
| -rw-r--r-- | lib/policy/config/testdata/bad/invalid.yaml | 1 | ||||
| -rw-r--r-- | lib/policy/config/testdata/bad/nobots.yaml | 1 | ||||
| -rw-r--r-- | lib/policy/config/testdata/good/allow_everyone.yaml | 6 | ||||
| -rw-r--r-- | lib/policy/config/testdata/good/block_cf_workers.yaml | 5 | ||||
| -rw-r--r-- | lib/policy/config/testdata/good/challengemozilla.yaml | 4 | ||||
| -rw-r--r-- | lib/policy/config/testdata/good/everything_blocked.yaml | 4 | ||||
| -rw-r--r-- | lib/policy/policy.go | 6 |
22 files changed, 854 insertions, 19 deletions
diff --git a/data/botPolicies.json b/data/botPolicies.json index dbc3d35..1993d22 100644 --- a/data/botPolicies.json +++ b/data/botPolicies.json @@ -1,6 +1,13 @@ { "bots": [ { + "name": "cloudflare-workers", + "headers_regex": { + "CF-Worker": ".*" + }, + "action": "DENY" + }, + { "name": "ai-robots-txt", "user_agent_regex": "AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot", "action": "DENY" @@ -680,4 +687,4 @@ } ], "dnsbl": false -} +}
\ No newline at end of file diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml new file mode 100644 index 0000000..e8448ac --- /dev/null +++ b/data/botPolicies.yaml @@ -0,0 +1,662 @@ +bots: +# Pathological bots to deny +- name: us-artificial-intelligence-scraper + user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper + action: DENY +- name: lightpanda + user_agent_regex: ^LightPanda/.*$ + action: DENY +- name: headless-chrome + user_agent_regex: HeadlessChrome + action: DENY +- name: headless-chromium + user_agent_regex: HeadlessChromium + action: DENY +- name: "ai-robots-txt" + user_agent_regex: > + AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot + action: DENY +- name: cloudflare-workers + headers_regex: + CF-Worker: .* + action: DENY + +# search engines to allow +- name: googlebot + user_agent_regex: \+http\://www\.google\.com/bot\.html + action: ALLOW + # https://developers.google.com/static/search/apis/ipranges/googlebot.json + remote_addresses: [ + "2001:4860:4801:10::/64", + "2001:4860:4801:11::/64", + "2001:4860:4801:12::/64", + "2001:4860:4801:13::/64", + "2001:4860:4801:14::/64", + "2001:4860:4801:15::/64", + "2001:4860:4801:16::/64", + "2001:4860:4801:17::/64", + "2001:4860:4801:18::/64", + "2001:4860:4801:19::/64", + "2001:4860:4801:1a::/64", + "2001:4860:4801:1b::/64", + "2001:4860:4801:1c::/64", + "2001:4860:4801:1d::/64", + "2001:4860:4801:1e::/64", + 
"2001:4860:4801:1f::/64", + "2001:4860:4801:20::/64", + "2001:4860:4801:21::/64", + "2001:4860:4801:22::/64", + "2001:4860:4801:23::/64", + "2001:4860:4801:24::/64", + "2001:4860:4801:25::/64", + "2001:4860:4801:26::/64", + "2001:4860:4801:27::/64", + "2001:4860:4801:28::/64", + "2001:4860:4801:29::/64", + "2001:4860:4801:2::/64", + "2001:4860:4801:2a::/64", + "2001:4860:4801:2b::/64", + "2001:4860:4801:2c::/64", + "2001:4860:4801:2d::/64", + "2001:4860:4801:2e::/64", + "2001:4860:4801:2f::/64", + "2001:4860:4801:31::/64", + "2001:4860:4801:32::/64", + "2001:4860:4801:33::/64", + "2001:4860:4801:34::/64", + "2001:4860:4801:35::/64", + "2001:4860:4801:36::/64", + "2001:4860:4801:37::/64", + "2001:4860:4801:38::/64", + "2001:4860:4801:39::/64", + "2001:4860:4801:3a::/64", + "2001:4860:4801:3b::/64", + "2001:4860:4801:3c::/64", + "2001:4860:4801:3d::/64", + "2001:4860:4801:3e::/64", + "2001:4860:4801:40::/64", + "2001:4860:4801:41::/64", + "2001:4860:4801:42::/64", + "2001:4860:4801:43::/64", + "2001:4860:4801:44::/64", + "2001:4860:4801:45::/64", + "2001:4860:4801:46::/64", + "2001:4860:4801:47::/64", + "2001:4860:4801:48::/64", + "2001:4860:4801:49::/64", + "2001:4860:4801:4a::/64", + "2001:4860:4801:4b::/64", + "2001:4860:4801:4c::/64", + "2001:4860:4801:50::/64", + "2001:4860:4801:51::/64", + "2001:4860:4801:52::/64", + "2001:4860:4801:53::/64", + "2001:4860:4801:54::/64", + "2001:4860:4801:55::/64", + "2001:4860:4801:56::/64", + "2001:4860:4801:60::/64", + "2001:4860:4801:61::/64", + "2001:4860:4801:62::/64", + "2001:4860:4801:63::/64", + "2001:4860:4801:64::/64", + "2001:4860:4801:65::/64", + "2001:4860:4801:66::/64", + "2001:4860:4801:67::/64", + "2001:4860:4801:68::/64", + "2001:4860:4801:69::/64", + "2001:4860:4801:6a::/64", + "2001:4860:4801:6b::/64", + "2001:4860:4801:6c::/64", + "2001:4860:4801:6d::/64", + "2001:4860:4801:6e::/64", + "2001:4860:4801:6f::/64", + "2001:4860:4801:70::/64", + "2001:4860:4801:71::/64", + "2001:4860:4801:72::/64", + 
"2001:4860:4801:73::/64", + "2001:4860:4801:74::/64", + "2001:4860:4801:75::/64", + "2001:4860:4801:76::/64", + "2001:4860:4801:77::/64", + "2001:4860:4801:78::/64", + "2001:4860:4801:79::/64", + "2001:4860:4801:80::/64", + "2001:4860:4801:81::/64", + "2001:4860:4801:82::/64", + "2001:4860:4801:83::/64", + "2001:4860:4801:84::/64", + "2001:4860:4801:85::/64", + "2001:4860:4801:86::/64", + "2001:4860:4801:87::/64", + "2001:4860:4801:88::/64", + "2001:4860:4801:90::/64", + "2001:4860:4801:91::/64", + "2001:4860:4801:92::/64", + "2001:4860:4801:93::/64", + "2001:4860:4801:94::/64", + "2001:4860:4801:95::/64", + "2001:4860:4801:96::/64", + "2001:4860:4801:a0::/64", + "2001:4860:4801:a1::/64", + "2001:4860:4801:a2::/64", + "2001:4860:4801:a3::/64", + "2001:4860:4801:a4::/64", + "2001:4860:4801:a5::/64", + "2001:4860:4801:c::/64", + "2001:4860:4801:f::/64", + "192.178.5.0/27", + "192.178.6.0/27", + "192.178.6.128/27", + "192.178.6.160/27", + "192.178.6.192/27", + "192.178.6.32/27", + "192.178.6.64/27", + "192.178.6.96/27", + "34.100.182.96/28", + "34.101.50.144/28", + "34.118.254.0/28", + "34.118.66.0/28", + "34.126.178.96/28", + "34.146.150.144/28", + "34.147.110.144/28", + "34.151.74.144/28", + "34.152.50.64/28", + "34.154.114.144/28", + "34.155.98.32/28", + "34.165.18.176/28", + "34.175.160.64/28", + "34.176.130.16/28", + "34.22.85.0/27", + "34.64.82.64/28", + "34.65.242.112/28", + "34.80.50.80/28", + "34.88.194.0/28", + "34.89.10.80/28", + "34.89.198.80/28", + "34.96.162.48/28", + "35.247.243.240/28", + "66.249.64.0/27", + "66.249.64.128/27", + "66.249.64.160/27", + "66.249.64.224/27", + "66.249.64.32/27", + "66.249.64.64/27", + "66.249.64.96/27", + "66.249.65.0/27", + "66.249.65.128/27", + "66.249.65.160/27", + "66.249.65.192/27", + "66.249.65.224/27", + "66.249.65.32/27", + "66.249.65.64/27", + "66.249.65.96/27", + "66.249.66.0/27", + "66.249.66.128/27", + "66.249.66.160/27", + "66.249.66.192/27", + "66.249.66.224/27", + "66.249.66.32/27", + "66.249.66.64/27", + 
"66.249.66.96/27", + "66.249.68.0/27", + "66.249.68.128/27", + "66.249.68.32/27", + "66.249.68.64/27", + "66.249.68.96/27", + "66.249.69.0/27", + "66.249.69.128/27", + "66.249.69.160/27", + "66.249.69.192/27", + "66.249.69.224/27", + "66.249.69.32/27", + "66.249.69.64/27", + "66.249.69.96/27", + "66.249.70.0/27", + "66.249.70.128/27", + "66.249.70.160/27", + "66.249.70.192/27", + "66.249.70.224/27", + "66.249.70.32/27", + "66.249.70.64/27", + "66.249.70.96/27", + "66.249.71.0/27", + "66.249.71.128/27", + "66.249.71.160/27", + "66.249.71.192/27", + "66.249.71.224/27", + "66.249.71.32/27", + "66.249.71.64/27", + "66.249.71.96/27", + "66.249.72.0/27", + "66.249.72.128/27", + "66.249.72.160/27", + "66.249.72.192/27", + "66.249.72.224/27", + "66.249.72.32/27", + "66.249.72.64/27", + "66.249.72.96/27", + "66.249.73.0/27", + "66.249.73.128/27", + "66.249.73.160/27", + "66.249.73.192/27", + "66.249.73.224/27", + "66.249.73.32/27", + "66.249.73.64/27", + "66.249.73.96/27", + "66.249.74.0/27", + "66.249.74.128/27", + "66.249.74.160/27", + "66.249.74.192/27", + "66.249.74.32/27", + "66.249.74.64/27", + "66.249.74.96/27", + "66.249.75.0/27", + "66.249.75.128/27", + "66.249.75.160/27", + "66.249.75.192/27", + "66.249.75.224/27", + "66.249.75.32/27", + "66.249.75.64/27", + "66.249.75.96/27", + "66.249.76.0/27", + "66.249.76.128/27", + "66.249.76.160/27", + "66.249.76.192/27", + "66.249.76.224/27", + "66.249.76.32/27", + "66.249.76.64/27", + "66.249.76.96/27", + "66.249.77.0/27", + "66.249.77.128/27", + "66.249.77.160/27", + "66.249.77.192/27", + "66.249.77.224/27", + "66.249.77.32/27", + "66.249.77.64/27", + "66.249.77.96/27", + "66.249.78.0/27", + "66.249.78.32/27", + "66.249.79.0/27", + "66.249.79.128/27", + "66.249.79.160/27", + "66.249.79.192/27", + "66.249.79.224/27", + "66.249.79.32/27", + "66.249.79.64/27", + "66.249.79.96/27" + ] +- name: bingbot + user_agent_regex: \+http\://www\.bing\.com/bingbot\.htm + action: ALLOW + # https://www.bing.com/toolbox/bingbot.json + 
remote_addresses: [ + "157.55.39.0/24", + "207.46.13.0/24", + "40.77.167.0/24", + "13.66.139.0/24", + "13.66.144.0/24", + "52.167.144.0/24", + "13.67.10.16/28", + "13.69.66.240/28", + "13.71.172.224/28", + "139.217.52.0/28", + "191.233.204.224/28", + "20.36.108.32/28", + "20.43.120.16/28", + "40.79.131.208/28", + "40.79.186.176/28", + "52.231.148.0/28", + "20.79.107.240/28", + "51.105.67.0/28", + "20.125.163.80/28", + "40.77.188.0/22", + "65.55.210.0/24", + "199.30.24.0/23", + "40.77.202.0/24", + "40.77.139.0/25", + "20.74.197.0/28", + "20.15.133.160/27", + "40.77.177.0/24", + "40.77.178.0/23" + ] +- name: duckduckbot + user_agent_regex: DuckDuckBot/1\.1; \(\+http\://duckduckgo\.com/duckduckbot\.html\) + action: ALLOW + # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot + remote_addresses: [ + "57.152.72.128/32", + "51.8.253.152/32", + "40.80.242.63/32", + "20.12.141.99/32", + "20.49.136.28/32", + "51.116.131.221/32", + "51.107.40.209/32", + "20.40.133.240/32", + "20.50.168.91/32", + "51.120.48.122/32", + "20.193.45.113/32", + "40.76.173.151/32", + "40.76.163.7/32", + "20.185.79.47/32", + "52.142.26.175/32", + "20.185.79.15/32", + "52.142.24.149/32", + "40.76.162.208/32", + "40.76.163.23/32", + "40.76.162.191/32", + "40.76.162.247/32", + "40.88.21.235/32", + "20.191.45.212/32", + "52.146.59.12/32", + "52.146.59.156/32", + "52.146.59.154/32", + "52.146.58.236/32", + "20.62.224.44/32", + "51.104.180.53/32", + "51.104.180.47/32", + "51.104.180.26/32", + "51.104.146.225/32", + "51.104.146.235/32", + "20.73.202.147/32", + "20.73.132.240/32", + "20.71.12.143/32", + "20.56.197.58/32", + "20.56.197.63/32", + "20.43.150.93/32", + "20.43.150.85/32", + "20.44.222.1/32", + "40.89.243.175/32", + "13.89.106.77/32", + "52.143.242.6/32", + "52.143.241.111/32", + "52.154.60.82/32", + "20.197.209.11/32", + "20.197.209.27/32", + "20.226.133.105/32", + "191.234.216.4/32", + "191.234.216.178/32", + "20.53.92.211/32", + "20.53.91.2/32", + "20.207.99.197/32", + 
"20.207.97.190/32", + "40.81.250.205/32", + "40.64.106.11/32", + "40.64.105.247/32", + "20.72.242.93/32", + "20.99.255.235/32", + "20.113.3.121/32", + "52.224.16.221/32", + "52.224.21.53/32", + "52.224.20.204/32", + "52.224.21.19/32", + "52.224.20.249/32", + "52.224.20.203/32", + "52.224.20.190/32", + "52.224.16.229/32", + "52.224.21.20/32", + "52.146.63.80/32", + "52.224.20.227/32", + "52.224.20.193/32", + "52.190.37.160/32", + "52.224.21.23/32", + "52.224.20.223/32", + "52.224.20.181/32", + "52.224.21.49/32", + "52.224.21.55/32", + "52.224.21.61/32", + "52.224.19.152/32", + "52.224.20.186/32", + "52.224.21.27/32", + "52.224.21.51/32", + "52.224.20.174/32", + "52.224.21.4/32", + "51.104.164.109/32", + "51.104.167.71/32", + "51.104.160.177/32", + "51.104.162.149/32", + "51.104.167.95/32", + "51.104.167.54/32", + "51.104.166.111/32", + "51.104.167.88/32", + "51.104.161.32/32", + "51.104.163.250/32", + "51.104.164.189/32", + "51.104.167.19/32", + "51.104.160.167/32", + "51.104.167.110/32", + "20.191.44.119/32", + "51.104.167.104/32", + "20.191.44.234/32", + "51.104.164.215/32", + "51.104.167.52/32", + "20.191.44.22/32", + "51.104.167.87/32", + "51.104.167.96/32", + "20.191.44.16/32", + "51.104.167.61/32", + "51.104.164.147/32", + "20.50.48.159/32", + "40.114.182.172/32", + "20.50.50.130/32", + "20.50.50.163/32", + "20.50.50.46/32", + "40.114.182.153/32", + "20.50.50.118/32", + "20.50.49.55/32", + "20.50.49.25/32", + "40.114.183.251/32", + "20.50.50.123/32", + "20.50.49.237/32", + "20.50.48.192/32", + "20.50.50.134/32", + "51.138.90.233/32", + "40.114.183.196/32", + "20.50.50.146/32", + "40.114.183.88/32", + "20.50.50.145/32", + "20.50.50.121/32", + "20.50.49.40/32", + "51.138.90.206/32", + "40.114.182.45/32", + "51.138.90.161/32", + "20.50.49.0/32", + "40.119.232.215/32", + "104.43.55.167/32", + "40.119.232.251/32", + "40.119.232.50/32", + "40.119.232.146/32", + "40.119.232.218/32", + "104.43.54.127/32", + "104.43.55.117/32", + "104.43.55.116/32", + 
"104.43.55.166/32", + "52.154.169.50/32", + "52.154.171.70/32", + "52.154.170.229/32", + "52.154.170.113/32", + "52.154.171.44/32", + "52.154.172.2/32", + "52.143.244.81/32", + "52.154.171.87/32", + "52.154.171.250/32", + "52.154.170.28/32", + "52.154.170.122/32", + "52.143.243.117/32", + "52.143.247.235/32", + "52.154.171.235/32", + "52.154.171.196/32", + "52.154.171.0/32", + "52.154.170.243/32", + "52.154.170.26/32", + "52.154.169.200/32", + "52.154.170.96/32", + "52.154.170.88/32", + "52.154.171.150/32", + "52.154.171.205/32", + "52.154.170.117/32", + "52.154.170.209/32", + "191.235.202.48/32", + "191.233.3.202/32", + "191.235.201.214/32", + "191.233.3.197/32", + "191.235.202.38/32", + "20.53.78.144/32", + "20.193.24.10/32", + "20.53.78.236/32", + "20.53.78.138/32", + "20.53.78.123/32", + "20.53.78.106/32", + "20.193.27.215/32", + "20.193.25.197/32", + "20.193.12.126/32", + "20.193.24.251/32", + "20.204.242.101/32", + "20.207.72.113/32", + "20.204.242.19/32", + "20.219.45.67/32", + "20.207.72.11/32", + "20.219.45.190/32", + "20.204.243.55/32", + "20.204.241.148/32", + "20.207.72.110/32", + "20.204.240.172/32", + "20.207.72.21/32", + "20.204.246.81/32", + "20.207.107.181/32", + "20.204.246.254/32", + "20.219.43.246/32", + "52.149.25.43/32", + "52.149.61.51/32", + "52.149.58.139/32", + "52.149.60.38/32", + "52.148.165.38/32", + "52.143.95.162/32", + "52.149.56.151/32", + "52.149.30.45/32", + "52.149.58.173/32", + "52.143.95.204/32", + "52.149.28.83/32", + "52.149.58.69/32", + "52.148.161.87/32", + "52.149.58.27/32", + "52.149.28.18/32", + "20.79.226.26/32", + "20.79.239.66/32", + "20.79.238.198/32", + "20.113.14.159/32", + "20.75.144.152/32", + "20.43.172.120/32", + "20.53.134.160/32", + "20.201.15.208/32", + "20.93.28.24/32", + "20.61.34.40/32", + "52.242.224.168/32", + "20.80.129.80/32", + "20.195.108.47/32", + "4.195.133.120/32", + "4.228.76.163/32", + "4.182.131.108/32", + "4.209.224.56/32", + "108.141.83.74/32", + "4.213.46.14/32", + "172.169.17.165/32", + 
"51.8.71.117/32", + "20.3.1.178/32", + "52.149.56.151/32", + "52.149.30.45/32", + "52.149.58.173/32", + "52.143.95.204/32", + "52.149.28.83/32", + "52.149.58.69/32", + "52.148.161.87/32", + "52.149.58.27/32", + "52.149.28.18/32", + "20.79.226.26/32", + "20.79.239.66/32", + "20.79.238.198/32", + "20.113.14.159/32", + "20.75.144.152/32", + "20.43.172.120/32", + "20.53.134.160/32", + "20.201.15.208/32", + "20.93.28.24/32", + "20.61.34.40/32", + "52.242.224.168/32", + "20.80.129.80/32", + "20.195.108.47/32", + "4.195.133.120/32", + "4.228.76.163/32", + "4.182.131.108/32", + "4.209.224.56/32", + "108.141.83.74/32", + "4.213.46.14/32", + "172.169.17.165/32", + "51.8.71.117/32", + "20.3.1.178/32" + ] +- name: qwantbot + user_agent_regex: \+https\://help\.qwant\.com/bot/ + action: ALLOW + # https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json + remote_addresses: [ "91.242.162.0/24" ] +- name: internet-archive + action: ALLOW + # https://ipinfo.io/AS7941 + remote_addresses: [ + "207.241.224.0/20", + "208.70.24.0/21", + "2620:0:9c0::/48" + ] +- name: kagibot + user_agent_regex: \+https\://kagi\.com/bot + action: ALLOW + # https://kagi.com/bot + remote_addresses: [ + "216.18.205.234/32", + "35.212.27.76/32", + "104.254.65.50/32", + "209.151.156.194/32" + ] +- name: marginalia + user_agent_regex: search\.marginalia\.nu + action: ALLOW + # Received directly over email + remote_addresses: [ + "193.183.0.162/31", + "193.183.0.164/30", + "193.183.0.168/30", + "193.183.0.172/31", + "193.183.0.174/32" + ] +- name: mojeekbot + user_agent_regex: http\://www\.mojeek\.com/bot\.html + action: ALLOW + # https://www.mojeek.com/bot.html + remote_addresses: [ "5.102.173.71/32" ] + +# Common "keeping the internet working" routes +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW +- name: favicon + path_regex: ^/favicon.ico$ + action: ALLOW +- name: robots-txt + path_regex: ^/robots.txt$ + action: ALLOW + +# Punish any bot with "bot" in the user-agent string 
+- name: generic-bot-catchall + user_agent_regex: (?i:bot|crawler) + action: CHALLENGE + challenge: + difficulty: 16 # impossible + report_as: 4 # lie to the operator + algorithm: slow # intentionally waste CPU cycles and time + +- name: generic-browser + user_agent_regex: > + Mozilla|Opera + action: CHALLENGE + +dnsbl: false diff --git a/data/embed.go b/data/embed.go index 5a5f4d2..c1fbe68 100644 --- a/data/embed.go +++ b/data/embed.go @@ -3,6 +3,6 @@ package data import "embed" var ( - //go:embed botPolicies.json + //go:embed botPolicies.yaml botPolicies.json BotPolicies embed.FS ) diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 45c1f59..1c634a8 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added example Apache configuration to the documentation [#277](https://github.com/TecharoHQ/anubis/issues/277) - Move per-environment configuration details into their own pages - Added headers support to bot policy rules +- Moved configuration file from JSON to YAML by default - Added documentation on how to use Anubis with Traefik in Docker ## v1.16.0 diff --git a/docs/docs/admin/environments/docker-compose.mdx b/docs/docs/admin/environments/docker-compose.mdx index b40e0ea..6783808 100644 --- a/docs/docs/admin/environments/docker-compose.mdx +++ b/docs/docs/admin/environments/docker-compose.mdx @@ -12,13 +12,13 @@ services: METRICS_BIND: ":9090" SERVE_ROBOTS_TXT: "true" TARGET: "http://nginx" - POLICY_FNAME: "/data/cfg/botPolicy.json" + POLICY_FNAME: "/data/cfg/botPolicy.yaml" OG_PASSTHROUGH: "true" OG_EXPIRY_TIME: "24h" ports: - 8080:8080 volumes: - - "./botPolicy.json:/data/cfg/botPolicy.json:ro" + - "./botPolicy.yaml:/data/cfg/botPolicy.yaml:ro" nginx: image: nginx volumes: diff --git a/docs/docs/admin/installation.mdx b/docs/docs/admin/installation.mdx index 9c88930..2333b1d 100644 --- a/docs/docs/admin/installation.mdx +++ 
b/docs/docs/admin/installation.mdx @@ -62,7 +62,7 @@ Anubis uses these environment variables for configuration: | `METRICS_BIND_NETWORK` | `tcp` | The address family that the Anubis metrics server listens on. See `BIND_NETWORK` for more information. | | `OG_EXPIRY_TIME` | `24h` | The expiration time for the Open Graph tag cache. | | `OG_PASSTHROUGH` | `false` | If set to `true`, Anubis will enable Open Graph tag passthrough. | -| `POLICY_FNAME` | unset | The file containing [bot policy configuration](./policies.md). See the bot policy documentation for more details. If unset, the default bot policy configuration is used. | +| `POLICY_FNAME` | unset | The file containing [bot policy configuration](./policies.mdx). See the bot policy documentation for more details. If unset, the default bot policy configuration is used. | | `SERVE_ROBOTS_TXT` | `false` | If set to `true`, Anubis will serve a default `robots.txt` file that disallows all known AI scrapers by name and then additionally disallows every scraper. This is useful if facts and circumstances make it difficult to change the underlying service to serve such a `robots.txt` file. | | `SOCKET_MODE` | `0770` | _Only used when at least one of the `*_BIND_NETWORK` variables is set to `unix`._ The socket mode (permissions) for Unix domain sockets. | | `TARGET` | `http://localhost:3923` | The URL of the service that Anubis should forward valid requests to. Supports Unix domain sockets; set this to a URI like so: `unix:///path/to/socket.sock`. 
| diff --git a/docs/docs/admin/native-install.mdx b/docs/docs/admin/native-install.mdx index 8faa5cb..a615929 100644 --- a/docs/docs/admin/native-install.mdx +++ b/docs/docs/admin/native-install.mdx @@ -86,20 +86,20 @@ Once it's installed, make a copy of the default configuration file `/etc/anubis/ sudo cp /etc/anubis/default.env /etc/anubis/gitea.env ``` -Copy the default bot policies file to `/etc/anubis/gitea.botPolicies.json`: +Copy the default bot policies file to `/etc/anubis/gitea.botPolicies.yaml`: <Tabs> <TabItem value="debrpm" label="Debian or Red Hat" default> ```text -sudo cp /usr/share/doc/anubis/botPolicies.json /etc/anubis/gitea.botPolicies.json +sudo cp /usr/share/doc/anubis/botPolicies.yaml /etc/anubis/gitea.botPolicies.yaml ``` </TabItem> <TabItem value="tarball" label="Tarball"> ```text -sudo cp ./doc/botPolicies.json /etc/anubis/gitea.botPolicies.json +sudo cp ./doc/botPolicies.yaml /etc/anubis/gitea.botPolicies.yaml ``` </TabItem> @@ -114,7 +114,7 @@ BIND_NETWORK=tcp DIFFICULTY=4 METRICS_BIND=[::1]:8240 METRICS_BIND_NETWORK=tcp -POLICY_FNAME=/etc/anubis/gitea.botPolicies.json +POLICY_FNAME=/etc/anubis/gitea.botPolicies.yaml TARGET=http://localhost:3000 ``` diff --git a/docs/docs/admin/policies.md b/docs/docs/admin/policies.mdx index c4034a3..a5f6f1e 100644 --- a/docs/docs/admin/policies.md +++ b/docs/docs/admin/policies.mdx @@ -2,15 +2,24 @@ title: Policy Definitions --- +import Tabs from "@theme/Tabs"; +import TabItem from "@theme/TabItem"; + Out of the box, Anubis is pretty heavy-handed. It will aggressively challenge everything that might be a browser (usually indicated by having `Mozilla` in its user agent). However, some bots are smart enough to get past the challenge. Some things that look like bots may actually be fine (e.g. RSS readers). Some resources need to be visible no matter what. Some resources and remotes are fine to begin with. 
Bot policies let you customize the rules that Anubis uses to allow, deny, or challenge incoming requests. Currently you can set policies by the following matches: |
