diff options
| author | Xe Iaso <me@xeiaso.net> | 2025-04-22 07:47:33 -0400 |
|---|---|---|
| committer | Xe Iaso <me@xeiaso.net> | 2025-04-22 13:47:09 -0400 |
| commit | 7396ece1d77d5ad29b68a6357c6859ddcab84c7f (patch) | |
| tree | c463f08568f25be3c8858780cca42432e2c428ef | |
| parent | 2db4105479e5920e983b15b0341104d6e572c1ea (diff) | |
| download | anubis-7396ece1d77d5ad29b68a6357c6859ddcab84c7f.tar.xz anubis-7396ece1d77d5ad29b68a6357c6859ddcab84c7f.zip | |
feat(config): support importing bot policy snippets
This changes the grammar of the Anubis bot policy config to allow
importing rules from the built-in shared rule snippets or from external
rule files on the filesystem.
This lets you create a file at `/data/policies/block-evilbot.yaml` and
then import it with:
```yaml
bots:
- import: /data/policies/block-evilbot.yaml
```
This also explodes the default policy file into a bunch of composable
snippets.
Thank you @Aibrew for your example Gitea Atom / RSS feed rules!
Signed-off-by: Xe Iaso <me@xeiaso.net>
28 files changed, 996 insertions, 659 deletions
diff --git a/data/apps/gitea-rss-feeds.yaml b/data/apps/gitea-rss-feeds.yaml new file mode 100644 index 0000000..7bd34ce --- /dev/null +++ b/data/apps/gitea-rss-feeds.yaml @@ -0,0 +1,7 @@ +# By Aibrew: https://github.com/TecharoHQ/anubis/discussions/261#discussioncomment-12821065 +- name: gitea-feed-atom + action: ALLOW + path_regex: ^/[.A-Za-z0-9_-]{1,256}?[./A-Za-z0-9_-]*\.atom$ +- name: gitea-feed-rss + action: ALLOW + path_regex: ^/[.A-Za-z0-9_-]{1,256}?[./A-Za-z0-9_-]*\.rss$
\ No newline at end of file diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml index cb4715a..0378b5e 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -1,651 +1,25 @@ bots: # Pathological bots to deny -- name: us-artificial-intelligence-scraper - user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper - action: DENY -- name: lightpanda - user_agent_regex: ^LightPanda/.*$ - action: DENY -- name: headless-chrome - user_agent_regex: HeadlessChrome - action: DENY -- name: headless-chromium - user_agent_regex: HeadlessChromium - action: DENY -- name: "ai-robots-txt" - user_agent_regex: > - AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot - action: DENY -- name: cloudflare-workers - headers_regex: - CF-Worker: .* - action: DENY +- import: "(data)/bots/ai-robots-txt.yaml" +- import: "(data)/bots/cloudflare-workers.yaml" +- import: "(data)/bots/headless-browsers.yaml" +- import: "(data)/bots/us-ai-scraper.yaml" -# search engines to allow -- name: googlebot - user_agent_regex: \+http\://www\.google\.com/bot\.html - action: ALLOW - # https://developers.google.com/static/search/apis/ipranges/googlebot.json - remote_addresses: [ - "2001:4860:4801:10::/64", - "2001:4860:4801:11::/64", - "2001:4860:4801:12::/64", - "2001:4860:4801:13::/64", - "2001:4860:4801:14::/64", - "2001:4860:4801:15::/64", - "2001:4860:4801:16::/64", - "2001:4860:4801:17::/64", - 
"2001:4860:4801:18::/64", - "2001:4860:4801:19::/64", - "2001:4860:4801:1a::/64", - "2001:4860:4801:1b::/64", - "2001:4860:4801:1c::/64", - "2001:4860:4801:1d::/64", - "2001:4860:4801:1e::/64", - "2001:4860:4801:1f::/64", - "2001:4860:4801:20::/64", - "2001:4860:4801:21::/64", - "2001:4860:4801:22::/64", - "2001:4860:4801:23::/64", - "2001:4860:4801:24::/64", - "2001:4860:4801:25::/64", - "2001:4860:4801:26::/64", - "2001:4860:4801:27::/64", - "2001:4860:4801:28::/64", - "2001:4860:4801:29::/64", - "2001:4860:4801:2::/64", - "2001:4860:4801:2a::/64", - "2001:4860:4801:2b::/64", - "2001:4860:4801:2c::/64", - "2001:4860:4801:2d::/64", - "2001:4860:4801:2e::/64", - "2001:4860:4801:2f::/64", - "2001:4860:4801:31::/64", - "2001:4860:4801:32::/64", - "2001:4860:4801:33::/64", - "2001:4860:4801:34::/64", - "2001:4860:4801:35::/64", - "2001:4860:4801:36::/64", - "2001:4860:4801:37::/64", - "2001:4860:4801:38::/64", - "2001:4860:4801:39::/64", - "2001:4860:4801:3a::/64", - "2001:4860:4801:3b::/64", - "2001:4860:4801:3c::/64", - "2001:4860:4801:3d::/64", - "2001:4860:4801:3e::/64", - "2001:4860:4801:40::/64", - "2001:4860:4801:41::/64", - "2001:4860:4801:42::/64", - "2001:4860:4801:43::/64", - "2001:4860:4801:44::/64", - "2001:4860:4801:45::/64", - "2001:4860:4801:46::/64", - "2001:4860:4801:47::/64", - "2001:4860:4801:48::/64", - "2001:4860:4801:49::/64", - "2001:4860:4801:4a::/64", - "2001:4860:4801:4b::/64", - "2001:4860:4801:4c::/64", - "2001:4860:4801:50::/64", - "2001:4860:4801:51::/64", - "2001:4860:4801:52::/64", - "2001:4860:4801:53::/64", - "2001:4860:4801:54::/64", - "2001:4860:4801:55::/64", - "2001:4860:4801:56::/64", - "2001:4860:4801:60::/64", - "2001:4860:4801:61::/64", - "2001:4860:4801:62::/64", - "2001:4860:4801:63::/64", - "2001:4860:4801:64::/64", - "2001:4860:4801:65::/64", - "2001:4860:4801:66::/64", - "2001:4860:4801:67::/64", - "2001:4860:4801:68::/64", - "2001:4860:4801:69::/64", - "2001:4860:4801:6a::/64", - "2001:4860:4801:6b::/64", - 
"2001:4860:4801:6c::/64", - "2001:4860:4801:6d::/64", - "2001:4860:4801:6e::/64", - "2001:4860:4801:6f::/64", - "2001:4860:4801:70::/64", - "2001:4860:4801:71::/64", - "2001:4860:4801:72::/64", - "2001:4860:4801:73::/64", - "2001:4860:4801:74::/64", - "2001:4860:4801:75::/64", - "2001:4860:4801:76::/64", - "2001:4860:4801:77::/64", - "2001:4860:4801:78::/64", - "2001:4860:4801:79::/64", - "2001:4860:4801:80::/64", - "2001:4860:4801:81::/64", - "2001:4860:4801:82::/64", - "2001:4860:4801:83::/64", - "2001:4860:4801:84::/64", - "2001:4860:4801:85::/64", - "2001:4860:4801:86::/64", - "2001:4860:4801:87::/64", - "2001:4860:4801:88::/64", - "2001:4860:4801:90::/64", - "2001:4860:4801:91::/64", - "2001:4860:4801:92::/64", - "2001:4860:4801:93::/64", - "2001:4860:4801:94::/64", - "2001:4860:4801:95::/64", - "2001:4860:4801:96::/64", - "2001:4860:4801:a0::/64", - "2001:4860:4801:a1::/64", - "2001:4860:4801:a2::/64", - "2001:4860:4801:a3::/64", - "2001:4860:4801:a4::/64", - "2001:4860:4801:a5::/64", - "2001:4860:4801:c::/64", - "2001:4860:4801:f::/64", - "192.178.5.0/27", - "192.178.6.0/27", - "192.178.6.128/27", - "192.178.6.160/27", - "192.178.6.192/27", - "192.178.6.32/27", - "192.178.6.64/27", - "192.178.6.96/27", - "34.100.182.96/28", - "34.101.50.144/28", - "34.118.254.0/28", - "34.118.66.0/28", - "34.126.178.96/28", - "34.146.150.144/28", - "34.147.110.144/28", - "34.151.74.144/28", - "34.152.50.64/28", - "34.154.114.144/28", - "34.155.98.32/28", - "34.165.18.176/28", - "34.175.160.64/28", - "34.176.130.16/28", - "34.22.85.0/27", - "34.64.82.64/28", - "34.65.242.112/28", - "34.80.50.80/28", - "34.88.194.0/28", - "34.89.10.80/28", - "34.89.198.80/28", - "34.96.162.48/28", - "35.247.243.240/28", - "66.249.64.0/27", - "66.249.64.128/27", - "66.249.64.160/27", - "66.249.64.224/27", - "66.249.64.32/27", - "66.249.64.64/27", - "66.249.64.96/27", - "66.249.65.0/27", - "66.249.65.128/27", - "66.249.65.160/27", - "66.249.65.192/27", - "66.249.65.224/27", - "66.249.65.32/27", 
- "66.249.65.64/27", - "66.249.65.96/27", - "66.249.66.0/27", - "66.249.66.128/27", - "66.249.66.160/27", - "66.249.66.192/27", - "66.249.66.224/27", - "66.249.66.32/27", - "66.249.66.64/27", - "66.249.66.96/27", - "66.249.68.0/27", - "66.249.68.128/27", - "66.249.68.32/27", - "66.249.68.64/27", - "66.249.68.96/27", - "66.249.69.0/27", - "66.249.69.128/27", - "66.249.69.160/27", - "66.249.69.192/27", - "66.249.69.224/27", - "66.249.69.32/27", - "66.249.69.64/27", - "66.249.69.96/27", - "66.249.70.0/27", - "66.249.70.128/27", - "66.249.70.160/27", - "66.249.70.192/27", - "66.249.70.224/27", - "66.249.70.32/27", - "66.249.70.64/27", - "66.249.70.96/27", - "66.249.71.0/27", - "66.249.71.128/27", - "66.249.71.160/27", - "66.249.71.192/27", - "66.249.71.224/27", - "66.249.71.32/27", - "66.249.71.64/27", - "66.249.71.96/27", - "66.249.72.0/27", - "66.249.72.128/27", - "66.249.72.160/27", - "66.249.72.192/27", - "66.249.72.224/27", - "66.249.72.32/27", - "66.249.72.64/27", - "66.249.72.96/27", - "66.249.73.0/27", - "66.249.73.128/27", - "66.249.73.160/27", - "66.249.73.192/27", - "66.249.73.224/27", - "66.249.73.32/27", - "66.249.73.64/27", - "66.249.73.96/27", - "66.249.74.0/27", - "66.249.74.128/27", - "66.249.74.160/27", - "66.249.74.192/27", - "66.249.74.32/27", - "66.249.74.64/27", - "66.249.74.96/27", - "66.249.75.0/27", - "66.249.75.128/27", - "66.249.75.160/27", - "66.249.75.192/27", - "66.249.75.224/27", - "66.249.75.32/27", - "66.249.75.64/27", - "66.249.75.96/27", - "66.249.76.0/27", - "66.249.76.128/27", - "66.249.76.160/27", - "66.249.76.192/27", - "66.249.76.224/27", - "66.249.76.32/27", - "66.249.76.64/27", - "66.249.76.96/27", - "66.249.77.0/27", - "66.249.77.128/27", - "66.249.77.160/27", - "66.249.77.192/27", - "66.249.77.224/27", - "66.249.77.32/27", - "66.249.77.64/27", - "66.249.77.96/27", - "66.249.78.0/27", - "66.249.78.32/27", - "66.249.79.0/27", - "66.249.79.128/27", - "66.249.79.160/27", - "66.249.79.192/27", - "66.249.79.224/27", - 
"66.249.79.32/27", - "66.249.79.64/27", - "66.249.79.96/27" - ] -- name: bingbot - user_agent_regex: \+http\://www\.bing\.com/bingbot\.htm - action: ALLOW - # https://www.bing.com/toolbox/bingbot.json - remote_addresses: [ - "157.55.39.0/24", - "207.46.13.0/24", - "40.77.167.0/24", - "13.66.139.0/24", - "13.66.144.0/24", - "52.167.144.0/24", - "13.67.10.16/28", - "13.69.66.240/28", - "13.71.172.224/28", - "139.217.52.0/28", - "191.233.204.224/28", - "20.36.108.32/28", - "20.43.120.16/28", - "40.79.131.208/28", - "40.79.186.176/28", - "52.231.148.0/28", - "20.79.107.240/28", - "51.105.67.0/28", - "20.125.163.80/28", - "40.77.188.0/22", - "65.55.210.0/24", - "199.30.24.0/23", - "40.77.202.0/24", - "40.77.139.0/25", - "20.74.197.0/28", - "20.15.133.160/27", - "40.77.177.0/24", - "40.77.178.0/23" - ] -- name: duckduckbot - user_agent_regex: DuckDuckBot/1\.1; \(\+http\://duckduckgo\.com/duckduckbot\.html\) - action: ALLOW - # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot - remote_addresses: [ - "57.152.72.128/32", - "51.8.253.152/32", - "40.80.242.63/32", - "20.12.141.99/32", - "20.49.136.28/32", - "51.116.131.221/32", - "51.107.40.209/32", - "20.40.133.240/32", - "20.50.168.91/32", - "51.120.48.122/32", - "20.193.45.113/32", - "40.76.173.151/32", - "40.76.163.7/32", - "20.185.79.47/32", - "52.142.26.175/32", - "20.185.79.15/32", - "52.142.24.149/32", - "40.76.162.208/32", - "40.76.163.23/32", - "40.76.162.191/32", - "40.76.162.247/32", - "40.88.21.235/32", - "20.191.45.212/32", - "52.146.59.12/32", - "52.146.59.156/32", - "52.146.59.154/32", - "52.146.58.236/32", - "20.62.224.44/32", - "51.104.180.53/32", - "51.104.180.47/32", - "51.104.180.26/32", - "51.104.146.225/32", - "51.104.146.235/32", - "20.73.202.147/32", - "20.73.132.240/32", - "20.71.12.143/32", - "20.56.197.58/32", - "20.56.197.63/32", - "20.43.150.93/32", - "20.43.150.85/32", - "20.44.222.1/32", - "40.89.243.175/32", - "13.89.106.77/32", - "52.143.242.6/32", - "52.143.241.111/32", - 
"52.154.60.82/32", - "20.197.209.11/32", - "20.197.209.27/32", - "20.226.133.105/32", - "191.234.216.4/32", - "191.234.216.178/32", - "20.53.92.211/32", - "20.53.91.2/32", - "20.207.99.197/32", - "20.207.97.190/32", - "40.81.250.205/32", - "40.64.106.11/32", - "40.64.105.247/32", - "20.72.242.93/32", - "20.99.255.235/32", - "20.113.3.121/32", - "52.224.16.221/32", - "52.224.21.53/32", - "52.224.20.204/32", - "52.224.21.19/32", - "52.224.20.249/32", - "52.224.20.203/32", - "52.224.20.190/32", - "52.224.16.229/32", - "52.224.21.20/32", - "52.146.63.80/32", - "52.224.20.227/32", - "52.224.20.193/32", - "52.190.37.160/32", - "52.224.21.23/32", - "52.224.20.223/32", - "52.224.20.181/32", - "52.224.21.49/32", - "52.224.21.55/32", - "52.224.21.61/32", - "52.224.19.152/32", - "52.224.20.186/32", - "52.224.21.27/32", - "52.224.21.51/32", - "52.224.20.174/32", - "52.224.21.4/32", - "51.104.164.109/32", - "51.104.167.71/32", - "51.104.160.177/32", - "51.104.162.149/32", - "51.104.167.95/32", - "51.104.167.54/32", - "51.104.166.111/32", - "51.104.167.88/32", - "51.104.161.32/32", - "51.104.163.250/32", - "51.104.164.189/32", - "51.104.167.19/32", - "51.104.160.167/32", - "51.104.167.110/32", - "20.191.44.119/32", - "51.104.167.104/32", - "20.191.44.234/32", - "51.104.164.215/32", - "51.104.167.52/32", - "20.191.44.22/32", - "51.104.167.87/32", - "51.104.167.96/32", - "20.191.44.16/32", - "51.104.167.61/32", - "51.104.164.147/32", - "20.50.48.159/32", - "40.114.182.172/32", - "20.50.50.130/32", - "20.50.50.163/32", - "20.50.50.46/32", - "40.114.182.153/32", - "20.50.50.118/32", - "20.50.49.55/32", - "20.50.49.25/32", - "40.114.183.251/32", - "20.50.50.123/32", - "20.50.49.237/32", - "20.50.48.192/32", - "20.50.50.134/32", - "51.138.90.233/32", - "40.114.183.196/32", - "20.50.50.146/32", - "40.114.183.88/32", - "20.50.50.145/32", - "20.50.50.121/32", - "20.50.49.40/32", - "51.138.90.206/32", - "40.114.182.45/32", - "51.138.90.161/32", - "20.50.49.0/32", - "40.119.232.215/32", - 
"104.43.55.167/32", - "40.119.232.251/32", - "40.119.232.50/32", - "40.119.232.146/32", - "40.119.232.218/32", - "104.43.54.127/32", - "104.43.55.117/32", - "104.43.55.116/32", - "104.43.55.166/32", - "52.154.169.50/32", - "52.154.171.70/32", - "52.154.170.229/32", - "52.154.170.113/32", - "52.154.171.44/32", - "52.154.172.2/32", - "52.143.244.81/32", - "52.154.171.87/32", - "52.154.171.250/32", - "52.154.170.28/32", - "52.154.170.122/32", - "52.143.243.117/32", - "52.143.247.235/32", - "52.154.171.235/32", - "52.154.171.196/32", - "52.154.171.0/32", - "52.154.170.243/32", - "52.154.170.26/32", - "52.154.169.200/32", - "52.154.170.96/32", - "52.154.170.88/32", - "52.154.171.150/32", - "52.154.171.205/32", - "52.154.170.117/32", - "52.154.170.209/32", - "191.235.202.48/32", - "191.233.3.202/32", - "191.235.201.214/32", - "191.233.3.197/32", - "191.235.202.38/32", - "20.53.78.144/32", - "20.193.24.10/32", - "20.53.78.236/32", - "20.53.78.138/32", - "20.53.78.123/32", - "20.53.78.106/32", - "20.193.27.215/32", - "20.193.25.197/32", - "20.193.12.126/32", - "20.193.24.251/32", - "20.204.242.101/32", - "20.207.72.113/32", - "20.204.242.19/32", - "20.219.45.67/32", - "20.207.72.11/32", - "20.219.45.190/32", - "20.204.243.55/32", - "20.204.241.148/32", - "20.207.72.110/32", - "20.204.240.172/32", - "20.207.72.21/32", - "20.204.246.81/32", - "20.207.107.181/32", - "20.204.246.254/32", - "20.219.43.246/32", - "52.149.25.43/32", - "52.149.61.51/32", - "52.149.58.139/32", - "52.149.60.38/32", - "52.148.165.38/32", - "52.143.95.162/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - 
"20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32", - "52.149.56.151/32", - "52.149.30.45/32", - "52.149.58.173/32", - "52.143.95.204/32", - "52.149.28.83/32", - "52.149.58.69/32", - "52.148.161.87/32", - "52.149.58.27/32", - "52.149.28.18/32", - "20.79.226.26/32", - "20.79.239.66/32", - "20.79.238.198/32", - "20.113.14.159/32", - "20.75.144.152/32", - "20.43.172.120/32", - "20.53.134.160/32", - "20.201.15.208/32", - "20.93.28.24/32", - "20.61.34.40/32", - "52.242.224.168/32", - "20.80.129.80/32", - "20.195.108.47/32", - "4.195.133.120/32", - "4.228.76.163/32", - "4.182.131.108/32", - "4.209.224.56/32", - "108.141.83.74/32", - "4.213.46.14/32", - "172.169.17.165/32", - "51.8.71.117/32", - "20.3.1.178/32" - ] -- name: qwantbot - user_agent_regex: \+https\://help\.qwant\.com/bot/ - action: ALLOW - # https://help.qwant.com/wp-content/uploads/sites/2/2025/01/qwantbot.json - remote_addresses: [ "91.242.162.0/24" ] -- name: internet-archive - action: ALLOW - # https://ipinfo.io/AS7941 - remote_addresses: [ - "207.241.224.0/20", - "208.70.24.0/21", - "2620:0:9c0::/48" - ] -- name: kagibot - user_agent_regex: \+https\://kagi\.com/bot - action: ALLOW - # https://kagi.com/bot - remote_addresses: [ - "216.18.205.234/32", - "35.212.27.76/32", - "104.254.65.50/32", - "209.151.156.194/32" - ] -- name: marginalia - user_agent_regex: search\.marginalia\.nu - action: ALLOW - # Received directly over email - remote_addresses: [ - "193.183.0.162/31", - "193.183.0.164/30", - "193.183.0.168/30", - "193.183.0.172/31", - "193.183.0.174/32" - ] -- name: mojeekbot - user_agent_regex: http\://www\.mojeek\.com/bot\.html - action: ALLOW - # https://www.mojeek.com/bot.html - remote_addresses: [ "5.102.173.71/32" ] +# Search engines to allow +- import: "(data)/crawlers/googlebot.yaml" +- import: 
"(data)/crawlers/bingbot.yaml" +- import: "(data)/crawlers/duckduckbot.yaml" +- import: "(data)/crawlers/qwantbot.yaml" +- import: "(data)/crawlers/internet-archive.yaml" +- import: "(data)/crawlers/kagibot.yaml" +- import: "(data)/crawlers/marginalia.yaml" +- import: "(data)/crawlers/mojeekbot.yaml" -# Common "keeping the internet working" routes -- name: well-known - path_regex: ^/.well-known/.*$ - action: ALLOW -- name: favicon - path_regex: ^/favicon.ico$ - action: ALLOW -- name: robots-txt - path_regex: ^/robots.txt$ - action: ALLOW +# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt) +- import: "(data)/common/keep-internet-working.yaml" # # Punish any bot with "bot" in the user-agent string +# # This is known to have a high false-positive rate, use at your own risk # - name: generic-bot-catchall # user_agent_regex: (?i:bot|crawler) # action: CHALLENGE @@ -654,6 +28,8 @@ bots: # report_as: 4 # lie to the operator # algorithm: slow # intentionally waste CPU cycles and time + +# Generic catchall rule - name: generic-browser user_agent_regex: > Mozilla|Opera diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml new file mode 100644 index 0000000..19cbe93 --- /dev/null +++ b/data/bots/ai-robots-txt.yaml @@ -0,0 +1,4 @@ +- name: "ai-robots-txt" + user_agent_regex: > + AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot + action: DENY
\ No newline at end of file diff --git a/data/bots/cloudflare-workers.yaml b/data/bots/cloudflare-workers.yaml new file mode 100644 index 0000000..3fe051b --- /dev/null +++ b/data/bots/cloudflare-workers.yaml @@ -0,0 +1,4 @@ +- name: cloudflare-workers + headers_regex: + CF-Worker: .* + action: DENY
\ No newline at end of file diff --git a/data/bots/headless-browsers.yaml b/data/bots/headless-browsers.yaml new file mode 100644 index 0000000..9805290 --- /dev/null +++ b/data/bots/headless-browsers.yaml @@ -0,0 +1,9 @@ +- name: lightpanda + user_agent_regex: ^LightPanda/.*$ + action: DENY +- name: headless-chrome + user_agent_regex: HeadlessChrome + action: DENY +- name: headless-chromium + user_agent_regex: HeadlessChromium + action: DENY
\ No newline at end of file diff --git a/data/bots/us-ai-scraper.yaml b/data/bots/us-ai-scraper.yaml new file mode 100644 index 0000000..b68920f --- /dev/null +++ b/data/bots/us-ai-scraper.yaml @@ -0,0 +1,3 @@ +- name: us-artificial-intelligence-scraper + user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper + action: DENY
\ No newline at end of file diff --git a/data/common/allow-private-addresses.yaml b/data/common/allow-private-addresses.yaml new file mode 100644 index 0000000..3a3c0dc --- /dev/null +++ b/data/common/allow-private-addresses.yaml @@ -0,0 +1,15 @@ +- name: ipv4-rfc-1918 + action: ALLOW + remote_addresses: + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + - 100.64.0.0/10 +- name: ipv6-ula + action: ALLOW + remote_addresses: + - fc00::/7 +- name: ipv6-link-local + action: ALLOW + remote_addresses: + - fe80::/10
\ No newline at end of file diff --git a/data/common/keep-internet-working.yaml b/data/common/keep-internet-working.yaml new file mode 100644 index 0000000..8270ef4 --- /dev/null +++ b/data/common/keep-internet-working.yaml @@ -0,0 +1,10 @@ +# Common "keeping the internet working" routes +- name: well-known + path_regex: ^/.well-known/.*$ + action: ALLOW +- name: favicon + path_regex: ^/favicon.ico$ + action: ALLOW +- name: robots-txt + path_regex: ^/robots.txt$ + action: ALLOW
\ No newline at end of file diff --git a/data/crawlers/bingbot.yaml b/data/crawlers/bingbot.yaml new file mode 100644 index 0000000..2f7885d --- /dev/null +++ b/data/crawlers/bingbot.yaml @@ -0,0 +1,34 @@ +- name: bingbot + user_agent_regex: \+http\://www\.bing\.com/bingbot\.htm + action: ALLOW + # https://www.bing.com/toolbox/bingbot.json + remote_addresses: [ + "157.55.39.0/24", + "207.46.13.0/24", + "40.77.167.0/24", + "13.66.139.0/24", + "13.66.144.0/24", + "52.167.144.0/24", + "13.67.10.16/28", + "13.69.66.240/28", + "13.71.172.224/28", + "139.217.52.0/28", + "191.233.204.224/28", + "20.36.108.32/28", + "20.43.120.16/28", + "40.79.131.208/28", + "40.79.186.176/28", + "52.231.148.0/28", + "20.79.107.240/28", + "51.105.67.0/28", + "20.125.163.80/28", + "40.77.188.0/22", + "65.55.210.0/24", + "199.30.24.0/23", + "40.77.202.0/24", + "40.77.139.0/25", + "20.74.197.0/28", + "20.15.133.160/27", + "40.77.177.0/24", + "40.77.178.0/23" + ] diff --git a/data/crawlers/duckduckbot.yaml b/data/crawlers/duckduckbot.yaml new file mode 100644 index 0000000..302a1e3 --- /dev/null +++ b/data/crawlers/duckduckbot.yaml @@ -0,0 +1,275 @@ +- name: duckduckbot + user_agent_regex: DuckDuckBot/1\.1; \(\+http\://duckduckgo\.com/duckduckbot\.html\) + action: ALLOW + # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot + r |
