aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDryusdan <dryusdan@dryusdan.fr>2025-04-19 00:28:56 +0200
committerGitHub <noreply@github.com>2025-04-18 22:28:56 +0000
commita40c5e99fc182b6f2fbff5c3f94d9f46c1abce45 (patch)
treedb20e43233c4fa841d0813e82540b93eac7778a3
parentaf831f0d7fa6ed23a7818771e7d2a492c9329dc6 (diff)
downloadanubis-a40c5e99fc182b6f2fbff5c3f94d9f46c1abce45.tar.xz
anubis-a40c5e99fc182b6f2fbff5c3f94d9f46c1abce45.zip
Add more AI user agents in botPolicies.json (#249)
* Add more AI user agents in bot policies * Update data/botPolicies.json Signed-off-by: Xe Iaso <me@xeiaso.net> * Fix trailing pipe that denies all requests --------- Signed-off-by: Xe Iaso <me@xeiaso.net> Co-authored-by: Xe Iaso <me@xeiaso.net>
-rw-r--r--data/botPolicies.json4
-rw-r--r--docs/docs/CHANGELOG.md1
-rw-r--r--internal/test/playwright_test.go18
3 files changed, 21 insertions, 2 deletions
diff --git a/data/botPolicies.json b/data/botPolicies.json
index 7d6e4cb..dbc3d35 100644
--- a/data/botPolicies.json
+++ b/data/botPolicies.json
@@ -1,8 +1,8 @@
{
"bots": [
{
- "name": "amazonbot",
- "user_agent_regex": "Amazonbot",
+ "name": "ai-robots-txt",
+ "user_agent_regex": "AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot",
"action": "DENY"
},
{
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
index ebcd35c..94cf468 100644
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+- Add more AI user agents based on the [ai.robots.txt](https://github.com/ai-robots-txt/ai.robots.txt) project
- Embedded challenge data in initial HTML response to improve performance
- Whitelisted [DuckDuckBot](https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/) in botPolicies
- Improvements to build scripts to make them less dependent on the build host
diff --git a/internal/test/playwright_test.go b/internal/test/playwright_test.go
index 7859b71..69652ce 100644
--- a/internal/test/playwright_test.go
+++ b/internal/test/playwright_test.go
@@ -53,6 +53,24 @@ var (
userAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/120.0.6099.28 Safari/537.36",
},
{
+ name: "Amazonbot",
+ action: actionDeny,
+ realIP: placeholderIP,
+ userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)",
+ },
+ {
+ name: "Amazonbot",
+ action: actionDeny,
+ realIP: placeholderIP,
+ userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)",
+ },
+ {
+ name: "PerplexityAI",
+ action: actionDeny,
+ realIP: placeholderIP,
+ userAgent: "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
+ },
+ {
name: "kagiBadIP",
action: actionChallenge,
isHard: true,