From 74e11505c6133ee1107811e81a0fd53e1d7876dd Mon Sep 17 00:00:00 2001
From: Xe Iaso
Date: Wed, 23 Apr 2025 07:01:28 -0400
Subject: feat: enable loading config fragments (#321)

* feat(config): support importing bot policy snippets

This changes the grammar of the Anubis bot policy config to allow
importing from internal shared rules or external rules on the
filesystem. This lets you create a file at
`/data/policies/block-evilbot.yaml` and then import it with:

```yaml
bots:
  - import: /data/policies/block-evilbot.yaml
```

This also explodes the default policy file into a bunch of composable
snippets.

Thank you @Aibrew for your example gitea Atom / RSS feed rules!

Signed-off-by: Xe Iaso

* fix(data): update botPolicies.json to use imports

Signed-off-by: Xe Iaso

* fix(cmd/anubis): extract bot policies with --extract-resources

This allows a user that doesn't have anything but the Anubis binary to
figure out what the default configuration does.

* docs(data/botPolicies.yaml): document import syntax in-line

Signed-off-by: Xe Iaso

* fix(lib/policy): better test importing from JSON snippets

Signed-off-by: Xe Iaso

* docs(admin): Add import syntax documentation

This documents the import syntax and is based on the block comment at
the top of the default bot policy file.

* docs(changelog): add note about importing snippets

Signed-off-by: Xe Iaso

* style(lib/policy/config): use an error value instead of an inline error

Signed-off-by: Xe Iaso

---------

Signed-off-by: Xe Iaso
---
 data/bots/ai-robots-txt.yaml      | 5 +++++
 data/bots/cloudflare-workers.yaml | 4 ++++
 data/bots/headless-browsers.yaml  | 9 +++++++++
 data/bots/us-ai-scraper.yaml      | 3 +++
 4 files changed, 21 insertions(+)
 create mode 100644 data/bots/ai-robots-txt.yaml
 create mode 100644 data/bots/cloudflare-workers.yaml
 create mode 100644 data/bots/headless-browsers.yaml
 create mode 100644 data/bots/us-ai-scraper.yaml

(limited to 'data/bots')

diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml
new file mode 100644
index 0000000..19cbe93
--- /dev/null
+++ b/data/bots/ai-robots-txt.yaml
@@ -0,0 +1,5 @@
+- name: "ai-robots-txt"
+  # ">-" strips the folded scalar's trailing newline so the last alternative is not "YouBot\n".
+  user_agent_regex: >-
+    AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot
+  action: DENY
diff --git a/data/bots/cloudflare-workers.yaml b/data/bots/cloudflare-workers.yaml
new file mode 100644
index 0000000..3fe051b
--- /dev/null
+++ b/data/bots/cloudflare-workers.yaml
@@ -0,0 +1,4 @@
+- name: cloudflare-workers
+  headers_regex:
+    CF-Worker: .*
+  action: DENY
diff --git a/data/bots/headless-browsers.yaml b/data/bots/headless-browsers.yaml
new file mode 100644
index 0000000..9805290
--- /dev/null
+++ b/data/bots/headless-browsers.yaml
@@ -0,0 +1,9 @@
+- name: lightpanda
+  user_agent_regex: ^LightPanda/.*$
+  action: DENY
+- name: headless-chrome
+  user_agent_regex: HeadlessChrome
+  action: DENY
+- name: headless-chromium
+  user_agent_regex: HeadlessChromium
+  action: DENY
diff --git a/data/bots/us-ai-scraper.yaml b/data/bots/us-ai-scraper.yaml
new file mode 100644
index 0000000..b68920f
--- /dev/null
+++ b/data/bots/us-ai-scraper.yaml
@@ -0,0 +1,3 @@
+- name: us-artificial-intelligence-scraper
+  user_agent_regex: \+https\://github\.com/US-Artificial-Intelligence/scraper
+  action: DENY
-- 
cgit v1.2.3