aboutsummaryrefslogtreecommitdiff
path: root/data/botPolicies.yaml
blob: 51af499eb2df683a73a487bb333940fbb628c014 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
## Anubis has the ability to let you import snippets of configuration into the main
## configuration file. This allows you to break up your config into smaller parts
## that get logically assembled into one big file.
##
## Of note, a bot rule can either have inline bot configuration or import a
## bot config snippet. You cannot do both in a single bot rule.
##
## Import paths are either prefixed with (data), which imports from the common/shared
## rules in the data folder of the Anubis source tree, or they point to absolute/relative
## paths on your filesystem. If you don't have access to the Anubis source tree, check
## /usr/share/docs/anubis/data or the tarball you extracted Anubis from.

# List of bot rules. Each entry either imports a configuration snippet or
# defines the rule inline; a single entry cannot do both.
bots:
# Pathological bots to deny
- # This correlates to data/bots/ai-robots-txt.yaml in the source tree
  import: (data)/bots/ai-robots-txt.yaml
- import: (data)/bots/cloudflare-workers.yaml
- import: (data)/bots/headless-browsers.yaml
- import: (data)/bots/us-ai-scraper.yaml

# Search engines to allow
- import: (data)/crawlers/googlebot.yaml
- import: (data)/crawlers/bingbot.yaml
- import: (data)/crawlers/duckduckbot.yaml
- import: (data)/crawlers/qwantbot.yaml
- import: (data)/crawlers/internet-archive.yaml
- import: (data)/crawlers/kagibot.yaml
- import: (data)/crawlers/marginalia.yaml
- import: (data)/crawlers/mojeekbot.yaml

# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
- import: (data)/common/keep-internet-working.yaml

# # Punish any client with "bot" or "crawler" (case-insensitive) in the user-agent string
# # This is known to have a high false-positive rate, use at your own risk
# - name: generic-bot-catchall
#   user_agent_regex: (?i:bot|crawler)
#   action: CHALLENGE
#   challenge:
#     difficulty: 16  # impossible
#     report_as: 4    # lie to the operator
#     algorithm: slow # intentionally waste CPU cycles and time

# Generic catchall rule: challenge any client whose user agent matches
# "Mozilla" or "Opera".
- name: generic-browser
  # Folded block scalar with strip chomping (>-): the value is exactly
  # Mozilla|Opera, with no trailing newline.
  user_agent_regex: >-
    Mozilla|Opera
  action: CHALLENGE

# NOTE(review): presumably toggles DNS blocklist (DNSBL) lookups, which are
# disabled here — confirm against the Anubis documentation.
dnsbl: false