aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorXe Iaso <me@xeiaso.net>2025-04-26 10:01:15 -0400
committerGitHub <noreply@github.com>2025-04-26 14:01:15 +0000
commitef52550e70a44c318dca8a406750589f67fac0eb (patch)
treed00e403f38960ab9f86c8dadf41907099ecad391
parentc669b47b570d222a9a902705adeff8fb26c989c4 (diff)
downloadanubis-ef52550e70a44c318dca8a406750589f67fac0eb.tar.xz
anubis-ef52550e70a44c318dca8a406750589f67fac0eb.zip
fix(config): remove trailing newlines in regexes (#373)v1.17.0-beta31.17.0-beta2
Closes #372 Fun YAML fact of the day: What is the difference between how these two expressions are parsed? ```yaml foo: > bar ``` ```yaml foo: >- bar ``` They are invisible in yaml, but when you evaluate them to JSON the difference is obvious: ```json { "foo": "bar\n" } ``` ```json { "foo": "bar" } ``` User-Agent strings, URL path values, and HTTP headers _do_ end in newlines in HTTP/1.1 wire form, but that newline is usually stripped before the server actually handles it. Also HTTP/2 is a thing and does not terminate header values with newlines. This change makes Anubis more aggressively detect mistaken uses of the yaml `>` operator and nudges the user into using the yaml `>-` operator which does not append the trailing newline. I had honestly forgotten about this YAML behavior because it wasn't relevant for so long. Oops! Glad I released a beta. Whenever you get into this state, Anubis will throw a config parsing error and then give you a message hinting at the folly of your ways. ``` config.Bot: regular expression ends with newline (try >- instead of > in yaml) ``` Big thanks to https://yaml-multiline.info, this helped me realize my folly instantly. @aiverson, this is official permission to say "told you so". Signed-off-by: Xe Iaso <me@xeiaso.net>
-rw-r--r--data/botPolicies.json2
-rw-r--r--data/botPolicies.yaml2
-rw-r--r--data/bots/ai-robots-txt.yaml2
-rw-r--r--docs/docs/CHANGELOG.md1
-rw-r--r--lib/policy/checker.go12
-rw-r--r--lib/policy/config/config.go13
-rw-r--r--lib/policy/config/testdata/bad/regex_ends_newline.json21
-rw-r--r--lib/policy/config/testdata/bad/regex_ends_newline.yaml17
8 files changed, 61 insertions, 9 deletions
diff --git a/data/botPolicies.json b/data/botPolicies.json
index dad04e8..160bbf0 100644
--- a/data/botPolicies.json
+++ b/data/botPolicies.json
@@ -41,7 +41,7 @@
},
{
"name": "generic-browser",
- "user_agent_regex": "Mozilla|Opera\n",
+ "user_agent_regex": "Mozilla|Opera",
"action": "CHALLENGE"
}
],
diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml
index 585be15..51af499 100644
--- a/data/botPolicies.yaml
+++ b/data/botPolicies.yaml
@@ -43,7 +43,7 @@ bots:
# Generic catchall rule
- name: generic-browser
- user_agent_regex: >
+ user_agent_regex: >-
Mozilla|Opera
action: CHALLENGE
diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml
index 19cbe93..ef2790c 100644
--- a/data/bots/ai-robots-txt.yaml
+++ b/data/bots/ai-robots-txt.yaml
@@ -1,4 +1,4 @@
- name: "ai-robots-txt"
- user_agent_regex: >
+ user_agent_regex: >-
AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|Perplexity-User|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot
action: DENY \ No newline at end of file
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
index bedd38e..2e7c311 100644
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+- Ensure regexes can't end in newlines ([#372](https://github.com/TecharoHQ/anubis/issues/372))
- Add documentation for default allow behavior (implicit rule)
- Enable [importing configuration snippets](./admin/configuration/import.mdx) ([#321](https://github.com/TecharoHQ/anubis/pull/321))
- Refactor check logic to be more generic and work on a Checker type
diff --git a/lib/policy/checker.go b/lib/policy/checker.go
index ad98ced..51223cd 100644
--- a/lib/policy/checker.go
+++ b/lib/policy/checker.go
@@ -110,11 +110,11 @@ func NewUserAgentChecker(rexStr string) (Checker, error) {
}
func NewHeaderMatchesChecker(header, rexStr string) (Checker, error) {
- rex, err := regexp.Compile(rexStr)
+ rex, err := regexp.Compile(strings.TrimSpace(rexStr))
if err != nil {
return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
}
- return &HeaderMatchesChecker{header, rex, internal.SHA256sum(header + ": " + rexStr)}, nil
+ return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.SHA256sum(header + ": " + rexStr)}, nil
}
func (hmc *HeaderMatchesChecker) Check(r *http.Request) (bool, error) {
@@ -135,7 +135,7 @@ type PathChecker struct {
}
func NewPathChecker(rexStr string) (Checker, error) {
- rex, err := regexp.Compile(rexStr)
+ rex, err := regexp.Compile(strings.TrimSpace(rexStr))
if err != nil {
return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
}
@@ -155,7 +155,7 @@ func (pc *PathChecker) Hash() string {
}
func NewHeaderExistsChecker(key string) Checker {
- return headerExistsChecker{key}
+ return headerExistsChecker{strings.TrimSpace(key)}
}
type headerExistsChecker struct {
@@ -180,11 +180,11 @@ func NewHeadersChecker(headermap map[string]string) (Checker, error) {
for key, rexStr := range headermap {
if rexStr == ".*" {
- result = append(result, headerExistsChecker{key})
+ result = append(result, headerExistsChecker{strings.TrimSpace(key)})
continue
}
- rex, err := regexp.Compile(rexStr)
+ rex, err := regexp.Compile(strings.TrimSpace(rexStr))
if err != nil {
errs = append(errs, fmt.Errorf("while compiling header %s regex %s: %w", key, rexStr, err))
continue
diff --git a/lib/policy/config/config.go b/lib/policy/config/config.go
index 627e9cf..c670bac 100644
--- a/lib/policy/config/config.go
+++ b/lib/policy/config/config.go
@@ -24,6 +24,7 @@ var (
ErrInvalidPathRegex = errors.New("config.Bot: invalid path regex")
ErrInvalidHeadersRegex = errors.New("config.Bot: invalid headers regex")
ErrInvalidCIDR = errors.New("config.Bot: invalid CIDR")
+ ErrRegexEndsWithNewline = errors.New("config.Bot: regular expression ends with newline (try >- instead of > in yaml)")
ErrInvalidImportStatement = errors.New("config.ImportStatement: invalid source file")
ErrCantSetBotAndImportValuesAtOnce = errors.New("config.BotOrImport: can't set bot rules and import values at the same time")
ErrMustSetBotOrImportRules = errors.New("config.BotOrImport: rule definition is invalid, you must set either bot rules or an import statement, not both")
@@ -91,12 +92,20 @@ func (b BotConfig) Valid() error {
}
if b.UserAgentRegex != nil {
+ if strings.HasSuffix(*b.UserAgentRegex, "\n") {
+ errs = append(errs, fmt.Errorf("%w: user agent regex: %q", ErrRegexEndsWithNewline, *b.UserAgentRegex))
+ }
+
if _, err := regexp.Compile(*b.UserAgentRegex); err != nil {
errs = append(errs, ErrInvalidUserAgentRegex, err)
}
}
if b.PathRegex != nil {
+ if strings.HasSuffix(*b.PathRegex, "\n") {
+ errs = append(errs, fmt.Errorf("%w: path regex: %q", ErrRegexEndsWithNewline, *b.PathRegex))
+ }
+
if _, err := regexp.Compile(*b.PathRegex); err != nil {
errs = append(errs, ErrInvalidPathRegex, err)
}
@@ -108,6 +117,10 @@ func (b BotConfig) Valid() error {
continue
}
+ if strings.HasSuffix(expr, "\n") {
+ errs = append(errs, fmt.Errorf("%w: header %s regex: %q", ErrRegexEndsWithNewline, name, expr))
+ }
+
if _, err := regexp.Compile(expr); err != nil {
errs = append(errs, ErrInvalidHeadersRegex, err)
}
diff --git a/lib/policy/config/testdata/bad/regex_ends_newline.json b/lib/policy/config/testdata/bad/regex_ends_newline.json
new file mode 100644
index 0000000..14c7fa9
--- /dev/null
+++ b/lib/policy/config/testdata/bad/regex_ends_newline.json
@@ -0,0 +1,21 @@
+{
+ "bots": [
+ {
+ "name": "user-agent-ends-newline",
+ "user_agent_regex": "Mozilla\n",
+ "action": "CHALLENGE"
+ },
+ {
+ "name": "path-ends-newline",
+ "path_regex": "^/evil/.*$\n",
+ "action": "CHALLENGE"
+ },
+ {
+ "name": "headers-ends-newline",
+ "headers_regex": {
+ "CF-Worker": ".*\n"
+ },
+ "action": "CHALLENGE"
+ }
+ ]
+}
diff --git a/lib/policy/config/testdata/bad/regex_ends_newline.yaml b/lib/policy/config/testdata/bad/regex_ends_newline.yaml
new file mode 100644
index 0000000..1f0ae85
--- /dev/null
+++ b/lib/policy/config/testdata/bad/regex_ends_newline.yaml
@@ -0,0 +1,17 @@
+bots:
+- name: user-agent-ends-newline
+ # Subtle bug: this ends with a newline
+ user_agent_regex: >
+ Mozilla
+ action: CHALLENGE
+- name: path-ends-newline
+ # Subtle bug: this ends with a newline
+ path_regex: >
+ ^/evil/.*$
+ action: CHALLENGE
+- name: headers-ends-newline
+ # Subtle bug: this ends with a newline
+ headers_regex:
+ CF-Worker: >
+ .*
+ action: CHALLENGE \ No newline at end of file