services: nginx: rate limit crawlers for forgejo via nginx instead of fail2ban

.. as these requests are actually expensive; for other services it's
fine.

Signed-off-by: Christoph Heiss <christoph@c8h4.io>
Author: Christoph Heiss <christoph@c8h4.io>
Date:   2024-09-30 14:00:41 +02:00
Commit: 0cdf73b58c (parent 4a3fc75793)
Signed by: c8h4 (GPG key ID 1538094429952F86)

2 changed files with 28 additions and 49 deletions


@@ -146,7 +146,18 @@ in {
     };
   };
-  services.nginx.virtualHosts.${fqdn} =
+  services.nginx = {
+    appendHttpConfig = ''
+      map $http_user_agent $git_bad_crawlers {
+        default "";
+        ~*(?<name>Amazonbot|Bytespider|meta-externalagent|ClaudeBot|YandexBot) $name;
+      }
+      # heavily limit these crawlers to 1 request per minute, since requests
+      # on git repositories are quite heavy in comparison
+      limit_req_zone $git_bad_crawlers zone=gitbadcrawlers:32m rate=1r/m;
+    '';
+    virtualHosts.${fqdn} =
     let inherit (config.services.forgejo.settings.server) HTTP_ADDR HTTP_PORT;
     in {
       forceSSL = true;
@@ -156,10 +167,13 @@ in {
         proxyPass = "http://[${HTTP_ADDR}]:${toString HTTP_PORT}";
         proxyWebsockets = true;
         extraConfig = ''
+          limit_req_status 429;
+          limit_req zone=gitbadcrawlers burst=2 nodelay;
           client_max_body_size 256M;
         '';
       };
     };
+  };
   users.groups.${cfg.group} = { };
   users.users.${cfg.user} = {
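
For context, the Nix above renders down to plain nginx directives, roughly like the sketch below (server name and proxy address are placeholders, not the actual values). Two details carry the whole scheme: the map yields an empty string for every normal client, and nginx does not account requests whose limit_req_zone key is empty, so only the listed crawlers are ever throttled; burst=2 nodelay then admits two extra requests immediately, after which further matching requests within the minute are rejected with the configured 429 status.

    http {
        # crawlers in the list get their matched name as key; everyone else gets ""
        map $http_user_agent $git_bad_crawlers {
            default "";
            ~*(?<name>Amazonbot|Bytespider|meta-externalagent|ClaudeBot|YandexBot) $name;
        }
        # requests with an empty key are not accounted against the zone
        limit_req_zone $git_bad_crawlers zone=gitbadcrawlers:32m rate=1r/m;

        server {
            server_name git.example.org;  # placeholder for ${fqdn}
            location / {
                limit_req_status 429;
                limit_req zone=gitbadcrawlers burst=2 nodelay;
                client_max_body_size 256M;
                proxy_pass http://[::1]:3000;  # placeholder for HTTP_ADDR/HTTP_PORT
            }
        }
    }

A quick sanity check is something like curl -sI -H 'User-Agent: ClaudeBot' https://<fqdn>/ repeated a few times: with rate=1r/m and burst=2, the fourth request within a minute should come back 429, while a normal User-Agent keeps getting 200.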


@@ -19,7 +19,7 @@
     recommendedTlsSettings = true;
     recommendedZstdSettings = true;
     clientMaxBodySize = lib.mkDefault "16M";
-    appendHttpConfig = ''
+    commonHttpConfig = ''
       # avoid hitting the disk
       proxy_max_temp_file_size 0;
     '';
@@ -48,39 +48,4 @@
     after = [ "network-online.target" ];
     wants = [ "network-online.target" ];
   };
-  services.fail2ban.jails = lib.mkIf config.services.fail2ban.enable {
-    apache-badbots.settings = {
-      enabled = true;
-      backend = "pyinotify";
-      port = "http,https";
-      filter = "apache-badbots";
-      logpath = "/var/log/nginx/access.log";
-      maxretry = 1;
-      bantime = "72h";
-    };
-    disrespectful-crawlers = {
-      filter = {
-        Definition = {
-          badcrawlers =
-            ".*(Amazonbot|Bytespider|meta-externalagent|ClaudeBot).*";
-          failregex =
-            ''^<HOST> -.*"(GET|POST|HEAD).*HTTP.*"(?:%(badcrawlers)s)"$'';
-          ignoreregex = "";
-          datepattern = ''
-            ^[^\[]*\[({DATE})
-            {^LN-BEG}'';
-        };
-      };
-      settings = {
-        enabled = true;
-        backend = "pyinotify";
-        port = "http,https";
-        filter = "disrespectful-crawlers";
-        logpath = "/var/log/nginx/access.log";
-        maxretry = 1;
-        bantime = "72h";
-      };
-    };
-  };
 }
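
A note on the appendHttpConfig -> commonHttpConfig switch in the shared module: as far as the NixOS nginx module's generated config goes, both options land inside the http block, but commonHttpConfig is emitted near the top of it while appendHttpConfig is emitted at the very end, after all virtual hosts. Moving the host-wide proxy tuning over keeps appendHttpConfig free for per-service snippets such as the forgejo map/limit_req_zone above. A rough sketch of the resulting layout, not the exact generated file:

    http {
        # ...mime types, recommended settings...

        # from commonHttpConfig (shared module)
        proxy_max_temp_file_size 0;

        # ...server blocks for all virtualHosts...

        # from appendHttpConfig (forgejo module)
        map $http_user_agent $git_bad_crawlers { ... }
        limit_req_zone $git_bad_crawlers zone=gitbadcrawlers:32m rate=1r/m;
    }

As I understand it, directive order inside http is not the issue here (nginx resolves shared-memory zone names after parsing the whole file, so a limit_req may reference a zone declared later); the switch just keeps host-wide defaults in one place and leaves appendHttpConfig to service modules.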