feat(haproxy-logs): ingest HAProxy request logs into Richie DB

Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.

- model: HaproxyRequest mirroring the httplog format, with a unique
  line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
  - parser: httplog line -> columns, strips the journald prefix and
    hashes the normalized line
  - ingest: batched, idempotent insert that skips rows whose line_hash
    already exists, so re-ingesting the same logs is a no-op
  - cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
This commit is contained in:
2026-06-23 21:13:20 -04:00
parent e1c4ae0d6e
commit 1d1bafbd30
7 changed files with 576 additions and 0 deletions
+127
View File
@@ -0,0 +1,127 @@
"""test_haproxy_logs."""
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from sqlalchemy import create_engine, func, select
from sqlalchemy.orm import Session
from sqlalchemy.pool import StaticPool
from python.haproxy_logs.ingest import ingest_lines
from python.haproxy_logs.parser import parse_line
from python.orm.richie.haproxy import HaproxyRequest
if TYPE_CHECKING:
from collections.abc import Iterator
# A captured GPTBot request as it appears in the journal: journald prefix
# followed by the HAProxy httplog payload. One tuple entry per
# whitespace-separated field so individual values are easy to spot.
GPTBOT_LINE = " ".join(
    (
        "Jun 22 20:17:46 jeeves haproxy[688739]:",  # journald prefix
        "74.7.242.30:59644",  # client ip:port
        "[22/Jun/2026:20:17:46.227]",
        "ContentSwitching~",  # frontend; the tests expect ssl=True for this
        "gitea/server",  # backend/server
        "0/0/0/133/148",  # timers; last two are time_response/time_total
        "200",  # status code
        "292890",  # bytes read
        "- - ----",
        "7/7/5/5/0",
        "0/0",
        # Captured request headers: host|user-agent.
        "{gitea.tmmworkshop.com|like Gecko; compatible; GPTBot/1.4; +https://openai.com/gptbot)}",
        '"GET https://gitea.tmmworkshop.com/Richie/dotfiles/src/commit/abc/installer.py?display=source HTTP/2.0"',
    )
)
def _line(client_ip: str, path: str, time_response: int, user_agent: str = "curl/8") -> str:
return (
f"{client_ip}:50000 [22/Jun/2026:20:17:46.227] ContentSwitching~ gitea/server "
f"0/0/0/{time_response}/{time_response} 200 100 - - ---- 1/1/1/1/0 0/0 "
f'{{gitea.tmmworkshop.com|{user_agent}}} "GET https://gitea.tmmworkshop.com{path} HTTP/2.0"'
)
def test_parse_real_gptbot_line() -> None:
    """Every field of the captured GPTBot line comes out of the parser as expected."""
    record = parse_line(GPTBOT_LINE)
    assert record is not None

    # Fields compared by plain equality, in the order they occur in the line.
    expected_values = {
        "client_ip": "74.7.242.30",
        "client_port": 59644,
        "frontend": "ContentSwitching",
        "backend": "gitea",
        "time_response": 133,
        "time_total": 148,
        "status_code": 200,
        "bytes_read": 292890,
        "host": "gitea.tmmworkshop.com",
        "method": "GET",
        "path": "https://gitea.tmmworkshop.com/Richie/dotfiles/src/commit/abc/installer.py",
        "query": "display=source",
        "http_version": "HTTP/2.0",
    }
    for field, wanted in expected_values.items():
        assert record[field] == wanted, field

    # Must be the real boolean, not merely a truthy value.
    assert record["ssl"] is True
    # Only a substring check: the captured user agent carries more than the bot token.
    assert "GPTBot/1.4" in record["user_agent"]
def test_parse_non_request_line_returns_none() -> None:
    """Lines that are not HTTP requests yield ``None`` instead of an exception."""
    non_request_inputs = (
        "Jun 22 20:00:00 jeeves haproxy[1]: Proxy ContentSwitching started.",
        "",
    )
    for raw in non_request_inputs:
        assert parse_line(raw) is None
@pytest.fixture
def session() -> Iterator[Session]:
    """Yield a Session on a private in-memory SQLite database.

    Only the ``haproxy_request`` table is created. ``StaticPool`` hands out
    the same single connection on every checkout, which keeps the in-memory
    database (and the table created here) alive for the whole test.

    Yields:
        An open ``Session``. On teardown the session is closed and the
        engine is disposed.
    """
    engine = create_engine(
        "sqlite://",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    try:
        HaproxyRequest.__table__.create(bind=engine, checkfirst=True)
        with Session(engine) as open_session:
            yield open_session
    finally:
        # Close the pooled SQLite connection deterministically instead of
        # leaving it to garbage collection (which can emit ResourceWarning).
        engine.dispose()
def test_ingest_inserts_and_skips(session: Session) -> None:
    """Request lines are stored; non-request input is tolerated and tallied."""
    request_lines = [
        _line("10.0.0.1", "/a", 10),
        _line("10.0.0.2", "/b", 20, user_agent="GPTBot/1.4"),
    ]
    startup_notice = "Jun 22 20:00:00 jeeves haproxy[1]: Proxy ContentSwitching started."
    blank = ""

    outcome = ingest_lines([*request_lines, startup_notice, blank], session)

    # NOTE(review): two non-request entries go in but `skipped` is expected to
    # be 1 — presumably blank lines are dropped without being counted; confirm
    # against ingest_lines.
    assert (outcome.inserted, outcome.skipped, outcome.duplicates) == (2, 1, 0)

    stored_rows = session.scalar(select(func.count()).select_from(HaproxyRequest))
    assert stored_rows == 2
def test_reingest_is_idempotent(session: Session) -> None:
    """Feeding the same batch twice leaves the table unchanged the second time."""
    batch = [
        _line("10.0.0.1", "/a", 10),
        _line("10.0.0.2", "/b", 20),
    ]

    # First pass: everything is new.
    initial = ingest_lines(batch, session)
    assert (initial.inserted, initial.duplicates) == (2, 0)

    # Second pass over the identical batch: every line is reported as a duplicate.
    repeat = ingest_lines(batch, session)
    assert (repeat.inserted, repeat.duplicates) == (0, 2)

    stored_rows = session.scalar(select(func.count()).select_from(HaproxyRequest))
    assert stored_rows == 2
def test_duplicate_lines_within_one_run_collapse(session: Session) -> None:
    """Three copies of one line in a single call produce exactly one row."""
    repeated = [_line("10.0.0.1", "/a", 10)] * 3

    outcome = ingest_lines(repeated, session)

    # One copy is inserted; the other two are counted as duplicates.
    assert (outcome.inserted, outcome.duplicates) == (1, 2)
    stored_rows = session.scalar(select(func.count()).select_from(HaproxyRequest))
    assert stored_rows == 1
def test_ingest_persists_parsed_fields(session: Session) -> None:
    """Parsed values survive the round trip through the database."""
    ingest_lines([GPTBOT_LINE], session)

    row = session.scalar(select(HaproxyRequest))
    assert row is not None

    persisted = (row.client_ip, row.backend, row.time_response)
    assert persisted == ("74.7.242.30", "gitea", 133)
    assert "GPTBot/1.4" in row.user_agent