feat(haproxy-logs): ingest HAProxy request logs into Richie DB
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.
- model: HaproxyRequest mirroring the httplog format, with a unique
  line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
  - parser: httplog line -> columns; strips the journald prefix and
    hashes the normalized line
  - ingest: batched, idempotent insert that skips rows whose line_hash
    already exists, so re-ingesting the same logs is a no-op
  - cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
This commit is contained in:
@@ -0,0 +1,127 @@
|
||||
"""test_haproxy_logs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from sqlalchemy import create_engine, func, select
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.pool import StaticPool
|
||||
|
||||
from python.haproxy_logs.ingest import ingest_lines
|
||||
from python.haproxy_logs.parser import parse_line
|
||||
from python.orm.richie.haproxy import HaproxyRequest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterator
|
||||
|
||||
# A journald-prefixed HAProxy httplog line for a GPTBot request, used as the
# parsing fixture. The segments are concatenated verbatim; each one carries its
# own trailing space so the joined result is a single log line.
GPTBOT_LINE = "".join(
    [
        # journald prefix followed by client ip:port
        "Jun 22 20:17:46 jeeves haproxy[688739]: 74.7.242.30:59644 ",
        # accept timestamp, frontend (~ suffix), backend/server, timers
        "[22/Jun/2026:20:17:46.227] ContentSwitching~ gitea/server 0/0/0/133/148 ",
        # status code, bytes read, cookies, termination state, connection counts, queues
        "200 292890 - - ---- 7/7/5/5/0 0/0 ",
        # captured request headers: host|user-agent
        "{gitea.tmmworkshop.com|like Gecko; compatible; GPTBot/1.4; +https://openai.com/gptbot)} ",
        # quoted request line
        '"GET https://gitea.tmmworkshop.com/Richie/dotfiles/src/commit/abc/installer.py?display=source HTTP/2.0"',
    ]
)
|
||||
|
||||
|
||||
def _line(client_ip: str, path: str, time_response: int, user_agent: str = "curl/8") -> str:
    """Build a synthetic httplog line (no journald prefix) for ingest tests.

    Args:
        client_ip: Client address placed before the fixed ``:50000`` port.
        path: Request path appended to ``https://gitea.tmmworkshop.com``.
        time_response: Value written into both the fourth and fifth timer slots.
        user_agent: Value placed after the host in the captured-header braces.

    Returns:
        A single log line with a fixed timestamp, frontend, backend and a
        ``200`` status.
    """
    connection = f"{client_ip}:50000 [22/Jun/2026:20:17:46.227] ContentSwitching~ gitea/server "
    timers_and_status = f"0/0/0/{time_response}/{time_response} 200 100 - - ---- 1/1/1/1/0 0/0 "
    headers_and_request = f'{{gitea.tmmworkshop.com|{user_agent}}} "GET https://gitea.tmmworkshop.com{path} HTTP/2.0"'
    return "".join((connection, timers_and_status, headers_and_request))
|
||||
|
||||
|
||||
def test_parse_real_gptbot_line() -> None:
    """Parsing the captured GPTBot line yields every expected column value."""
    fields = parse_line(GPTBOT_LINE)
    assert fields is not None

    # Columns that must match exactly.
    expected = {
        "client_ip": "74.7.242.30",
        "client_port": 59644,
        "frontend": "ContentSwitching",
        "backend": "gitea",
        "time_response": 133,
        "time_total": 148,
        "status_code": 200,
        "bytes_read": 292890,
        "host": "gitea.tmmworkshop.com",
        "method": "GET",
        "path": "https://gitea.tmmworkshop.com/Richie/dotfiles/src/commit/abc/installer.py",
        "query": "display=source",
        "http_version": "HTTP/2.0",
    }
    for column, wanted in expected.items():
        assert fields[column] == wanted, column

    # The ssl flag must be the bool True itself, not merely a truthy value.
    assert fields["ssl"] is True
    # The captured user agent is only checked for the bot token.
    assert "GPTBot/1.4" in fields["user_agent"]
|
||||
|
||||
|
||||
def test_parse_non_request_line_returns_none() -> None:
    """Lines that are not request logs parse to ``None`` instead of raising."""
    non_request_lines = (
        "Jun 22 20:00:00 jeeves haproxy[1]: Proxy ContentSwitching started.",
        "",
    )
    for raw in non_request_lines:
        assert parse_line(raw) is None
|
||||
|
||||
|
||||
@pytest.fixture
def session() -> Iterator[Session]:
    """Yield an in-memory SQLite session with only the haproxy_request table.

    A ``StaticPool`` keeps a single connection so the in-memory database is
    shared by everything that uses this engine.

    Yields:
        An open ``Session`` bound to the throwaway engine.
    """
    engine = create_engine(
        "sqlite://",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )
    try:
        HaproxyRequest.__table__.create(bind=engine, checkfirst=True)
        with Session(engine) as open_session:
            yield open_session
    finally:
        # Release the pooled connection so each test's in-memory database is
        # torn down instead of lingering until garbage collection.
        engine.dispose()
|
||||
|
||||
|
||||
def test_ingest_inserts_and_skips(session: Session) -> None:
    """Request lines are stored while non-request lines are skipped without error."""
    request_lines = [
        _line("10.0.0.1", "/a", 10),
        _line("10.0.0.2", "/b", 20, user_agent="GPTBot/1.4"),
    ]
    # NOTE(review): only one of these two entries is expected to count as
    # skipped; presumably the blank line is ignored outright. Confirm against
    # ingest_lines.
    noise_lines = [
        "Jun 22 20:00:00 jeeves haproxy[1]: Proxy ContentSwitching started.",
        "",
    ]

    outcome = ingest_lines(request_lines + noise_lines, session)

    assert (outcome.inserted, outcome.skipped, outcome.duplicates) == (2, 1, 0)

    row_count = session.scalar(select(func.count()).select_from(HaproxyRequest))
    assert row_count == 2
|
||||
|
||||
|
||||
def test_reingest_is_idempotent(session: Session) -> None:
    """Ingesting the same batch twice leaves the row count unchanged."""
    batch = [
        _line("10.0.0.1", "/a", 10),
        _line("10.0.0.2", "/b", 20),
    ]

    # First pass: everything is new.
    initial = ingest_lines(batch, session)
    assert (initial.inserted, initial.duplicates) == (2, 0)

    # Second pass: every line is reported as a duplicate.
    repeat = ingest_lines(batch, session)
    assert (repeat.inserted, repeat.duplicates) == (0, 2)

    row_count = session.scalar(select(func.count()).select_from(HaproxyRequest))
    assert row_count == 2
|
||||
|
||||
|
||||
def test_duplicate_lines_within_one_run_collapse(session: Session) -> None:
    """Three copies of one line in a single run produce exactly one row."""
    repeated = [_line("10.0.0.1", "/a", 10)] * 3

    outcome = ingest_lines(repeated, session)

    assert (outcome.inserted, outcome.duplicates) == (1, 2)

    row_count = session.scalar(select(func.count()).select_from(HaproxyRequest))
    assert row_count == 1
|
||||
|
||||
|
||||
def test_ingest_persists_parsed_fields(session: Session) -> None:
    """The row written for the GPTBot line carries the parsed column values."""
    ingest_lines([GPTBOT_LINE], session)

    row = session.scalar(select(HaproxyRequest))
    assert row is not None

    assert (row.client_ip, row.backend, row.time_response) == ("74.7.242.30", "gitea", 133)
    assert "GPTBot/1.4" in row.user_agent
|
||||
Reference in New Issue
Block a user