feat(haproxy-logs): ingest HAProxy request logs into Richie DB

Add a pipeline to load HAProxy `option httplog` lines into the Richie database so bot/crawler traffic can be analyzed. - model: HaproxyRequest mirroring the httplog format, with a unique line_hash dedup key and indexes on common filter columns - migration: create the haproxy_request table (unique line_hash + indexes) - haproxy_logs package: - parser: httplog line -> columns, strips the journald prefix and hashes the normalized line - ingest: batched, idempotent insert that skips rows whose line_hash already exists, so re-ingesting the same logs is a no-op - cli: ingest-only `haproxy-logs` command reading stdin or a file - tests: parsing of a real GPTBot line and idempotent re-ingestion
2026-06-23 21:13:20 -04:00
parent e1c4ae0d6e
commit 1d1bafbd30
7 changed files with 576 additions and 0 deletions
@@ -0,0 +1 @@
+"""Load HAProxy ``option httplog`` lines into SQLite and query them."""
@@ -0,0 +1,54 @@
+"""Command-line interface: load HAProxy logs into the Richie database.
+
+The table schema is managed with alembic (``database richie upgrade head``); this
+command only inserts rows.
+
+Examples:
+    # stream the live log into the database (commit every line)
+    journalctl -u haproxy -o cat -f | haproxy-logs ingest --batch-size 1
+
+    # backfill from a saved log file
+    haproxy-logs ingest --file /tmp/haproxy.log
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Annotated
+
+import typer
+from sqlalchemy.orm import Session
+
+from python.haproxy_logs.ingest import ingest_lines
+from python.orm.common import get_postgres_engine
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+app = typer.Typer(help="Load HAProxy logs into the Richie database.", no_args_is_help=True)
+
+
+@app.command()
+def ingest(
+    file: Annotated[str | None, typer.Option(help="Read lines from a file instead of stdin.")] = None,
+    batch_size: Annotated[int, typer.Option(help="Rows per commit; use 1 when tailing a live log.")] = 100,
+) -> None:
+    """Parse HAProxy log lines from stdin (or a file) and store them in the Richie DB."""
+    engine = get_postgres_engine(name="RICHIE")
+    with Session(engine) as session:
+        result = ingest_lines(_read_lines(file), session, batch_size=batch_size)
+    typer.echo(f"inserted={result.inserted} duplicates={result.duplicates} skipped={result.skipped}")
+
+
+def _read_lines(file: str | None) -> Iterable[str]:
+    """Yield log lines from a file, or from stdin when no file is given."""
+    if file is None:
+        yield from sys.stdin
+        return
+    with Path(file).open(encoding="utf-8", errors="replace") as handle:
+        yield from handle
+
+
+if __name__ == "__main__":
+    app()
@@ -0,0 +1,88 @@
+"""Parse HAProxy log lines and persist them to the database.
+
+Ingestion is idempotent: each row carries a ``line_hash`` (a unique column), and
+this module skips lines whose hash already exists, so the same logs can be
+re-ingested without creating duplicate records.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from sqlalchemy import select
+
+from python.haproxy_logs.parser import parse_line
+from python.orm.richie.haproxy import HaproxyRequest
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from sqlalchemy.orm import Session
+
+
+@dataclass(frozen=True)
+class IngestResult:
+    """Summary counts for an ingest run."""
+
+    inserted: int
+    skipped: int
+    duplicates: int
+
+
+def ingest_lines(lines: Iterable[str], session: Session, *, batch_size: int = 100) -> IngestResult:
+    """Parse log lines and insert new ones into the database in batches.
+
+    Lines already present (same ``line_hash``) are dropped, as are duplicate
+    lines within the same run. Unparseable non-blank lines are counted as
+    ``skipped`` rather than raising, so a stray startup message never aborts a
+    streaming ingest.
+
+    Args:
+        lines: Iterable of raw log lines (with or without a journald prefix).
+        session: Database session to insert into.
+        batch_size: Number of distinct rows to buffer before committing. Use
+            ``1`` when tailing a live log so rows land immediately.
+
+    Returns:
+        Counts of inserted, skipped (unparseable) and duplicate lines.
+    """
+    inserted = 0
+    skipped = 0
+    parsed_count = 0
+    batch: dict[str, HaproxyRequest] = {}
+
+    for line in lines:
+        parsed = parse_line(line)
+        if parsed is None:
+            if line.strip():
+                skipped += 1
+            continue
+        parsed_count += 1
+        batch[parsed["line_hash"]] = HaproxyRequest(**parsed)
+        if len(batch) >= batch_size:
+            inserted += _flush(batch, session)
+
+    inserted += _flush(batch, session)
+    return IngestResult(inserted=inserted, skipped=skipped, duplicates=parsed_count - inserted)
+
+
+def _flush(batch: dict[str, HaproxyRequest], session: Session) -> int:
+    """Insert the rows whose hash is not already stored, then clear the batch.
+
+    Returns the number of rows actually written.
+    """
+    if not batch:
+        return 0
+
+    existing = set(
+        session.scalars(select(HaproxyRequest.line_hash).where(HaproxyRequest.line_hash.in_(batch.keys()))),
+    )
+    new_rows = [row for line_hash, row in batch.items() if line_hash not in existing]
+    batch.clear()
+
+    if not new_rows:
+        return 0
+    session.add_all(new_rows)
+    session.commit()
+    return len(new_rows)
@@ -0,0 +1,132 @@
+"""Parse HAProxy ``option httplog`` lines into column mappings.
+
+The expected format (with the request-header capture this project configures) is::
+
+    <client_ip>:<port> [<accept_date>] <frontend> <backend>/<server>
+        <TR>/<Tw>/<Tc>/<Tr>/<Ta> <status> <bytes> <req_cookie> <resp_cookie>
+        <term_state> <ac>/<fc>/<bc>/<sc>/<rc> <srv_q>/<back_q>
+        {<host>|<user_agent>} "<method> <target> <version>"
+
+Lines may still carry a systemd-journal prefix (``... haproxy[123]: ``); it is
+stripped before parsing. Lines that are not request logs (startup messages,
+health checks, ...) return ``None``.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import re
+from datetime import datetime
+from typing import Any
+
+# Strips an optional journal prefix such as "Jun 22 20:17:46 jeeves haproxy[688739]: ".
+_JOURNAL_PREFIX = re.compile(r"^.*?haproxy(?:\[\d+\])?:\s+")
+
+_LOG_LINE = re.compile(
+    r"""
+    ^
+    (?P<client_ip>[0-9a-fA-F:.]+):(?P<client_port>\d+)\s+
+    \[(?P<accept_date>[^\]]+)\]\s+
+    (?P<frontend>\S+)\s+
+    (?P<backend>[^/\s]+)/(?P<server>\S+)\s+
+    (?P<time_request>-?\d+)/(?P<time_queue>-?\d+)/(?P<time_connect>-?\d+)/
+    (?P<time_response>-?\d+)/(?P<time_total>-?\d+)\s+
+    (?P<status_code>-?\d+)\s+
+    (?P<bytes_read>\d+)\s+
+    \S+\s+\S+\s+                                    # captured request/response cookies
+    (?P<termination_state>\S+)\s+
+    (?P<active_connections>\d+)/(?P<frontend_connections>\d+)/
+    (?P<backend_connections>\d+)/(?P<server_connections>\d+)/(?P<retries>\d+)\s+
+    (?P<server_queue>\d+)/(?P<backend_queue>\d+)\s+
+    (?P<captures>(?:\{[^}]*\}\s+)*)
+    "(?P<request>[^"]*)"
+    \s*$
+    """,
+    re.VERBOSE,
+)
+
+_ACCEPT_DATE_FORMAT = "%d/%b/%Y:%H:%M:%S.%f"
+_HEADER_CAPTURE = re.compile(r"\{([^}]*)\}")
+
+
+def parse_line(line: str) -> dict[str, Any] | None:
+    """Parse one HAProxy http-log line into a ``HaproxyRequest`` column mapping.
+
+    Args:
+        line: A raw log line, optionally still carrying a journald prefix.
+
+    Returns:
+        A mapping of column name to value, or ``None`` if the line is not a
+        request log (blank lines, startup messages, aborted health checks, ...).
+    """
+    stripped = _JOURNAL_PREFIX.sub("", line.strip())
+    match = _LOG_LINE.match(stripped)
+    if match is None:
+        return None
+
+    groups = match.groupdict()
+    frontend = groups["frontend"]
+    is_ssl = frontend.endswith("~")
+    host, user_agent = _split_request_headers(groups["captures"])
+    method, target, http_version = _split_request(groups["request"])
+    path, _, query = target.partition("?")
+
+    return {
+        "line_hash": hashlib.sha256(stripped.encode("utf-8")).hexdigest(),
+        "requested_at": _parse_accept_date(groups["accept_date"]),
+        "client_ip": groups["client_ip"],
+        "client_port": int(groups["client_port"]),
+        "frontend": frontend.rstrip("~"),
+        "ssl": is_ssl,
+        "backend": groups["backend"],
+        "server": groups["server"],
+        "time_request": int(groups["time_request"]),
+        "time_queue": int(groups["time_queue"]),
+        "time_connect": int(groups["time_connect"]),
+        "time_response": int(groups["time_response"]),
+        "time_total": int(groups["time_total"]),
+        "status_code": int(groups["status_code"]),
+        "bytes_read": int(groups["bytes_read"]),
+        "termination_state": groups["termination_state"],
+        "active_connections": int(groups["active_connections"]),
+        "frontend_connections": int(groups["frontend_connections"]),
+        "backend_connections": int(groups["backend_connections"]),
+        "server_connections": int(groups["server_connections"]),
+        "retries": int(groups["retries"]),
+        "server_queue": int(groups["server_queue"]),
+        "backend_queue": int(groups["backend_queue"]),
+        "host": host,
+        "user_agent": user_agent,
+        "method": method,
+        "target": target,
+        "path": path,
+        "query": query or None,
+        "http_version": http_version,
+    }
+
+
+def _parse_accept_date(value: str) -> datetime:
+    """Parse HAProxy's accept date and attach the host's local timezone."""
+    # HAProxy logs naive local wall-clock time; astimezone() interprets it as the
+    # host's local zone and returns a timezone-aware datetime.
+    return datetime.strptime(value, _ACCEPT_DATE_FORMAT).astimezone()
+
+
+def _split_request_headers(captures: str) -> tuple[str | None, str | None]:
+    """Pull the Host and User-Agent out of the first ``{host|user-agent}`` capture."""
+    match = _HEADER_CAPTURE.search(captures)
+    if match is None:
+        return None, None
+    host, _, user_agent = match.group(1).partition("|")
+    return (host or None), (user_agent or None)
+
+
+def _split_request(request: str) -> tuple[str, str, str]:
+    """Split a request line into method, target and HTTP version.
+
+    Tolerates malformed values such as ``<BADREQ>`` by returning empty strings
+    for the missing parts.
+    """
+    method, _, remainder = request.partition(" ")
+    target, _, http_version = remainder.partition(" ")
+    return method, target, http_version
				`@@ -0,0 +1 @@`
				"""Load HAProxy ``option httplog`` lines into SQLite and query them."""