feat(haproxy-logs): ingest HAProxy request logs into Richie DB

Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.

- model: HaproxyRequest mirroring the httplog format, with a unique
  line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
  - parser: httplog line -> columns, strips the journald prefix and
    hashes the normalized line
  - ingest: batched, idempotent insert that skips rows whose line_hash
    already exists, so re-ingesting the same logs is a no-op
  - cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
This commit is contained in:
2026-06-23 21:13:20 -04:00
parent e1c4ae0d6e
commit 1d1bafbd30
7 changed files with 576 additions and 0 deletions
+71
View File
@@ -0,0 +1,71 @@
"""HAProxy request-log ORM model.

The columns mirror HAProxy's default ``option httplog`` format. The table lives in
the Richie database; manage its schema with the usual alembic workflow::

    database richie revision --autogenerate -m "add haproxy_request table"
    database richie upgrade head
"""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import BigInteger, DateTime
from sqlalchemy.orm import Mapped, mapped_column
from python.orm.richie.base import TableBase
class HaproxyRequest(TableBase):
    """One HAProxy HTTP request log line, stored as a single row.

    Every ``time_*`` column is a timer expressed in milliseconds. A timer is
    ``-1`` when its phase never completed -- for instance, when the client
    disconnected before the response had been fully sent.
    """

    __tablename__ = "haproxy_request"

    # --- Identity ----------------------------------------------------------
    # HAProxy log lines carry no natural key, so the SHA-256 of the normalized
    # line serves as the dedup key. The unique constraint is what lets the
    # same lines be ingested again as a no-op.
    line_hash: Mapped[str] = mapped_column(unique=True)

    # --- When / who --------------------------------------------------------
    # Moment HAProxy accepted the request. HAProxy writes local wall-clock
    # time; the parser attaches the host's timezone, so the stored value is
    # timezone-aware.
    requested_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
    client_ip: Mapped[str] = mapped_column(index=True)
    client_port: Mapped[int]

    # --- Routing -----------------------------------------------------------
    frontend: Mapped[str]
    ssl: Mapped[bool]
    backend: Mapped[str] = mapped_column(index=True)
    server: Mapped[str]

    # --- Timers (milliseconds, -1 when the phase did not complete) ---------
    time_request: Mapped[int]
    time_queue: Mapped[int]
    time_connect: Mapped[int]
    time_response: Mapped[int] = mapped_column(index=True)
    time_total: Mapped[int]

    # --- Response ----------------------------------------------------------
    status_code: Mapped[int] = mapped_column(index=True)
    # BigInteger rather than the default integer type for this counter.
    bytes_read: Mapped[int] = mapped_column(BigInteger)
    termination_state: Mapped[str]

    # --- Connection / queue counters ---------------------------------------
    active_connections: Mapped[int]
    frontend_connections: Mapped[int]
    backend_connections: Mapped[int]
    server_connections: Mapped[int]
    retries: Mapped[int]
    server_queue: Mapped[int]
    backend_queue: Mapped[int]

    # --- Captured headers (nullable) ---------------------------------------
    # NOTE(review): ``user_agent`` and ``path`` below are unbounded,
    # client-controlled strings that are also indexed. Some databases cap the
    # size of an indexed value (e.g. PostgreSQL B-tree entries) -- confirm a
    # very long value cannot make an insert fail.
    host: Mapped[str | None] = mapped_column(index=True)
    user_agent: Mapped[str | None] = mapped_column(index=True)

    # --- Request line ------------------------------------------------------
    # Presumably ``target`` is the request target as logged and ``path`` /
    # ``query`` are its two halves -- verify against the parser.
    method: Mapped[str]
    target: Mapped[str]
    path: Mapped[str] = mapped_column(index=True)
    query: Mapped[str | None]
    http_version: Mapped[str]