feat(haproxy-logs): ingest HAProxy request logs into Richie DB
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.
- model: HaproxyRequest mirroring the httplog format, with a unique
line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
- parser: httplog line -> columns, strips the journald prefix and
hashes the normalized line
- ingest: batched, idempotent insert that skips rows whose line_hash
already exists, so re-ingesting the same logs is a no-op
- cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
This commit is contained in:
@@ -0,0 +1,71 @@
|
||||
"""HAProxy request-log ORM model.
|
||||
|
||||
The columns mirror HAProxy's default ``option httplog`` format. The table lives in
|
||||
the Richie database; manage its schema with the usual alembic workflow::
|
||||
|
||||
database richie revision --autogenerate -m "add haproxy_request table"
|
||||
database richie upgrade head
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import BigInteger, DateTime
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from python.orm.richie.base import TableBase
|
||||
|
||||
|
||||
class HaproxyRequest(TableBase):
|
||||
"""A single HAProxy HTTP request log line.
|
||||
|
||||
Timer fields (``time_*``) are milliseconds and may be ``-1`` when a phase did
|
||||
not complete, for example a client that disconnected before the response was
|
||||
fully sent.
|
||||
"""
|
||||
|
||||
__tablename__ = "haproxy_request"
|
||||
|
||||
# SHA-256 of the normalized log line. HAProxy log lines have no natural key,
|
||||
# so this hash is the dedup key: re-ingesting the same lines is a no-op.
|
||||
line_hash: Mapped[str] = mapped_column(unique=True)
|
||||
|
||||
# When HAProxy accepted the request. HAProxy logs local wall-clock time; the
|
||||
# parser attaches the host's timezone so this is stored timezone-aware.
|
||||
requested_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
|
||||
|
||||
client_ip: Mapped[str] = mapped_column(index=True)
|
||||
client_port: Mapped[int]
|
||||
|
||||
frontend: Mapped[str]
|
||||
ssl: Mapped[bool]
|
||||
backend: Mapped[str] = mapped_column(index=True)
|
||||
server: Mapped[str]
|
||||
|
||||
time_request: Mapped[int]
|
||||
time_queue: Mapped[int]
|
||||
time_connect: Mapped[int]
|
||||
time_response: Mapped[int] = mapped_column(index=True)
|
||||
time_total: Mapped[int]
|
||||
|
||||
status_code: Mapped[int] = mapped_column(index=True)
|
||||
bytes_read: Mapped[int] = mapped_column(BigInteger)
|
||||
termination_state: Mapped[str]
|
||||
|
||||
active_connections: Mapped[int]
|
||||
frontend_connections: Mapped[int]
|
||||
backend_connections: Mapped[int]
|
||||
server_connections: Mapped[int]
|
||||
retries: Mapped[int]
|
||||
server_queue: Mapped[int]
|
||||
backend_queue: Mapped[int]
|
||||
|
||||
host: Mapped[str | None] = mapped_column(index=True)
|
||||
user_agent: Mapped[str | None] = mapped_column(index=True)
|
||||
|
||||
method: Mapped[str]
|
||||
target: Mapped[str]
|
||||
path: Mapped[str] = mapped_column(index=True)
|
||||
query: Mapped[str | None]
|
||||
http_version: Mapped[str]
|
||||
Reference in New Issue
Block a user