1d1bafbd30
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.
- model: HaproxyRequest mirroring the httplog format, with a unique
  line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
  - parser: httplog line -> columns, strips the journald prefix and
    hashes the normalized line
  - ingest: batched, idempotent insert that skips rows whose line_hash
    already exists, so re-ingesting the same logs is a no-op
  - cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
72 lines
2.3 KiB
Python
72 lines
2.3 KiB
Python
"""HAProxy request-log ORM model.
|
|
|
|
The columns mirror HAProxy's default ``option httplog`` format. The table lives in
|
|
the Richie database; manage its schema with the usual alembic workflow::
|
|
|
|
database richie revision --autogenerate -m "add haproxy_request table"
|
|
database richie upgrade head
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
|
|
from sqlalchemy import BigInteger, DateTime
|
|
from sqlalchemy.orm import Mapped, mapped_column
|
|
|
|
from python.orm.richie.base import TableBase
|
|
|
|
|
|
class HaproxyRequest(TableBase):
    """One HAProxy ``option httplog`` entry, stored as a single row.

    Every ``time_*`` column holds a duration in milliseconds. HAProxy reports
    ``-1`` for a phase that never finished (for instance when the client went
    away before the whole response had been sent), so ``-1`` is a sentinel and
    not a real duration.
    """

    __tablename__ = "haproxy_request"

    # --- identity ---------------------------------------------------------
    # Log lines carry no natural key, so the SHA-256 of the normalized line
    # serves as the dedup key; the unique constraint is what lets a repeated
    # ingest of the same lines be a no-op.
    line_hash: Mapped[str] = mapped_column(unique=True)

    # --- when / who -------------------------------------------------------
    # Accept time of the request. HAProxy writes local wall-clock time and the
    # parser adds the host's timezone, hence the timezone-aware column type.
    requested_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        index=True,
    )

    client_ip: Mapped[str] = mapped_column(index=True)
    client_port: Mapped[int]

    # --- routing ----------------------------------------------------------
    frontend: Mapped[str]
    # NOTE(review): presumably set when the frontend connection used TLS --
    # confirm against the parser.
    ssl: Mapped[bool]
    backend: Mapped[str] = mapped_column(index=True)
    server: Mapped[str]

    # --- timers (milliseconds, -1 when the phase did not complete) --------
    time_request: Mapped[int]
    time_queue: Mapped[int]
    time_connect: Mapped[int]
    time_response: Mapped[int] = mapped_column(index=True)
    time_total: Mapped[int]

    # --- outcome ----------------------------------------------------------
    status_code: Mapped[int] = mapped_column(index=True)
    # BigInteger so large byte counts fit in the column.
    bytes_read: Mapped[int] = mapped_column(BigInteger)
    termination_state: Mapped[str]

    # --- connection, retry and queue counters from the log line -----------
    active_connections: Mapped[int]
    frontend_connections: Mapped[int]
    backend_connections: Mapped[int]
    server_connections: Mapped[int]
    retries: Mapped[int]
    server_queue: Mapped[int]
    backend_queue: Mapped[int]

    # --- request metadata (nullable: may be missing from a line) ----------
    # NOTE(review): presumably taken from captured request headers -- confirm
    # against the parser.
    host: Mapped[str | None] = mapped_column(index=True)
    user_agent: Mapped[str | None] = mapped_column(index=True)

    # --- request line -----------------------------------------------------
    method: Mapped[str]
    # NOTE(review): ``target`` looks like the raw request target, with
    # ``path`` and ``query`` as its split parts -- confirm against the parser.
    target: Mapped[str]
    path: Mapped[str] = mapped_column(index=True)
    query: Mapped[str | None]
    http_version: Mapped[str]