feat(haproxy-logs): ingest HAProxy request logs into Richie DB
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.
- model: HaproxyRequest mirroring the httplog format, with a unique
  line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
  - parser: httplog line -> columns, strips the journald prefix and
    hashes the normalized line
  - ingest: batched, idempotent insert that skips rows whose line_hash
    already exists, so re-ingesting the same logs is a no-op
  - cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
This commit is contained in:
@@ -0,0 +1,54 @@
|
||||
"""Command-line interface: load HAProxy logs into the Richie database.
|
||||
|
||||
The table schema is managed with alembic (``database richie upgrade head``); this
|
||||
command only inserts rows.
|
||||
|
||||
Examples:
|
||||
# stream the live log into the database (commit every line)
|
||||
journalctl -u haproxy -o cat -f | haproxy-logs ingest --batch-size 1
|
||||
|
||||
# backfill from a saved log file
|
||||
haproxy-logs ingest --file /tmp/haproxy.log
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Annotated
|
||||
|
||||
import typer
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from python.haproxy_logs.ingest import ingest_lines
|
||||
from python.orm.common import get_postgres_engine
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
# Typer application object; sub-commands are registered on it with ``@app.command()``.
# ``no_args_is_help=True`` makes a bare invocation print the help text.
app = typer.Typer(help="Load HAProxy logs into the Richie database.", no_args_is_help=True)
|
||||
|
||||
|
||||
@app.command()
def ingest(
    file: Annotated[str | None, typer.Option(help="Read lines from a file instead of stdin.")] = None,
    batch_size: Annotated[
        int,
        # min=1: a zero or negative "rows per commit" is meaningless, so let
        # Typer reject it up front instead of passing it to the ingest loop.
        typer.Option(min=1, help="Rows per commit; use 1 when tailing a live log."),
    ] = 100,
) -> None:
    """Parse HAProxy log lines from stdin (or a file) and store them in the Richie DB."""
    # NOTE: the docstring above doubles as the command's ``--help`` text, so
    # parameter documentation lives in the option ``help=`` strings instead.
    engine = get_postgres_engine(name="RICHIE")
    with Session(engine) as session:
        # _read_lines is a lazy generator, so lines are pulled from the source
        # only as ingest_lines consumes them (this is what makes tailing work).
        result = ingest_lines(_read_lines(file), session, batch_size=batch_size)
        # One-line summary of the counters reported by ingest_lines.
        typer.echo(f"inserted={result.inserted} duplicates={result.duplicates} skipped={result.skipped}")
|
||||
|
||||
|
||||
def _read_lines(file: str | None) -> Iterable[str]:
|
||||
"""Yield log lines from a file, or from stdin when no file is given."""
|
||||
if file is None:
|
||||
yield from sys.stdin
|
||||
return
|
||||
with Path(file).open(encoding="utf-8", errors="replace") as handle:
|
||||
yield from handle
|
||||
|
||||
|
||||
# Allow running this module directly as a script: hand control to the Typer app.
if __name__ == "__main__":
    app()
|
||||
Reference in New Issue
Block a user