Files
dotfiles/python/haproxy_logs/cli.py
T
Richie 1d1bafbd30 feat(haproxy-logs): ingest HAProxy request logs into Richie DB
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.

- model: HaproxyRequest mirroring the httplog format, with a unique
  line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
  - parser: httplog line -> columns, strips the journald prefix and
    hashes the normalized line
  - ingest: batched, idempotent insert that skips rows whose line_hash
    already exists, so re-ingesting the same logs is a no-op
  - cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
2026-06-23 21:13:20 -04:00

55 lines
1.7 KiB
Python

"""Command-line interface: load HAProxy logs into the Richie database.
The table schema is managed with alembic (``database richie upgrade head``); this
command only inserts rows.
Examples:
# stream the live log into the database (commit every line)
journalctl -u haproxy -o cat -f | haproxy-logs ingest --batch-size 1
# backfill from a saved log file
haproxy-logs ingest --file /tmp/haproxy.log
"""
from __future__ import annotations
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Annotated
import typer
from sqlalchemy.orm import Session
from python.haproxy_logs.ingest import ingest_lines
from python.orm.common import get_postgres_engine
if TYPE_CHECKING:
from collections.abc import Iterable
# Typer application object; commands attach to it via ``@app.command()``.
# ``no_args_is_help=True`` prints the help text when invoked without arguments.
app = typer.Typer(
    help="Load HAProxy logs into the Richie database.",
    no_args_is_help=True,
)
@app.command()
def ingest(
    file: Annotated[str | None, typer.Option(help="Read lines from a file instead of stdin.")] = None,
    batch_size: Annotated[int, typer.Option(help="Rows per commit; use 1 when tailing a live log.")] = 100,
) -> None:
    """Parse HAProxy log lines from stdin (or a file) and store them in the Richie DB."""
    # NOTE: the docstring above doubles as the command's help text, so it is
    # deliberately left as a single short sentence.

    # Creating the line source is lazy: nothing is read until ingest_lines
    # starts iterating it.
    lines = _read_lines(file)

    # The session is scoped to the ingest call and closed when the block exits.
    with Session(get_postgres_engine(name="RICHIE")) as session:
        result = ingest_lines(lines, session, batch_size=batch_size)
        report = f"inserted={result.inserted} duplicates={result.duplicates} skipped={result.skipped}"

    # One-line summary of what happened to the input lines.
    typer.echo(report)
def _read_lines(file: str | None) -> Iterable[str]:
    """Yield log lines from a file, or from stdin when no file is given.

    Both sources are decoded as UTF-8 with ``errors="replace"`` so that a
    stray undecodable byte becomes U+FFFD instead of raising
    ``UnicodeDecodeError`` and aborting the ingest.

    Args:
        file: Path of the log file to read, or ``None`` to read standard input.

    Yields:
        Each line exactly as read, including its trailing newline when present.

    Raises:
        OSError: If ``file`` is given but cannot be opened.
    """
    if file is None:
        # By default stdin is decoded with the locale encoding and strict error
        # handling, which differs from the file path below and can crash a live
        # tail on one bad byte. Align it with the file path. The guard covers
        # stdin replacements (e.g. io.StringIO) that have no reconfigure();
        # those already hold decoded text. This runs on first iteration, before
        # any data has been read from stdin by this function.
        reconfigure = getattr(sys.stdin, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8", errors="replace")
        yield from sys.stdin
        return

    # The context manager closes the file once the generator is exhausted or closed.
    with Path(file).open(encoding="utf-8", errors="replace") as handle:
        yield from handle
# Run the Typer app when this module is executed directly as a script.
if __name__ == "__main__":
    app()