1d1bafbd30
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.
- model: HaproxyRequest mirroring the httplog format, with a unique
line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
- parser: httplog line -> columns, strips the journald prefix and
hashes the normalized line
- ingest: batched, idempotent insert that skips rows whose line_hash
already exists, so re-ingesting the same logs is a no-op
- cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
55 lines
1.7 KiB
Python
55 lines
1.7 KiB
Python
"""Command-line interface: load HAProxy logs into the Richie database.
|
|
|
|
The table schema is managed with Alembic (``database richie upgrade head``); this
command only inserts rows.
|
|
|
|
Examples:
|
|
# stream the live log into the database (commit every line)
|
|
journalctl -u haproxy -o cat -f | haproxy-logs ingest --batch-size 1
|
|
|
|
# backfill from a saved log file
|
|
haproxy-logs ingest --file /tmp/haproxy.log
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Annotated
|
|
|
|
import typer
|
|
from sqlalchemy.orm import Session
|
|
|
|
from python.haproxy_logs.ingest import ingest_lines
|
|
from python.orm.common import get_postgres_engine
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Iterable
|
|
|
|
# Typer application object; commands such as ``ingest`` register themselves on it.
# ``no_args_is_help`` prints the usage text when the program is run without arguments.
app = typer.Typer(
    no_args_is_help=True,
    help="Load HAProxy logs into the Richie database.",
)
|
|
|
|
|
|
@app.command()
def ingest(
    # Path of a log file to read; when omitted, lines come from stdin.
    file: Annotated[str | None, typer.Option(help="Read lines from a file instead of stdin.")] = None,
    # Forwarded to ``ingest_lines`` as ``batch_size``. ``min=1`` makes the CLI
    # reject zero/negative values up front, since "rows per commit" is
    # meaningless below one.
    batch_size: Annotated[
        int,
        typer.Option(min=1, help="Rows per commit; use 1 when tailing a live log."),
    ] = 100,
) -> None:
    """Parse HAProxy log lines from stdin (or a file) and store them in the Richie DB."""
    # NOTE: the docstring above is deliberately kept to one line because Typer
    # displays it as the command's ``--help`` text.
    engine = get_postgres_engine(name="RICHIE")

    # The session is closed by the context manager before the summary is
    # printed, so the counts are only reported after a clean shutdown.
    with Session(engine) as session:
        result = ingest_lines(_read_lines(file), session, batch_size=batch_size)

    typer.echo(f"inserted={result.inserted} duplicates={result.duplicates} skipped={result.skipped}")
|
|
|
|
|
|
def _read_lines(file: str | None) -> Iterable[str]:
|
|
"""Yield log lines from a file, or from stdin when no file is given."""
|
|
if file is None:
|
|
yield from sys.stdin
|
|
return
|
|
with Path(file).open(encoding="utf-8", errors="replace") as handle:
|
|
yield from handle
|
|
|
|
|
|
# Script entry point: run the Typer app when this module is executed directly
# rather than imported.
if __name__ == "__main__":
    app()
|