feat(haproxy-logs): ingest HAProxy request logs into Richie DB
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.
- model: HaproxyRequest mirroring the httplog format, with a unique
line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
- parser: httplog line -> columns, strips the journald prefix and
hashes the normalized line
- ingest: batched, idempotent insert that skips rows whose line_hash
already exists, so re-ingesting the same logs is a no-op
- cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
This commit is contained in:
@@ -0,0 +1 @@
|
||||
"""Load HAProxy ``option httplog`` lines into SQLite and query them."""
|
||||
@@ -0,0 +1,54 @@
|
||||
"""Command-line interface: load HAProxy logs into the Richie database.
|
||||
|
||||
The table schema is managed with alembic (``database richie upgrade head``); this
|
||||
command only inserts rows.
|
||||
|
||||
Examples:
|
||||
# stream the live log into the database (commit every line)
|
||||
journalctl -u haproxy -o cat -f | haproxy-logs ingest --batch-size 1
|
||||
|
||||
# backfill from a saved log file
|
||||
haproxy-logs ingest --file /tmp/haproxy.log
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Annotated
|
||||
|
||||
import typer
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from python.haproxy_logs.ingest import ingest_lines
|
||||
from python.orm.common import get_postgres_engine
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
app = typer.Typer(help="Load HAProxy logs into the Richie database.", no_args_is_help=True)
|
||||
|
||||
|
||||
@app.command()
|
||||
def ingest(
|
||||
file: Annotated[str | None, typer.Option(help="Read lines from a file instead of stdin.")] = None,
|
||||
batch_size: Annotated[int, typer.Option(help="Rows per commit; use 1 when tailing a live log.")] = 100,
|
||||
) -> None:
|
||||
"""Parse HAProxy log lines from stdin (or a file) and store them in the Richie DB."""
|
||||
engine = get_postgres_engine(name="RICHIE")
|
||||
with Session(engine) as session:
|
||||
result = ingest_lines(_read_lines(file), session, batch_size=batch_size)
|
||||
typer.echo(f"inserted={result.inserted} duplicates={result.duplicates} skipped={result.skipped}")
|
||||
|
||||
|
||||
def _read_lines(file: str | None) -> Iterable[str]:
|
||||
"""Yield log lines from a file, or from stdin when no file is given."""
|
||||
if file is None:
|
||||
yield from sys.stdin
|
||||
return
|
||||
with Path(file).open(encoding="utf-8", errors="replace") as handle:
|
||||
yield from handle
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
@@ -0,0 +1,88 @@
|
||||
"""Parse HAProxy log lines and persist them to the database.
|
||||
|
||||
Ingestion is idempotent: each row carries a ``line_hash`` (a unique column), and
|
||||
this module skips lines whose hash already exists, so the same logs can be
|
||||
re-ingested without creating duplicate records.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from python.haproxy_logs.parser import parse_line
|
||||
from python.orm.richie.haproxy import HaproxyRequest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class IngestResult:
|
||||
"""Summary counts for an ingest run."""
|
||||
|
||||
inserted: int
|
||||
skipped: int
|
||||
duplicates: int
|
||||
|
||||
|
||||
def ingest_lines(lines: Iterable[str], session: Session, *, batch_size: int = 100) -> IngestResult:
|
||||
"""Parse log lines and insert new ones into the database in batches.
|
||||
|
||||
Lines already present (same ``line_hash``) are dropped, as are duplicate
|
||||
lines within the same run. Unparseable non-blank lines are counted as
|
||||
``skipped`` rather than raising, so a stray startup message never aborts a
|
||||
streaming ingest.
|
||||
|
||||
Args:
|
||||
lines: Iterable of raw log lines (with or without a journald prefix).
|
||||
session: Database session to insert into.
|
||||
batch_size: Number of distinct rows to buffer before committing. Use
|
||||
``1`` when tailing a live log so rows land immediately.
|
||||
|
||||
Returns:
|
||||
Counts of inserted, skipped (unparseable) and duplicate lines.
|
||||
"""
|
||||
inserted = 0
|
||||
skipped = 0
|
||||
parsed_count = 0
|
||||
batch: dict[str, HaproxyRequest] = {}
|
||||
|
||||
for line in lines:
|
||||
parsed = parse_line(line)
|
||||
if parsed is None:
|
||||
if line.strip():
|
||||
skipped += 1
|
||||
continue
|
||||
parsed_count += 1
|
||||
batch[parsed["line_hash"]] = HaproxyRequest(**parsed)
|
||||
if len(batch) >= batch_size:
|
||||
inserted += _flush(batch, session)
|
||||
|
||||
inserted += _flush(batch, session)
|
||||
return IngestResult(inserted=inserted, skipped=skipped, duplicates=parsed_count - inserted)
|
||||
|
||||
|
||||
def _flush(batch: dict[str, HaproxyRequest], session: Session) -> int:
|
||||
"""Insert the rows whose hash is not already stored, then clear the batch.
|
||||
|
||||
Returns the number of rows actually written.
|
||||
"""
|
||||
if not batch:
|
||||
return 0
|
||||
|
||||
existing = set(
|
||||
session.scalars(select(HaproxyRequest.line_hash).where(HaproxyRequest.line_hash.in_(batch.keys()))),
|
||||
)
|
||||
new_rows = [row for line_hash, row in batch.items() if line_hash not in existing]
|
||||
batch.clear()
|
||||
|
||||
if not new_rows:
|
||||
return 0
|
||||
session.add_all(new_rows)
|
||||
session.commit()
|
||||
return len(new_rows)
|
||||
@@ -0,0 +1,132 @@
|
||||
"""Parse HAProxy ``option httplog`` lines into column mappings.
|
||||
|
||||
The expected format (with the request-header capture this project configures) is::
|
||||
|
||||
<client_ip>:<port> [<accept_date>] <frontend> <backend>/<server>
|
||||
<TR>/<Tw>/<Tc>/<Tr>/<Ta> <status> <bytes> <req_cookie> <resp_cookie>
|
||||
<term_state> <ac>/<fc>/<bc>/<sc>/<rc> <srv_q>/<back_q>
|
||||
{<host>|<user_agent>} "<method> <target> <version>"
|
||||
|
||||
Lines may still carry a systemd-journal prefix (``... haproxy[123]: ``); it is
|
||||
stripped before parsing. Lines that are not request logs (startup messages,
|
||||
health checks, ...) return ``None``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
# Strips an optional journal prefix such as "Jun 22 20:17:46 jeeves haproxy[688739]: ".
|
||||
_JOURNAL_PREFIX = re.compile(r"^.*?haproxy(?:\[\d+\])?:\s+")
|
||||
|
||||
_LOG_LINE = re.compile(
|
||||
r"""
|
||||
^
|
||||
(?P<client_ip>[0-9a-fA-F:.]+):(?P<client_port>\d+)\s+
|
||||
\[(?P<accept_date>[^\]]+)\]\s+
|
||||
(?P<frontend>\S+)\s+
|
||||
(?P<backend>[^/\s]+)/(?P<server>\S+)\s+
|
||||
(?P<time_request>-?\d+)/(?P<time_queue>-?\d+)/(?P<time_connect>-?\d+)/
|
||||
(?P<time_response>-?\d+)/(?P<time_total>-?\d+)\s+
|
||||
(?P<status_code>-?\d+)\s+
|
||||
(?P<bytes_read>\d+)\s+
|
||||
\S+\s+\S+\s+ # captured request/response cookies
|
||||
(?P<termination_state>\S+)\s+
|
||||
(?P<active_connections>\d+)/(?P<frontend_connections>\d+)/
|
||||
(?P<backend_connections>\d+)/(?P<server_connections>\d+)/(?P<retries>\d+)\s+
|
||||
(?P<server_queue>\d+)/(?P<backend_queue>\d+)\s+
|
||||
(?P<captures>(?:\{[^}]*\}\s+)*)
|
||||
"(?P<request>[^"]*)"
|
||||
\s*$
|
||||
""",
|
||||
re.VERBOSE,
|
||||
)
|
||||
|
||||
_ACCEPT_DATE_FORMAT = "%d/%b/%Y:%H:%M:%S.%f"
|
||||
_HEADER_CAPTURE = re.compile(r"\{([^}]*)\}")
|
||||
|
||||
|
||||
def parse_line(line: str) -> dict[str, Any] | None:
|
||||
"""Parse one HAProxy http-log line into a ``HaproxyRequest`` column mapping.
|
||||
|
||||
Args:
|
||||
line: A raw log line, optionally still carrying a journald prefix.
|
||||
|
||||
Returns:
|
||||
A mapping of column name to value, or ``None`` if the line is not a
|
||||
request log (blank lines, startup messages, aborted health checks, ...).
|
||||
"""
|
||||
stripped = _JOURNAL_PREFIX.sub("", line.strip())
|
||||
match = _LOG_LINE.match(stripped)
|
||||
if match is None:
|
||||
return None
|
||||
|
||||
groups = match.groupdict()
|
||||
frontend = groups["frontend"]
|
||||
is_ssl = frontend.endswith("~")
|
||||
host, user_agent = _split_request_headers(groups["captures"])
|
||||
method, target, http_version = _split_request(groups["request"])
|
||||
path, _, query = target.partition("?")
|
||||
|
||||
return {
|
||||
"line_hash": hashlib.sha256(stripped.encode("utf-8")).hexdigest(),
|
||||
"requested_at": _parse_accept_date(groups["accept_date"]),
|
||||
"client_ip": groups["client_ip"],
|
||||
"client_port": int(groups["client_port"]),
|
||||
"frontend": frontend.rstrip("~"),
|
||||
"ssl": is_ssl,
|
||||
"backend": groups["backend"],
|
||||
"server": groups["server"],
|
||||
"time_request": int(groups["time_request"]),
|
||||
"time_queue": int(groups["time_queue"]),
|
||||
"time_connect": int(groups["time_connect"]),
|
||||
"time_response": int(groups["time_response"]),
|
||||
"time_total": int(groups["time_total"]),
|
||||
"status_code": int(groups["status_code"]),
|
||||
"bytes_read": int(groups["bytes_read"]),
|
||||
"termination_state": groups["termination_state"],
|
||||
"active_connections": int(groups["active_connections"]),
|
||||
"frontend_connections": int(groups["frontend_connections"]),
|
||||
"backend_connections": int(groups["backend_connections"]),
|
||||
"server_connections": int(groups["server_connections"]),
|
||||
"retries": int(groups["retries"]),
|
||||
"server_queue": int(groups["server_queue"]),
|
||||
"backend_queue": int(groups["backend_queue"]),
|
||||
"host": host,
|
||||
"user_agent": user_agent,
|
||||
"method": method,
|
||||
"target": target,
|
||||
"path": path,
|
||||
"query": query or None,
|
||||
"http_version": http_version,
|
||||
}
|
||||
|
||||
|
||||
def _parse_accept_date(value: str) -> datetime:
|
||||
"""Parse HAProxy's accept date and attach the host's local timezone."""
|
||||
# HAProxy logs naive local wall-clock time; astimezone() interprets it as the
|
||||
# host's local zone and returns a timezone-aware datetime.
|
||||
return datetime.strptime(value, _ACCEPT_DATE_FORMAT).astimezone()
|
||||
|
||||
|
||||
def _split_request_headers(captures: str) -> tuple[str | None, str | None]:
|
||||
"""Pull the Host and User-Agent out of the first ``{host|user-agent}`` capture."""
|
||||
match = _HEADER_CAPTURE.search(captures)
|
||||
if match is None:
|
||||
return None, None
|
||||
host, _, user_agent = match.group(1).partition("|")
|
||||
return (host or None), (user_agent or None)
|
||||
|
||||
|
||||
def _split_request(request: str) -> tuple[str, str, str]:
|
||||
"""Split a request line into method, target and HTTP version.
|
||||
|
||||
Tolerates malformed values such as ``<BADREQ>`` by returning empty strings
|
||||
for the missing parts.
|
||||
"""
|
||||
method, _, remainder = request.partition(" ")
|
||||
target, _, http_version = remainder.partition(" ")
|
||||
return method, target, http_version
|
||||
Reference in New Issue
Block a user