Files
dotfiles/python/haproxy_logs/parser.py
T
Richie 1d1bafbd30 feat(haproxy-logs): ingest HAProxy request logs into Richie DB
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.

- model: HaproxyRequest mirroring the httplog format, with a unique
  line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
  - parser: httplog line -> columns, strips the journald prefix and
    hashes the normalized line
  - ingest: batched, idempotent insert that skips rows whose line_hash
    already exists, so re-ingesting the same logs is a no-op
  - cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
2026-06-23 21:13:20 -04:00

133 lines
5.0 KiB
Python

"""Parse HAProxy ``option httplog`` lines into column mappings.
The expected format (with the request-header capture this project configures) is::
<client_ip>:<port> [<accept_date>] <frontend> <backend>/<server>
<TR>/<Tw>/<Tc>/<Tr>/<Ta> <status> <bytes> <req_cookie> <resp_cookie>
<term_state> <ac>/<fc>/<bc>/<sc>/<rc> <srv_q>/<back_q>
{<host>|<user_agent>} "<method> <target> <version>"
Lines may still carry a systemd-journal prefix (``... haproxy[123]: ``); it is
stripped before parsing. Lines that are not request logs (startup messages,
health checks, ...) return ``None``.
"""
from __future__ import annotations
import hashlib
import re
from datetime import datetime
from typing import Any
# Strips an optional journal prefix such as "Jun 22 20:17:46 jeeves haproxy[688739]: ".
# The lazy ``.*?`` stops at the first "haproxy:" / "haproxy[<pid>]:" tag in the line.
_JOURNAL_PREFIX = re.compile(r"^.*?haproxy(?:\[\d+\])?:\s+")

# Matches one complete ``option httplog`` request line (no journal prefix).
#
# HAProxy may prefix three of the numeric fields with "+": the total time and
# the byte count when ``option logasap`` is enabled, and the retry count when
# the request was redispatched to another server. Those fields accept an
# optional "+" here so such lines are not silently rejected; ``int()`` parses
# "+3" as 3, so consumers can convert the captured text directly.
_LOG_LINE = re.compile(
    r"""
    ^
    (?P<client_ip>[0-9a-fA-F:.]+):(?P<client_port>\d+)\s+
    \[(?P<accept_date>[^\]]+)\]\s+
    (?P<frontend>\S+)\s+
    (?P<backend>[^/\s]+)/(?P<server>\S+)\s+
    (?P<time_request>-?\d+)/(?P<time_queue>-?\d+)/(?P<time_connect>-?\d+)/
    (?P<time_response>-?\d+)/(?P<time_total>[+-]?\d+)\s+   # Ta may be "+N" (logasap)
    (?P<status_code>-?\d+)\s+
    (?P<bytes_read>\+?\d+)\s+                              # may be "+N" (logasap)
    \S+\s+\S+\s+ # captured request/response cookies
    (?P<termination_state>\S+)\s+
    (?P<active_connections>\d+)/(?P<frontend_connections>\d+)/
    (?P<backend_connections>\d+)/(?P<server_connections>\d+)/
    (?P<retries>\+?\d+)\s+                                 # "+N" after a redispatch
    (?P<server_queue>\d+)/(?P<backend_queue>\d+)\s+
    (?P<captures>(?:\{[^}]*\}\s+)*)
    "(?P<request>[^"]*)"
    \s*$
    """,
    re.VERBOSE,
)

# strptime layout of the bracketed accept date, e.g. "22/Jun/2026:20:17:46.123".
_ACCEPT_DATE_FORMAT = "%d/%b/%Y:%H:%M:%S.%f"

# A single "{...}" header-capture group; group 1 is the text between the braces.
_HEADER_CAPTURE = re.compile(r"\{([^}]*)\}")
def parse_line(line: str) -> dict[str, Any] | None:
    """Parse one HAProxy http-log line into a ``HaproxyRequest`` column mapping.

    The line is first matched as-is; only when that fails is a journald
    prefix removed and the match retried. Stripping unconditionally would
    mangle an unprefixed line whose request data happens to contain text such
    as ``haproxy: `` (for example in the User-Agent), because the prefix
    pattern would then swallow everything up to that point.

    Args:
        line: A raw log line, optionally still carrying a journald prefix.

    Returns:
        A mapping of column name to value, or ``None`` if the line is not a
        request log (blank lines, startup messages, aborted health checks, ...).

    Raises:
        ValueError: If the bracketed accept date does not follow the expected
            HAProxy timestamp layout.
    """
    normalized = line.strip()
    match = _LOG_LINE.match(normalized)
    if match is None:
        # Fall back to removing a leading "... haproxy[pid]: " journal tag.
        normalized = _JOURNAL_PREFIX.sub("", normalized, count=1)
        match = _LOG_LINE.match(normalized)
        if match is None:
            return None

    groups = match.groupdict()

    # A trailing "~" on the frontend name is reported as the ``ssl`` flag and
    # removed from the stored frontend name.
    frontend = groups["frontend"]
    is_ssl = frontend.endswith("~")

    host, user_agent = _split_request_headers(groups["captures"])
    method, target, http_version = _split_request(groups["request"])
    path, _, query = target.partition("?")

    return {
        # Hash of the prefix-free line: the same request hashes identically
        # whether or not it was read with a journal prefix.
        "line_hash": hashlib.sha256(normalized.encode("utf-8")).hexdigest(),
        "requested_at": _parse_accept_date(groups["accept_date"]),
        "client_ip": groups["client_ip"],
        "client_port": int(groups["client_port"]),
        "frontend": frontend.rstrip("~"),
        "ssl": is_ssl,
        "backend": groups["backend"],
        "server": groups["server"],
        "time_request": int(groups["time_request"]),
        "time_queue": int(groups["time_queue"]),
        "time_connect": int(groups["time_connect"]),
        "time_response": int(groups["time_response"]),
        "time_total": int(groups["time_total"]),
        "status_code": int(groups["status_code"]),
        "bytes_read": int(groups["bytes_read"]),
        "termination_state": groups["termination_state"],
        "active_connections": int(groups["active_connections"]),
        "frontend_connections": int(groups["frontend_connections"]),
        "backend_connections": int(groups["backend_connections"]),
        "server_connections": int(groups["server_connections"]),
        "retries": int(groups["retries"]),
        "server_queue": int(groups["server_queue"]),
        "backend_queue": int(groups["backend_queue"]),
        "host": host,
        "user_agent": user_agent,
        "method": method,
        "target": target,
        "path": path,
        # An absent or empty query string is stored as None, not "".
        "query": query or None,
        "http_version": http_version,
    }
def _parse_accept_date(value: str) -> datetime:
    """Turn HAProxy's accept-date text into a timezone-aware ``datetime``.

    The text is read with ``_ACCEPT_DATE_FORMAT`` and carries no zone
    information; it is treated as wall-clock time of the machine running this
    code.
    """
    naive_local = datetime.strptime(value, _ACCEPT_DATE_FORMAT)
    # Calling astimezone() without arguments on a naive datetime presumes the
    # system's local zone and yields an aware value in that zone.
    return naive_local.astimezone()
def _split_request_headers(captures: str) -> tuple[str | None, str | None]:
    """Extract ``(host, user_agent)`` from the first ``{host|user-agent}`` group.

    Only the first brace-delimited capture is examined. Its content is cut at
    the first ``|``; an empty or missing half is reported as ``None``. When no
    capture group is present at all, both values are ``None``.
    """
    first_capture = _HEADER_CAPTURE.search(captures)
    if first_capture is not None:
        raw_host, _sep, raw_agent = first_capture.group(1).partition("|")
        return (raw_host or None), (raw_agent or None)
    return None, None
def _split_request(request: str) -> tuple[str, str, str]:
    """Break a request line into ``(method, target, http_version)``.

    The line is cut at its first two single spaces. Malformed values such as
    ``<BADREQ>`` yield fewer than three pieces; the missing trailing parts are
    returned as empty strings.
    """
    pieces = request.split(" ", 2)
    # Pad to exactly three fields so short request lines still unpack.
    pieces.extend([""] * (3 - len(pieces)))
    verb, uri, protocol = pieces
    return verb, uri, protocol