"""Parse HAProxy ``option httplog`` lines into column mappings. The expected format (with the request-header capture this project configures) is:: : [] / //// //// / {|} " " Lines may still carry a systemd-journal prefix (``... haproxy[123]: ``); it is stripped before parsing. Lines that are not request logs (startup messages, health checks, ...) return ``None``. """ from __future__ import annotations import hashlib import re from datetime import datetime from typing import Any # Strips an optional journal prefix such as "Jun 22 20:17:46 jeeves haproxy[688739]: ". _JOURNAL_PREFIX = re.compile(r"^.*?haproxy(?:\[\d+\])?:\s+") _LOG_LINE = re.compile( r""" ^ (?P[0-9a-fA-F:.]+):(?P\d+)\s+ \[(?P[^\]]+)\]\s+ (?P\S+)\s+ (?P[^/\s]+)/(?P\S+)\s+ (?P-?\d+)/(?P-?\d+)/(?P-?\d+)/ (?P-?\d+)/(?P-?\d+)\s+ (?P-?\d+)\s+ (?P\d+)\s+ \S+\s+\S+\s+ # captured request/response cookies (?P\S+)\s+ (?P\d+)/(?P\d+)/ (?P\d+)/(?P\d+)/(?P\d+)\s+ (?P\d+)/(?P\d+)\s+ (?P(?:\{[^}]*\}\s+)*) "(?P[^"]*)" \s*$ """, re.VERBOSE, ) _ACCEPT_DATE_FORMAT = "%d/%b/%Y:%H:%M:%S.%f" _HEADER_CAPTURE = re.compile(r"\{([^}]*)\}") def parse_line(line: str) -> dict[str, Any] | None: """Parse one HAProxy http-log line into a ``HaproxyRequest`` column mapping. Args: line: A raw log line, optionally still carrying a journald prefix. Returns: A mapping of column name to value, or ``None`` if the line is not a request log (blank lines, startup messages, aborted health checks, ...). """ stripped = _JOURNAL_PREFIX.sub("", line.strip()) match = _LOG_LINE.match(stripped) if match is None: return None groups = match.groupdict() frontend = groups["frontend"] is_ssl = frontend.endswith("~") host, user_agent = _split_request_headers(groups["captures"]) method, target, http_version = _split_request(groups["request"]) path, _, query = target.partition("?") return { "line_hash": hashlib.sha256(stripped.encode("utf-8")).hexdigest(), "requested_at": _parse_accept_date(groups["accept_date"]), "client_ip": groups["client_ip"], "client_port": int(groups["client_port"]), "frontend": frontend.rstrip("~"), "ssl": is_ssl, "backend": groups["backend"], "server": groups["server"], "time_request": int(groups["time_request"]), "time_queue": int(groups["time_queue"]), "time_connect": int(groups["time_connect"]), "time_response": int(groups["time_response"]), "time_total": int(groups["time_total"]), "status_code": int(groups["status_code"]), "bytes_read": int(groups["bytes_read"]), "termination_state": groups["termination_state"], "active_connections": int(groups["active_connections"]), "frontend_connections": int(groups["frontend_connections"]), "backend_connections": int(groups["backend_connections"]), "server_connections": int(groups["server_connections"]), "retries": int(groups["retries"]), "server_queue": int(groups["server_queue"]), "backend_queue": int(groups["backend_queue"]), "host": host, "user_agent": user_agent, "method": method, "target": target, "path": path, "query": query or None, "http_version": http_version, } def _parse_accept_date(value: str) -> datetime: """Parse HAProxy's accept date and attach the host's local timezone.""" # HAProxy logs naive local wall-clock time; astimezone() interprets it as the # host's local zone and returns a timezone-aware datetime. return datetime.strptime(value, _ACCEPT_DATE_FORMAT).astimezone() def _split_request_headers(captures: str) -> tuple[str | None, str | None]: """Pull the Host and User-Agent out of the first ``{host|user-agent}`` capture.""" match = _HEADER_CAPTURE.search(captures) if match is None: return None, None host, _, user_agent = match.group(1).partition("|") return (host or None), (user_agent or None) def _split_request(request: str) -> tuple[str, str, str]: """Split a request line into method, target and HTTP version. Tolerates malformed values such as ```` by returning empty strings for the missing parts. """ method, _, remainder = request.partition(" ") target, _, http_version = remainder.partition(" ") return method, target, http_version