"""test_haproxy_logs.""" from __future__ import annotations from typing import TYPE_CHECKING import pytest from sqlalchemy import create_engine, func, select from sqlalchemy.orm import Session from sqlalchemy.pool import StaticPool from python.haproxy_logs.ingest import ingest_lines from python.haproxy_logs.parser import parse_line from python.orm.richie.haproxy import HaproxyRequest if TYPE_CHECKING: from collections.abc import Iterator GPTBOT_LINE = ( "Jun 22 20:17:46 jeeves haproxy[688739]: 74.7.242.30:59644 " "[22/Jun/2026:20:17:46.227] ContentSwitching~ gitea/server 0/0/0/133/148 " "200 292890 - - ---- 7/7/5/5/0 0/0 " "{gitea.tmmworkshop.com|like Gecko; compatible; GPTBot/1.4; +https://openai.com/gptbot)} " '"GET https://gitea.tmmworkshop.com/Richie/dotfiles/src/commit/abc/installer.py?display=source HTTP/2.0"' ) def _line(client_ip: str, path: str, time_response: int, user_agent: str = "curl/8") -> str: return ( f"{client_ip}:50000 [22/Jun/2026:20:17:46.227] ContentSwitching~ gitea/server " f"0/0/0/{time_response}/{time_response} 200 100 - - ---- 1/1/1/1/0 0/0 " f'{{gitea.tmmworkshop.com|{user_agent}}} "GET https://gitea.tmmworkshop.com{path} HTTP/2.0"' ) def test_parse_real_gptbot_line() -> None: """A real GPTBot request line parses into the expected fields.""" parsed = parse_line(GPTBOT_LINE) assert parsed is not None assert parsed["client_ip"] == "74.7.242.30" assert parsed["client_port"] == 59644 assert parsed["frontend"] == "ContentSwitching" assert parsed["ssl"] is True assert parsed["backend"] == "gitea" assert parsed["time_response"] == 133 assert parsed["time_total"] == 148 assert parsed["status_code"] == 200 assert parsed["bytes_read"] == 292890 assert parsed["host"] == "gitea.tmmworkshop.com" assert "GPTBot/1.4" in parsed["user_agent"] assert parsed["method"] == "GET" assert parsed["path"] == "https://gitea.tmmworkshop.com/Richie/dotfiles/src/commit/abc/installer.py" assert parsed["query"] == "display=source" assert parsed["http_version"] == "HTTP/2.0" def test_parse_non_request_line_returns_none() -> None: """A non-request log line is ignored rather than raising.""" assert parse_line("Jun 22 20:00:00 jeeves haproxy[1]: Proxy ContentSwitching started.") is None assert parse_line("") is None @pytest.fixture def session() -> Iterator[Session]: """In-memory SQLite session with just the haproxy_request table created.""" engine = create_engine( "sqlite://", connect_args={"check_same_thread": False}, poolclass=StaticPool, ) HaproxyRequest.__table__.create(bind=engine, checkfirst=True) with Session(engine) as open_session: yield open_session def test_ingest_inserts_and_skips(session: Session) -> None: """Parseable lines are inserted; non-request lines are skipped, not fatal.""" lines = [ _line("10.0.0.1", "/a", 10), _line("10.0.0.2", "/b", 20, user_agent="GPTBot/1.4"), "Jun 22 20:00:00 jeeves haproxy[1]: Proxy ContentSwitching started.", "", ] result = ingest_lines(lines, session) assert result.inserted == 2 assert result.skipped == 1 assert result.duplicates == 0 assert session.scalar(select(func.count()).select_from(HaproxyRequest)) == 2 def test_reingest_is_idempotent(session: Session) -> None: """Re-ingesting the same lines creates no duplicate rows.""" lines = [_line("10.0.0.1", "/a", 10), _line("10.0.0.2", "/b", 20)] first = ingest_lines(lines, session) assert first.inserted == 2 assert first.duplicates == 0 second = ingest_lines(lines, session) assert second.inserted == 0 assert second.duplicates == 2 assert session.scalar(select(func.count()).select_from(HaproxyRequest)) == 2 def test_duplicate_lines_within_one_run_collapse(session: Session) -> None: """Identical lines in a single run are stored once.""" line = _line("10.0.0.1", "/a", 10) result = ingest_lines([line, line, line], session) assert result.inserted == 1 assert result.duplicates == 2 assert session.scalar(select(func.count()).select_from(HaproxyRequest)) == 1 def test_ingest_persists_parsed_fields(session: Session) -> None: """A stored row keeps the fields pulled out by the parser.""" ingest_lines([GPTBOT_LINE], session) stored = session.scalar(select(HaproxyRequest)) assert stored is not None assert stored.client_ip == "74.7.242.30" assert stored.backend == "gitea" assert stored.time_response == 133 assert "GPTBot/1.4" in stored.user_agent