feat(haproxy-logs): ingest HAProxy request logs into Richie DB
Add a pipeline to load HAProxy `option httplog` lines into the Richie
database so bot/crawler traffic can be analyzed.
- model: HaproxyRequest mirroring the httplog format, with a unique
  line_hash dedup key and indexes on common filter columns
- migration: create the haproxy_request table (unique line_hash + indexes)
- haproxy_logs package:
  - parser: httplog line -> columns, strips the journald prefix and
    hashes the normalized line
  - ingest: batched, idempotent insert that skips rows whose line_hash
    already exists, so re-ingesting the same logs is a no-op
  - cli: ingest-only `haproxy-logs` command reading stdin or a file
- tests: parsing of a real GPTBot line and idempotent re-ingestion
This commit is contained in:
@@ -0,0 +1,103 @@
|
||||
"""adding haproxy data.
|
||||
|
||||
Revision ID: 96d72c748c24
|
||||
Revises: c460105682d2
|
||||
Create Date: 2026-06-23 16:37:17.768851
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
|
||||
from python.orm import RichieBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
# Alembic revision identifiers: this revision and the one it follows.
revision: str = "96d72c748c24"
down_revision: str | None = "c460105682d2"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None

# Schema passed to every operation in upgrade()/downgrade(), read from the ORM base.
schema = RichieBase.schema_name
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the ``haproxy_request`` table and its single-column indexes.

    ``id`` is the primary key and ``line_hash`` carries a unique constraint.
    After the table exists, one non-unique index is created for each of the
    columns listed in ``indexed_columns``.
    """
    table_name = "haproxy_request"

    columns = [
        # Dedup key and request timestamp.
        sa.Column("line_hash", sa.String(), nullable=False),
        sa.Column("requested_at", sa.DateTime(timezone=True), nullable=False),
        # Client and routing information.
        sa.Column("client_ip", sa.String(), nullable=False),
        sa.Column("client_port", sa.Integer(), nullable=False),
        sa.Column("frontend", sa.String(), nullable=False),
        sa.Column("ssl", sa.Boolean(), nullable=False),
        sa.Column("backend", sa.String(), nullable=False),
        sa.Column("server", sa.String(), nullable=False),
        # Timing fields.
        sa.Column("time_request", sa.Integer(), nullable=False),
        sa.Column("time_queue", sa.Integer(), nullable=False),
        sa.Column("time_connect", sa.Integer(), nullable=False),
        sa.Column("time_response", sa.Integer(), nullable=False),
        sa.Column("time_total", sa.Integer(), nullable=False),
        # Response outcome.
        sa.Column("status_code", sa.Integer(), nullable=False),
        sa.Column("bytes_read", sa.BigInteger(), nullable=False),
        sa.Column("termination_state", sa.String(), nullable=False),
        # Connection, retry and queue counters.
        sa.Column("active_connections", sa.Integer(), nullable=False),
        sa.Column("frontend_connections", sa.Integer(), nullable=False),
        sa.Column("backend_connections", sa.Integer(), nullable=False),
        sa.Column("server_connections", sa.Integer(), nullable=False),
        sa.Column("retries", sa.Integer(), nullable=False),
        sa.Column("server_queue", sa.Integer(), nullable=False),
        sa.Column("backend_queue", sa.Integer(), nullable=False),
        # Optional request headers.
        sa.Column("host", sa.String(), nullable=True),
        sa.Column("user_agent", sa.String(), nullable=True),
        # Request line components; only the query string is optional.
        sa.Column("method", sa.String(), nullable=False),
        sa.Column("target", sa.String(), nullable=False),
        sa.Column("path", sa.String(), nullable=False),
        sa.Column("query", sa.String(), nullable=True),
        sa.Column("http_version", sa.String(), nullable=False),
        # Surrogate key and row bookkeeping timestamps (defaulted by the database).
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
    ]
    constraints = [
        sa.PrimaryKeyConstraint("id", name=op.f("pk_haproxy_request")),
        sa.UniqueConstraint("line_hash", name=op.f("uq_haproxy_request_line_hash")),
    ]
    op.create_table(table_name, *columns, *constraints, schema=schema)

    # Index names follow the ``ix_haproxy_request_<column>`` pattern.
    indexed_columns = (
        "backend",
        "client_ip",
        "host",
        "path",
        "requested_at",
        "status_code",
        "time_response",
        "user_agent",
    )
    for column_name in indexed_columns:
        op.create_index(
            op.f(f"ix_haproxy_request_{column_name}"),
            table_name,
            [column_name],
            unique=False,
            schema=schema,
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``haproxy_request`` indexes and then the table itself."""
    table_name = "haproxy_request"

    # Indexes are removed in the reverse of the order upgrade() creates them.
    indexed_columns = (
        "user_agent",
        "time_response",
        "status_code",
        "requested_at",
        "path",
        "host",
        "client_ip",
        "backend",
    )
    for column_name in indexed_columns:
        op.drop_index(op.f(f"ix_haproxy_request_{column_name}"), table_name=table_name, schema=schema)

    op.drop_table(table_name, schema=schema)
|
||||
Reference in New Issue
Block a user