allowing multiple summaries per bill text

Merge pull request 'added bert_topic train.py and infer.py' (#3 ) from feature/added-bert_topic-train.py-and-infer.py into main
Reviewed-on: #3
2026-05-08 18:30:07 -04:00 · 2026-05-02 20:59:08 -04:00 · 2026-05-02 20:58:33 -04:00 · 2026-04-28 23:07:41 -04:00 · 2026-04-28 23:05:56 -04:00 · 2026-04-28 23:02:31 -04:00
20 changed files with 4149 additions and 62 deletions
@@ -0,0 +1,211 @@
+"""move bill text summaries into a child table.
+
+Revision ID: 4b2e1c9d8f70
+Revises: b9360b0b0c22
+Create Date: 2026-05-03 00:00:00.000000
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import sqlalchemy as sa
+
+from alembic import op
+from pipelines.orm import DataScienceDevBase
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+# revision identifiers, used by Alembic.
+revision: str = "4b2e1c9d8f70"
+down_revision: str | None = "b9360b0b0c22"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+schema = DataScienceDevBase.schema_name
+
+
+def upgrade() -> None:
+    """Upgrade."""
+    op.create_table(
+        "bill_text_summary",
+        sa.Column("bill_text_id", sa.Integer(), nullable=False),
+        sa.Column("summary", sa.String(), nullable=False),
+        sa.Column("summarization_model", sa.String(), nullable=True),
+        sa.Column("summarization_user_prompt_version", sa.String(), nullable=True),
+        sa.Column("summarization_system_prompt_version", sa.String(), nullable=True),
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column(
+            "created",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            nullable=False,
+        ),
+        sa.Column(
+            "updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            nullable=False,
+        ),
+        sa.ForeignKeyConstraint(
+            ["bill_text_id"],
+            [f"{schema}.bill_text.id"],
+            name=op.f("fk_bill_text_summary_bill_text_id_bill_text"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_bill_text_summary")),
+        schema=schema,
+    )
+    op.create_index(
+        "ix_bill_text_summary_bill_text_id",
+        "bill_text_summary",
+        ["bill_text_id"],
+        unique=False,
+        schema=schema,
+    )
+    op.create_index(
+        "ix_bill_text_summary_bill_text_id_created",
+        "bill_text_summary",
+        ["bill_text_id", "created"],
+        unique=False,
+        schema=schema,
+    )
+    op.add_column(
+        "bill_text",
+        sa.Column("primary_summary_id", sa.Integer(), nullable=True),
+        schema=schema,
+    )
+    op.create_foreign_key(
+        op.f("fk_bill_text_primary_summary_id_bill_text_summary"),
+        "bill_text",
+        "bill_text_summary",
+        ["primary_summary_id"],
+        ["id"],
+        source_schema=schema,
+        referent_schema=schema,
+        ondelete="SET NULL",
+    )
+
+    op.execute(
+        sa.text(
+            f"""
+            INSERT INTO {schema}.bill_text_summary (
+                bill_text_id,
+                summary,
+                summarization_model,
+                summarization_user_prompt_version,
+                summarization_system_prompt_version,
+                created,
+                updated
+            )
+            SELECT
+                bill_text.id,
+                bill_text.summary,
+                bill_text.summarization_model,
+                bill_text.summarization_user_prompt_version,
+                bill_text.summarization_system_prompt_version,
+                COALESCE(bill_text.updated, bill_text.created, now()),
+                COALESCE(bill_text.updated, bill_text.created, now())
+            FROM {schema}.bill_text
+            WHERE bill_text.summary IS NOT NULL
+              AND btrim(bill_text.summary) <> ''
+            """
+        )
+    )
+
+    op.drop_column("bill_text", "summary", schema=schema)
+    op.drop_column("bill_text", "summarization_model", schema=schema)
+    op.drop_column("bill_text", "summarization_user_prompt_version", schema=schema)
+    op.drop_column("bill_text", "summarization_system_prompt_version", schema=schema)
+
+
+def downgrade() -> None:
+    """Downgrade."""
+    op.add_column(
+        "bill_text",
+        sa.Column("summarization_system_prompt_version", sa.String(), nullable=True),
+        schema=schema,
+    )
+    op.add_column(
+        "bill_text",
+        sa.Column("summarization_user_prompt_version", sa.String(), nullable=True),
+        schema=schema,
+    )
+    op.add_column(
+        "bill_text",
+        sa.Column("summarization_model", sa.String(), nullable=True),
+        schema=schema,
+    )
+    op.add_column(
+        "bill_text",
+        sa.Column("summary", sa.String(), nullable=True),
+        schema=schema,
+    )
+
+    op.execute(
+        sa.text(
+            f"""
+            WITH ranked AS (
+                SELECT
+                    bts.*,
+                    row_number() OVER (
+                        PARTITION BY bts.bill_text_id
+                        ORDER BY bts.created DESC, bts.id DESC
+                    ) AS rn
+                FROM {schema}.bill_text_summary AS bts
+            ),
+            chosen AS (
+                SELECT
+                    bill_text.id AS bill_text_id,
+                    COALESCE(ps.summary, ls.summary) AS summary,
+                    COALESCE(
+                        ps.summarization_model,
+                        ls.summarization_model
+                    ) AS summarization_model,
+                    COALESCE(
+                        ps.summarization_user_prompt_version,
+                        ls.summarization_user_prompt_version
+                    ) AS summarization_user_prompt_version,
+                    COALESCE(
+                        ps.summarization_system_prompt_version,
+                        ls.summarization_system_prompt_version
+                    ) AS summarization_system_prompt_version
+                FROM {schema}.bill_text
+                LEFT JOIN {schema}.bill_text_summary AS ps
+                    ON ps.id = bill_text.primary_summary_id
+                LEFT JOIN ranked AS ls
+                    ON ls.bill_text_id = bill_text.id
+                   AND ls.rn = 1
+            )
+            UPDATE {schema}.bill_text
+            SET
+                summary = chosen.summary,
+                summarization_model = chosen.summarization_model,
+                summarization_user_prompt_version = chosen.summarization_user_prompt_version,
+                summarization_system_prompt_version = chosen.summarization_system_prompt_version
+            FROM chosen
+            WHERE chosen.bill_text_id = bill_text.id
+            """
+        )
+    )
+
+    op.drop_constraint(
+        op.f("fk_bill_text_primary_summary_id_bill_text_summary"),
+        "bill_text",
+        schema=schema,
+        type_="foreignkey",
+    )
+    op.drop_column("bill_text", "primary_summary_id", schema=schema)
+    op.drop_index(
+        "ix_bill_text_summary_bill_text_id_created",
+        table_name="bill_text_summary",
+        schema=schema,
+    )
+    op.drop_index(
+        "ix_bill_text_summary_bill_text_id",
+        table_name="bill_text_summary",
+        schema=schema,
+    )
+    op.drop_table("bill_text_summary", schema=schema)
@@ -1 +1 @@
-"""Prompt benchmarking system for evaluating LLMs via vLLM."""
+"""Init."""
@@ -0,0 +1,116 @@
+"""Nornsight — BERTopic POC Inference Script.
+
+Loads the trained model and labels a small batch of posts,
+writing results to main.post_topic for inspection.
+
+POC: processes a single batch of 1k posts to validate the pipeline end-to-end.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from collections import Counter
+from pathlib import Path
+
+from bertopic import BERTopic
+from sqlalchemy import Engine, func, insert, select
+from sqlalchemy.orm import Session
+
+from pipelines.config import BertTopicInferConfig, get_bertopic_infer_config
+from pipelines.orm.common import get_postgres_engine
+from pipelines.orm.data_science_dev.posts import PostTopic, Posts
+from pipelines.orm.data_science_dev.posts.lang_filters import ENGLISH_LANGS
+from pipelines.pipelines.common import configure_logger
+
+logger = logging.getLogger(__name__)
+
+
+def main() -> None:
+    """Run BERTopic inference against a sample of posts."""
+    configure_logger()
+
+    config = get_bertopic_infer_config()
+    run_inference(config)
+    logger.info(
+        "POC inference complete. Check main.post_topic in DBeaver to inspect results."
+    )
+
+
+def run_inference(config: BertTopicInferConfig) -> None:
+    model_save_path = Path(config.model_save_path)
+
+    logger.info(f"Loading BERTopic model from {model_save_path}")
+    topic_model = BERTopic.load(str(model_save_path))
+
+    topic_info = topic_model.get_topic_info()
+    label_map: dict[int, str] = dict(zip(topic_info["Topic"], topic_info["Name"]))
+    logger.info(f"Model loaded with {len(label_map)} topics")
+
+    engine = get_postgres_engine(name="DATA_SCIENCE_DEV")
+
+    post_ids, texts = get_post_ids_and_test(engine, config)
+
+    logger.info(f"Fetched {len(texts)} posts")
+
+    logger.info("Running BERTopic transform")
+    start = time.perf_counter()
+    topics, _probabilities = topic_model.transform(texts)
+    elapsed = time.perf_counter() - start
+    logger.info(f"Transform complete in {elapsed:.1f}s")
+
+    # Write results to main.post_topic
+    records = [
+        {
+            "post_id": pid,
+            "topic_id": int(topic_id),
+            "topic_label": label_map.get(int(topic_id), "unknown"),
+            "model_version": config.model_version,
+        }
+        for pid, topic_id in zip(post_ids, topics)
+    ]
+    with Session(engine) as session:
+        session.execute(insert(PostTopic), records)
+        session.commit()
+
+    count_topics(records)
+    logger.info(f"Wrote {len(records)} topic labels to main.post_topic")
+
+
+def get_post_ids_and_test(
+    engine: Engine,
+    config: BertTopicInferConfig,
+) -> None | tuple[list[int], list[str]]:
+    with Session(engine) as session:
+        logger.info(f"Fetching {config.poc_batch_size} posts for inference")
+        # Pull a fresh batch for inference — distinct from training sample
+        # using a fixed seed offset so we're not re-labeling training posts
+        stmt = select(Posts).where(
+            Posts.text.is_not(None),
+            Posts.langs.in_(ENGLISH_LANGS),
+            func.length(Posts.text) > config.min_text_length,
+        )
+        if config.poc_batch_size > 0:
+            stmt = stmt.limit(config.poc_batch_size)
+
+        posts = session.scalars(stmt).all()
+        if not posts:
+            logger.warning("No posts were selected for inference")
+            return [], []
+
+        post_ids = [post.post_id for post in posts]
+        texts = [post.text.strip() for post in posts]
+
+    return post_ids, texts
+
+
+def count_topics(records: list[dict]) -> None:
+    topic_counts = Counter(record.get("topic_label", "unknown") for record in records)
+
+    logger.info("Topic distribution in this batch:")
+    for label, count in topic_counts.most_common(10):
+        logger.info("  %s: %d", label, count)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,119 @@
+"""Nornsight — BERTopic POC Training Script.
+
+Pulls a small stratified sample (~11.5k posts) from main.posts,
+trains BERTopic with MiniBatchKMeans on Jeeves, and saves the model locally.
+
+POC sample rate: random() < 0.00005 (~0.005% of 230M = ~11.5k posts)
+Full training rate will be: random() < 0.005 (~1.08M posts)
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from pathlib import Path
+
+from bertopic import BERTopic
+from sklearn.cluster import MiniBatchKMeans
+from sqlalchemy import func, select
+from sqlalchemy.orm import Session
+
+from pipelines.config import BertTopicTrainConfig, get_bertopic_train_config
+from pipelines.orm.common import get_postgres_engine
+from pipelines.orm.data_science_dev.posts import Posts
+from pipelines.orm.data_science_dev.posts.lang_filters import ENGLISH_LANGS
+from pipelines.pipelines.common import configure_logger
+
+logger = logging.getLogger(__name__)
+
+
+def main() -> None:
+    """Train and persist the BERTopic model."""
+    configure_logger()
+
+    config = get_bertopic_train_config()
+    docs = load_sample(config)
+    if not docs:
+        logger.warning("No training documents were selected")
+        return
+
+    train(docs, config)
+    logger.info(f"Done. Model saved as version {config.model_version}")
+    logger.info("Next: run infer.py to label a sample of posts in the database")
+
+
+def load_sample(config: BertTopicTrainConfig) -> list[str]:
+    logger.info("Connecting to PostgreSQL via SQLAlchemy")
+    engine = get_postgres_engine(name="DATA_SCIENCE_DEV")
+
+    logger.info(f"Pulling sample from main.posts (sample_rate={config.sample_rate})")
+    start = time.perf_counter()
+
+    with Session(engine) as session:
+        texts = session.scalars(
+            select(Posts.text).where(
+                Posts.text.is_not(None),
+                Posts.langs.in_(ENGLISH_LANGS),
+                func.length(Posts.text) > config.min_text_length,
+                func.random() < config.sample_rate,
+            )
+        ).all()
+
+    elapsed = time.perf_counter() - start
+    logger.info(f"Fetched {len(texts)} rows in {elapsed:.1f}s")
+
+    # Basic cleaning — strip whitespace and deduplicate
+    docs = list({text.strip() for text in texts})
+    logger.info(f"After cleaning and dedup: {len(docs)} posts")
+
+    return docs
+
+
+def train(docs: list[str], config: BertTopicTrainConfig) -> None:
+    logger.info(
+        f"Initialising BERTopic with MiniBatchKMeans (n_topics={config.n_topics})"
+    )
+
+    cluster_model = MiniBatchKMeans(
+        n_clusters=config.n_topics,
+        random_state=42,
+        batch_size=1024,
+        n_init=3,
+        verbose=1,
+    )
+
+    topic_model = BERTopic(
+        hdbscan_model=cluster_model,
+        language="english",
+        calculate_probabilities=False,  # saves memory
+        verbose=True,
+    )
+
+    logger.info(f"Starting fit_transform on {len(docs)} posts (CPU)")
+    start = time.perf_counter()
+
+    topic_model.fit_transform(docs)
+
+    elapsed = time.perf_counter() - start
+    logger.info(f"Training complete in {elapsed:.1f}s ({elapsed / 60:.1f} min)")
+
+    # Log topic summary for quick inspection
+    topic_info = topic_model.get_topic_info()
+    logger.info(f"Topics found: {len(topic_info)}")
+    logger.info(f"\n{topic_info.to_string()}")
+
+    model_save_path = Path(config.model_save_path)
+    model_save_path.mkdir(parents=True, exist_ok=True)
+    logger.info(f"Saving model to {model_save_path}")
+
+    topic_model.save(
+        str(model_save_path),
+        serialization="safetensors",
+        save_ctfidf=True,
+        save_embedding_model=True,
+    )
+    logger.info("Model saved")
+
+
+if __name__ == "__main__":
+    main()
@@ -2,6 +2,7 @@ from __future__ import annotations

 from dataclasses import dataclass
 from os import getenv
+from datetime import date
 from pathlib import Path
 import tomllib

@@ -50,6 +51,7 @@ class FinetuneConfig:
        )


+@dataclass
 class BenchmarkConfig:
    """Top-level benchmark configuration loaded from TOML."""

@@ -101,6 +103,45 @@ class OpenAIConfig:
        )


+@dataclass
+class BertTopicTrainConfig:
+    """BERTopic training configuration loaded from TOML."""
+
+    sample_rate: float
+    min_text_length: int
+    n_topics: int
+    model_save_path: str
+    model_version: str | None = None
+
+    @classmethod
+    def from_toml(cls, config_path: Path) -> BertTopicTrainConfig:
+        """Load BERTopic training config from a TOML file."""
+        raw = tomllib.loads(config_path.read_text())["bertopic"]["train"]
+
+        today = date.today().isoformat()
+        if raw.get("model_version") is None:
+            raw["model_version"] = (
+                f"{today}-{raw['sample_rate']}-{raw['min_text_length']}-{raw['n_topics']}"
+            )
+        return cls(**raw)
+
+
+@dataclass
+class BertTopicInferConfig:
+    """BERTopic inference configuration loaded from TOML."""
+
+    min_text_length: int
+    poc_batch_size: int
+    model_version: str
+    model_save_path: str
+
+    @classmethod
+    def from_toml(cls, config_path: Path) -> BertTopicInferConfig:
+        """Load BERTopic inference config from a TOML file."""
+        raw = tomllib.loads(config_path.read_text())["bertopic"]["infer"]
+        return cls(**raw)
+
+
 def get_config_dir() -> Path:
    """Get the path to the config directory."""
    return Path(__file__).resolve().parents[2] / "config"
@@ -127,3 +168,19 @@ def get_benchmark_config(config_path: Path | None = None) -> BenchmarkConfig:
    if config_path is None:
        config_path = default_config_path()
    return BenchmarkConfig.from_toml(config_path)
+
+
+def get_bertopic_train_config(
+    config_path: Path | None = None,
+) -> BertTopicTrainConfig:
+    if config_path is None:
+        config_path = default_config_path()
+    return BertTopicTrainConfig.from_toml(config_path)
+
+
+def get_bertopic_infer_config(
+    config_path: Path | None = None,
+) -> BertTopicInferConfig:
+    if config_path is None:
+        config_path = default_config_path()
+    return BertTopicInferConfig.from_toml(config_path)
@@ -23,7 +23,7 @@ from sqlalchemy import (
 )
 from sqlalchemy.orm import Session

-from pipelines.congress_vote_context import create_score_run, finalize_score_run
+from pipelines.jobs.congress_vote_context import create_score_run, finalize_score_run
 from pipelines.orm.common import get_postgres_engine
 from pipelines.orm.data_science_dev.congress import (
    BillTopic,
@@ -39,7 +39,7 @@ from pipelines.orm.data_science_dev.congress import (
    VoteRelationship,
    VoteRecord,
 )
-from pipelines.pipelines.jobs.extract_bill_topics import normalize_topic_label
+from pipelines.jobs.extract_bill_topics import normalize_topic_label
 from pipelines.web.scoring import (
    OPPOSE_POSITIONS,
    SUPPORT_POSITIONS,
@@ -19,6 +19,7 @@ from pipelines.orm.common import get_postgres_engine
 from pipelines.orm.data_science_dev.congress import (
    Bill,
    BillText,
+    BillTextSummary,
    BillTopic,
    BillTopicPosition,
    SubjectType,
@@ -72,11 +73,19 @@ class ExtractedBillTopic:
 def _select_bill_text_for_topic_extraction(bill: Bill) -> BillText | None:
    """Pick one summarized bill_text row from the already-loaded relationship."""
    for bill_text in bill.bill_texts:
-        if bill_text.summary and bill_text.summary.strip():
+        summary_row = bill_text.default_summary()
+        if summary_row and summary_row.summary.strip():
            return bill_text
    return None


+def _bill_text_has_summary_clause() -> ColumnElement[bool]:
+    """Return a correlated EXISTS clause for bill texts with at least one summary."""
+    return exists(
+        select(BillTextSummary.id).where(BillTextSummary.bill_text_id == BillText.id)
+    )
+
+
 def normalize_topic_label(value: str) -> str:
    """Normalize a topic label for storage, comparison, and de-duping."""
    normalized = value.strip().strip("\"'")
@@ -323,11 +332,7 @@ def create_select_bills_for_topic_extraction(
    limit: int | None = None,
 ) -> Select[tuple[Bill]]:
    """Select bill rows that have summarized bill_text rows for topic extraction."""
-    has_summary = (BillText.summary.is_not(None), BillText.summary != "")
-    summarized_text_filters: list[ColumnElement[bool]] = [
-        BillText.bill_id == Bill.id,
-        *has_summary,
-    ]
+    summarized_text_filters: list[ColumnElement[bool]] = [_bill_text_has_summary_clause()]
    if with_votes_only:
        summarized_text_filters.append(
            exists(
@@ -347,11 +352,17 @@ def create_select_bills_for_topic_extraction(
                )
            )
        )
-    summarized_text_exists = exists(select(BillText.id).where(*summarized_text_filters))
+    summarized_text_exists = exists(
+        select(BillText.id).where(BillText.bill_id == Bill.id, *summarized_text_filters)
+    )
+    bill_text_loader = selectinload(Bill.bill_texts.and_(*summarized_text_filters))
    stmt = (
        select(Bill)
        .where(summarized_text_exists)
-        .options(selectinload(Bill.bill_texts.and_(*summarized_text_filters[1:])))
+        .options(
+            bill_text_loader.selectinload(BillText.summaries),
+            bill_text_loader.selectinload(BillText.primary_summary),
+        )
        .order_by(Bill.id)
    )
    if congress is not None:
@@ -363,7 +374,7 @@ def create_select_bills_for_topic_extraction(
            select(BillText.id).where(
                BillText.bill_id == Bill.id,
                BillText.id.in_(bill_text_ids),
-                *summarized_text_filters[1:],
+                *summarized_text_filters,
            )
        )
        stmt = stmt.where(selected_text_exists)
@@ -416,8 +427,7 @@ def collect_topic_extraction_diagnostics(
            )
        )

-    has_summary = (BillText.summary.is_not(None), BillText.summary != "")
-    summary_filters = [*bill_text_filters, *has_summary]
+    summary_filters = [*bill_text_filters, _bill_text_has_summary_clause()]

    bills_with_summaries = session.scalar(
        select(func.count(func.distinct(Bill.id)))
@@ -607,7 +617,11 @@ def main(
            if bill_text is None:
                logger.warning("Skipping bill id=%s: no usable summary", bill.id)
                continue
-            summary = bill_text.summary.strip()
+            summary_row = bill_text.default_summary()
+            if summary_row is None:
+                logger.warning("Skipping bill id=%s: no default summary", bill.id)
+                continue
+            summary = summary_row.summary.strip()

            try:
                extracted_topics = extract_topics_for_bill_text(
@@ -0,0 +1,281 @@
+"""Ingestion pipeline for loading JSONL post files into the weekly-partitioned posts table.
+
+Usage:
+    ingest-posts /path/to/files/
+    ingest-posts /path/to/single_file.jsonl
+    ingest-posts /data/dir/ --workers 4 --batch-size 5000
+"""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime
+from pathlib import Path  # noqa: TC003 this is needed for typer
+from typing import TYPE_CHECKING, Annotated
+
+import orjson
+import psycopg
+import typer
+
+from pipelines.pipelines.common import configure_logger
+from pipelines.orm.common import get_connection_info
+from pipelines.pipelines.parallelize import parallelize_process
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+logger = logging.getLogger(__name__)
+
+
+app = typer.Typer(help="Ingest JSONL post files into the partitioned posts table.")
+
+
+@app.command()
+def main(
+    path: Annotated[
+        Path,
+        typer.Argument(help="Directory containing JSONL files, or a single JSONL file"),
+    ],
+    batch_size: Annotated[int, typer.Option(help="Rows per INSERT batch")] = 10000,
+    workers: Annotated[
+        int, typer.Option(help="Parallel workers for multi-file ingestion")
+    ] = 4,
+    pattern: Annotated[
+        str, typer.Option(help="Glob pattern for JSONL files")
+    ] = "*.jsonl",
+) -> None:
+    """Ingest JSONL post files into the weekly-partitioned posts table."""
+    configure_logger(level="INFO")
+
+    logger.info("starting ingest-posts")
+    logger.info(
+        "path=%s batch_size=%d workers=%d pattern=%s",
+        path,
+        batch_size,
+        workers,
+        pattern,
+    )
+    if path.is_file():
+        ingest_file(path, batch_size=batch_size)
+    elif path.is_dir():
+        ingest_directory(
+            path, batch_size=batch_size, max_workers=workers, pattern=pattern
+        )
+    else:
+        typer.echo(f"Path does not exist: {path}", err=True)
+        raise typer.Exit(code=1)
+
+    logger.info("ingest-posts done")
+
+
+def ingest_directory(
+    directory: Path,
+    *,
+    batch_size: int,
+    max_workers: int,
+    pattern: str = "*.jsonl",
+) -> None:
+    """Ingest all JSONL files in a directory using parallel workers."""
+    files = sorted(directory.glob(pattern))
+    if not files:
+        logger.warning("No JSONL files found in %s", directory)
+        return
+
+    logger.info("Found %d JSONL files to ingest", len(files))
+
+    kwargs_list = [{"path": fp, "batch_size": batch_size} for fp in files]
+    parallelize_process(ingest_file, kwargs_list, max_workers=max_workers)
+
+
+SCHEMA = "main"
+
+COLUMNS = (
+    "post_id",
+    "user_id",
+    "instance",
+    "date",
+    "text",
+    "langs",
+    "like_count",
+    "reply_count",
+    "repost_count",
+    "reply_to",
+    "replied_author",
+    "thread_root",
+    "thread_root_author",
+    "repost_from",
+    "reposted_author",
+    "quotes",
+    "quoted_author",
+    "labels",
+    "sent_label",
+    "sent_score",
+)
+
+INSERT_FROM_STAGING = f"""
+    INSERT INTO {SCHEMA}.posts ({", ".join(COLUMNS)})
+    SELECT {", ".join(COLUMNS)} FROM pg_temp.staging
+    ON CONFLICT (post_id, date) DO NOTHING
+"""  # noqa: S608
+
+FAILED_INSERT = f"""
+    INSERT INTO {SCHEMA}.failed_ingestion (raw_line, error)
+    VALUES (%(raw_line)s, %(error)s)
+"""  # noqa: S608
+
+
+def get_psycopg_connection() -> psycopg.Connection:
+    """Create a raw psycopg3 connection from environment variables."""
+    database, host, port, username, password = get_connection_info("DATA_SCIENCE_DEV")
+    return psycopg.connect(
+        dbname=database,
+        host=host,
+        port=int(port),
+        user=username,
+        password=password,
+        autocommit=False,
+    )
+
+
+def ingest_file(path: Path, *, batch_size: int) -> None:
+    """Ingest a single JSONL file into the posts table."""
+    log_trigger = max(100_000 // batch_size, 1)
+    failed_lines: list[dict] = []
+    try:
+        with get_psycopg_connection() as connection:
+            for index, batch in enumerate(
+                read_jsonl_batches(path, batch_size, failed_lines), 1
+            ):
+                ingest_batch(connection, batch)
+                if index % log_trigger == 0:
+                    logger.info(
+                        "Ingested %d batches (%d rows) from %s",
+                        index,
+                        index * batch_size,
+                        path,
+                    )
+
+            if failed_lines:
+                logger.warning(
+                    "Recording %d malformed lines from %s", len(failed_lines), path.name
+                )
+                with connection.cursor() as cursor:
+                    cursor.executemany(FAILED_INSERT, failed_lines)
+                connection.commit()
+    except Exception:
+        logger.exception("Failed to ingest file: %s", path)
+        raise
+
+
+def ingest_batch(connection: psycopg.Connection, batch: list[dict]) -> None:
+    """COPY batch into a temp staging table, then INSERT ... ON CONFLICT into posts."""
+    if not batch:
+        return
+
+    try:
+        with connection.cursor() as cursor:
+            cursor.execute(f"""
+                CREATE TEMP TABLE IF NOT EXISTS staging
+                (LIKE {SCHEMA}.posts INCLUDING DEFAULTS)
+                ON COMMIT DELETE ROWS
+            """)
+            cursor.execute("TRUNCATE pg_temp.staging")
+
+            with cursor.copy(
+                f"COPY pg_temp.staging ({', '.join(COLUMNS)}) FROM STDIN"
+            ) as copy:
+                for row in batch:
+                    copy.write_row(tuple(row.get(column) for column in COLUMNS))
+
+            cursor.execute(INSERT_FROM_STAGING)
+        connection.commit()
+    except Exception as error:
+        connection.rollback()
+
+        if len(batch) == 1:
+            logger.exception("Skipping bad row post_id=%s", batch[0].get("post_id"))
+            with connection.cursor() as cursor:
+                cursor.execute(
+                    FAILED_INSERT,
+                    {
+                        "raw_line": orjson.dumps(batch[0], default=str).decode(),
+                        "error": str(error),
+                    },
+                )
+            connection.commit()
+            return
+
+        midpoint = len(batch) // 2
+        ingest_batch(connection, batch[:midpoint])
+        ingest_batch(connection, batch[midpoint:])
+
+
+def read_jsonl_batches(
+    file_path: Path, batch_size: int, failed_lines: list[dict]
+) -> Iterator[list[dict]]:
+    """Stream a JSONL file and yield batches of transformed rows."""
+    batch: list[dict] = []
+    with file_path.open("r", encoding="utf-8") as handle:
+        for raw_line in handle:
+            line = raw_line.strip()
+            if not line:
+                continue
+            batch.extend(parse_line(line, file_path, failed_lines))
+            if len(batch) >= batch_size:
+                yield batch
+                batch = []
+    if batch:
+        yield batch
+
+
+def parse_line(line: str, file_path: Path, failed_lines: list[dict]) -> Iterator[dict]:
+    """Parse a JSONL line, handling concatenated JSON objects."""
+    try:
+        yield transform_row(orjson.loads(line))
+    except orjson.JSONDecodeError:
+        if "}{" not in line:
+            logger.warning(
+                "Skipping malformed line in %s: %s", file_path.name, line[:120]
+            )
+            failed_lines.append({"raw_line": line, "error": "malformed JSON"})
+            return
+        fragments = line.replace("}{", "}\n{").split("\n")
+        for fragment in fragments:
+            try:
+                yield transform_row(orjson.loads(fragment))
+            except (orjson.JSONDecodeError, KeyError, ValueError) as error:
+                logger.warning(
+                    "Skipping malformed fragment in %s: %s",
+                    file_path.name,
+                    fragment[:120],
+                )
+                failed_lines.append({"raw_line": fragment, "error": str(error)})
+    except Exception as error:
+        logger.exception("Skipping bad row in %s: %s", file_path.name, line[:120])
+        failed_lines.append({"raw_line": line, "error": str(error)})
+
+
+def transform_row(raw: dict) -> dict:
+    """Transform a raw JSONL row into a dict matching the Posts table columns."""
+    raw["date"] = parse_date(raw["date"])
+    if raw.get("langs") is not None:
+        raw["langs"] = orjson.dumps(raw["langs"])
+    if raw.get("text") is not None:
+        raw["text"] = raw["text"].replace("\x00", "")
+    return raw
+
+
+def parse_date(raw_date: int) -> datetime:
+    """Parse compact YYYYMMDDHHmm integer into a naive datetime (input is UTC by spec)."""
+    return datetime(
+        raw_date // 100000000,
+        (raw_date // 1000000) % 100,
+        (raw_date // 10000) % 100,
+        (raw_date // 100) % 100,
+        raw_date % 100,
+        tzinfo=UTC,
+    )
+
+
+if __name__ == "__main__":
+    app()
@@ -9,7 +9,7 @@ from typing import Annotated, Any

 import httpx
 import typer
-from sqlalchemy import Select, exists, or_, select
+from sqlalchemy import Select, exists, select
 from sqlalchemy.orm import Session, selectinload

 from tiktoken import get_encoding
@@ -20,6 +20,7 @@ from pipelines.orm.common import get_postgres_engine
 from pipelines.orm.data_science_dev.congress import (
    Bill,
    BillText,
+    BillTextSummary,
    SubjectType,
    VoteClassification,
    VoteRelationship,
@@ -112,7 +113,7 @@ def summarize_bill_text(
    model: str,
    bill_text: BillText,
    summarization_prompts: dict[str, str],
-) -> str:
+) -> str | None:
    """Generate and return a summary for one bill_text row."""
    messages, user_prompt_tokens = build_bill_summary_messages(
        bill_text=bill_text,
@@ -136,15 +137,21 @@ def summarize_bill_text(

 def store_bill_summary_result(
    *,
+    session: Session,
    bill_text: BillText,
    summary: str,
    model: str,
-) -> None:
+) -> BillTextSummary:
    """Store a generated summary and the prompt/model metadata that produced it."""
-    bill_text.summary = summary
-    bill_text.summarization_model = model
-    bill_text.summarization_system_prompt_version = "v1.2"
-    bill_text.summarization_user_prompt_version = "v1"
+    summary_row = BillTextSummary(
+        bill_text=bill_text,
+        summary=summary,
+        summarization_model=model,
+        summarization_system_prompt_version="v1.2",
+        summarization_user_prompt_version="v1",
+    )
+    session.add(summary_row)
+    return summary_row


 def create_select_bill_texts_for_summarization(
@@ -154,7 +161,7 @@ def create_select_bill_texts_for_summarization(
    with_votes_only: bool = False,
    force: bool = False,
    limit: int | None = None,
-) -> Select:
+) -> Select[tuple[BillText]]:
    """Select bill_text rows that have source text and need summaries."""
    stmt = (
        select(BillText)
@@ -189,7 +196,13 @@ def create_select_bill_texts_for_summarization(
            )
        )
    if not force:
-        stmt = stmt.where(or_(BillText.summary.is_(None), BillText.summary == ""))
+        stmt = stmt.where(
+            ~exists(
+                select(BillTextSummary.id).where(
+                    BillTextSummary.bill_text_id == BillText.id
+                )
+            )
+        )
    if limit is not None:
        stmt = stmt.limit(limit)
    return stmt
@@ -287,6 +300,7 @@ def main(
                logger.warning("Skipping bill_text id=%s", bill_text.id)
                continue
            store_bill_summary_result(
+                session=session,
                bill_text=bill_text,
                summary=summary,
                model=model,
@@ -6,6 +6,7 @@ from pipelines.orm.data_science_dev.congress.bill import (
    BillActionRecordedVote,
    BillRelation,
    BillText,
+    BillTextSummary,
    BillTopic,
    BillTopicPosition,
 )
@@ -54,6 +55,7 @@ __all__ = [
    "BillActionRecordedVote",
    "BillRelation",
    "BillText",
+    "BillTextSummary",
    "BillTopic",
    "BillTopicPosition",
    "ClassificationMethod",
@@ -105,13 +105,12 @@ class BillText(DataScienceDevTableBase):
    )

    bill_id: Mapped[int] = mapped_column(ForeignKey("main.bill.id", ondelete="CASCADE"))
+    primary_summary_id: Mapped[int | None] = mapped_column(
+        ForeignKey("main.bill_text_summary.id", ondelete="SET NULL")
+    )
    version_code: Mapped[str]
    version_name: Mapped[str | None]
    text_content: Mapped[str | None]
-    summary: Mapped[str | None]
-    summarization_model: Mapped[str | None]
-    summarization_user_prompt_version: Mapped[str | None]
-    summarization_system_prompt_version: Mapped[str | None]
    date: Mapped[date | None]
    source_datetime_raw: Mapped[str | None]
    text_url_xml: Mapped[str | None]
@@ -122,6 +121,57 @@ class BillText(DataScienceDevTableBase):
    )

    bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts")
+    summaries: Mapped[list[BillTextSummary]] = relationship(
+        "BillTextSummary",
+        back_populates="bill_text",
+        cascade="all, delete-orphan",
+        foreign_keys="BillTextSummary.bill_text_id",
+        order_by=lambda: (
+            BillTextSummary.created.desc(),
+            BillTextSummary.id.desc(),
+        ),
+    )
+    primary_summary: Mapped[BillTextSummary | None] = relationship(
+        "BillTextSummary",
+        foreign_keys=[primary_summary_id],
+        post_update=True,
+    )
+
+    def latest_summary(self) -> BillTextSummary | None:
+        """Return the newest summary row for this bill text."""
+        return self.summaries[0] if self.summaries else None
+
+    def default_summary(self) -> BillTextSummary | None:
+        """Return the primary summary when set, otherwise the newest summary."""
+        return self.primary_summary or self.latest_summary()
+
+
+class BillTextSummary(DataScienceDevTableBase):
+    """Stores one generated summary for a bill text version."""
+
+    __tablename__ = "bill_text_summary"
+    __table_args__ = (
+        Index("ix_bill_text_summary_bill_text_id", "bill_text_id"),
+        Index(
+            "ix_bill_text_summary_bill_text_id_created",
+            "bill_text_id",
+            "created",
+        ),
+    )
+
+    bill_text_id: Mapped[int] = mapped_column(
+        ForeignKey("main.bill_text.id", ondelete="CASCADE")
+    )
+    summary: Mapped[str]
+    summarization_model: Mapped[str | None]
+    summarization_user_prompt_version: Mapped[str | None]
+    summarization_system_prompt_version: Mapped[str | None]
+
+    bill_text: Mapped[BillText] = relationship(
+        "BillText",
+        back_populates="summaries",
+        foreign_keys=[bill_text_id],
+    )


 class BillAction(DataScienceDevTableBase):
@@ -11,6 +11,7 @@ from pipelines.orm.data_science_dev.congress import (
    BillActionRecordedVote,
    BillRelation,
    BillText,
+    BillTextSummary,
    BillTopic,
    BillTopicPosition,
    ClassificationMethod,
@@ -51,6 +52,7 @@ __all__ = [
    "BillActionRecordedVote",
    "BillRelation",
    "BillText",
+    "BillTextSummary",
    "BillTopic",
    "BillTopicPosition",
    "ClassificationMethod",
@@ -1,34 +0,0 @@
-SUMMARIZATION_SYSTEM_PROMPT = """You are a legislative analyst extracting policy substance from Congressional bill text.
-
-Your job is to compress a bill into a dense, neutral structured summary that captures every distinct policy action — including secondary effects that might be buried in subsections.
-
-EXTRACTION RULES:
- IGNORE: whereas clauses, congressional findings that are purely political statements, recitals, preambles, citations of existing law by number alone, and procedural boilerplate.
- FOCUS ON: operative verbs — what the bill SHALL do, PROHIBIT, REQUIRE, AUTHORIZE, AMEND, APPROPRIATE, or ESTABLISH.
- SURFACE ALL THREADS: If the bill touches multiple policy areas, list each thread separately. Do not collapse them.
- BE CONCRETE: Name the affected population, the mechanism, and the direction (expands/restricts/maintains).
- STAY NEUTRAL: No political framing. Describe what the text does, not what its sponsors claim it does.
-
-OUTPUT FORMAT — plain structured text, not JSON:
-
-OPERATIVE ACTIONS:
-[Numbered list of what the bill actually does, one action per line, max 20 words each]
-
-AFFECTED POPULATIONS:
-[Who gains something, who loses something, or whose behavior is regulated]
-
-MECHANISMS:
-[How it works: new funding, mandate, prohibition, amendment to existing statute, grant program, study commission, etc.]
-
-POLICY THREADS:
-[List each distinct policy domain this bill touches, even minor ones. Use plain language, not domain codes.]
-
-SYMBOLIC/PROCEDURAL ONLY:
-[Yes or No — is this bill primarily a resolution, designation, or awareness declaration with no operative effect?]
-
-LENGTH TARGET: 150-250 words total. Be ruthless about cutting. Density over completeness."""
-
-SUMMARIZATION_USER_TEMPLATE = """Summarize the following Congressional bill according to your instructions.
-
-BILL TEXT:
-{text_content}"""
@@ -0,0 +1,22 @@
+[project]
+name = "ds-testing-pipelines"
+version = "0.1.0"
+description = "Data science pipeline tools and legislative dashboard."
+requires-python = ">=3.12"
+dependencies = [
+    "fastapi",
+    "httpx",
+    "uvicorn[standard]",
+    "jinja2",
+    "sqlalchemy",
+    "psycopg",
+]
+
+[project.optional-dependencies]
+test = [
+    "pytest",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["."]
@@ -0,0 +1,36 @@
+from pipelines.orm.data_science_dev.congress import BillText, BillTextSummary
+
+
+def test_default_summary_prefers_primary_summary() -> None:
+    primary_summary = BillTextSummary(id=1, bill_text_id=10, summary="primary")
+    latest_summary = BillTextSummary(id=2, bill_text_id=10, summary="latest")
+    bill_text = BillText(
+        id=10,
+        bill_id=5,
+        version_code="ih",
+        summaries=[latest_summary],
+        primary_summary=primary_summary,
+    )
+
+    assert bill_text.default_summary() is primary_summary
+
+
+def test_default_summary_falls_back_to_latest_summary() -> None:
+    latest_summary = BillTextSummary(id=2, bill_text_id=10, summary="latest")
+    older_summary = BillTextSummary(id=1, bill_text_id=10, summary="older")
+    bill_text = BillText(
+        id=10,
+        bill_id=5,
+        version_code="ih",
+        summaries=[latest_summary, older_summary],
+    )
+
+    assert bill_text.latest_summary() is latest_summary
+    assert bill_text.default_summary() is latest_summary
+
+
+def test_default_summary_is_none_without_summaries() -> None:
+    bill_text = BillText(id=10, bill_id=5, version_code="ih")
+
+    assert bill_text.latest_summary() is None
+    assert bill_text.default_summary() is None
@@ -0,0 +1,71 @@
+from sqlalchemy.dialects import postgresql
+
+from pipelines.jobs.extract_bill_topics import (
+    _select_bill_text_for_topic_extraction,
+    create_select_bills_for_topic_extraction,
+)
+from pipelines.orm.data_science_dev.congress import Bill, BillText, BillTextSummary
+
+
+def _compile_sql(statement: object) -> str:
+    return str(
+        statement.compile(
+            dialect=postgresql.dialect(),
+            compile_kwargs={"literal_binds": True},
+        )
+    )
+
+
+def test_select_bill_text_for_topic_extraction_uses_primary_summary() -> None:
+    primary_summary = BillTextSummary(id=1, bill_text_id=10, summary="primary")
+    newest_summary = BillTextSummary(id=2, bill_text_id=10, summary="newest")
+    bill_text = BillText(
+        id=10,
+        bill_id=5,
+        version_code="ih",
+        summaries=[newest_summary],
+        primary_summary=primary_summary,
+    )
+    bill = Bill(
+        id=5,
+        congress=119,
+        bill_type="hr",
+        number=1,
+        bill_texts=[bill_text],
+    )
+
+    selected = _select_bill_text_for_topic_extraction(bill)
+
+    assert selected is bill_text
+    assert selected.default_summary() is primary_summary
+
+
+def test_select_bill_text_for_topic_extraction_uses_latest_summary_without_primary() -> None:
+    newest_summary = BillTextSummary(id=2, bill_text_id=10, summary="newest")
+    older_summary = BillTextSummary(id=1, bill_text_id=10, summary="older")
+    bill_text = BillText(
+        id=10,
+        bill_id=5,
+        version_code="ih",
+        summaries=[newest_summary, older_summary],
+    )
+    bill = Bill(
+        id=5,
+        congress=119,
+        bill_type="hr",
+        number=1,
+        bill_texts=[bill_text],
+    )
+
+    selected = _select_bill_text_for_topic_extraction(bill)
+
+    assert selected is bill_text
+    assert selected.default_summary() is newest_summary
+
+
+def test_create_select_bills_for_topic_extraction_uses_summary_exists_subquery() -> None:
+    sql = _compile_sql(create_select_bills_for_topic_extraction())
+
+    assert "bill_text_summary" in sql
+    assert "EXISTS" in sql
+    assert "bill_text.summary" not in sql
@@ -0,0 +1,58 @@
+from sqlalchemy.dialects import postgresql
+
+from pipelines.jobs.summarize_bills import (
+    create_select_bill_texts_for_summarization,
+    store_bill_summary_result,
+)
+from pipelines.orm.data_science_dev.congress import BillText, BillTextSummary
+
+
+class FakeSession:
+    def __init__(self) -> None:
+        self.added: list[object] = []
+
+    def add(self, value: object) -> None:
+        self.added.append(value)
+
+
+def _compile_sql(statement: object) -> str:
+    return str(
+        statement.compile(
+            dialect=postgresql.dialect(),
+            compile_kwargs={"literal_binds": True},
+        )
+    )
+
+
+def test_store_bill_summary_result_creates_summary_row() -> None:
+    session = FakeSession()
+    bill_text = BillText(id=10, bill_id=5, version_code="ih")
+
+    summary_row = store_bill_summary_result(
+        session=session,
+        bill_text=bill_text,
+        summary="A summary",
+        model="gpt-5.4-mini",
+    )
+
+    assert session.added == [summary_row]
+    assert isinstance(summary_row, BillTextSummary)
+    assert summary_row.bill_text is bill_text
+    assert summary_row.summary == "A summary"
+    assert summary_row.summarization_model == "gpt-5.4-mini"
+    assert summary_row.summarization_system_prompt_version == "v1.2"
+    assert summary_row.summarization_user_prompt_version == "v1"
+
+
+def test_create_select_bill_texts_for_summarization_excludes_existing_summaries() -> None:
+    sql = _compile_sql(create_select_bill_texts_for_summarization(force=False))
+
+    assert "bill_text_summary" in sql
+    assert "NOT (EXISTS" in sql or "NOT EXISTS" in sql
+    assert "bill_text.summary" not in sql
+
+
+def test_create_select_bill_texts_for_summarization_force_skips_summary_filter() -> None:
+    sql = _compile_sql(create_select_bill_texts_for_summarization(force=True))
+
+    assert "bill_text_summary" not in sql
Author	SHA1	Message	Date
Richie	d3fe6dba56	allowing multiple summaries per bill text	2026-05-08 18:30:07 -04:00
Richie	de9e59b5f4	Merge pull request 'added bert_topic train.py and infer.py' (#3 ) from feature/added-bert_topic-train.py-and-infer.py into main Reviewed-on: #3	2026-05-02 20:59:08 -04:00
Richie	2034a760c9	adding missig @dataclass to config.py	2026-05-02 20:58:33 -04:00
Richie	45bdd7b629	added bert_topic train.py and infer.py	2026-04-28 23:07:41 -04:00
Richie	b5f2df6ae5	Merge pull request 'feature/finishing-migration-of-work' (#9 ) from feature/finishing-migration-of-work into main Reviewed-on: #9	2026-04-28 23:05:56 -04:00
Richie	21448eb515	updated __init__.py	2026-04-28 23:02:31 -04:00
Richie	28993213af	fixed pyproject.toml	2026-04-28 23:02:18 -04:00
Richie	d4c587362d	remoed old prompts	2026-04-28 23:01:54 -04:00
Richie	d0e865ffbd	added congress_vote_context.py ingest_congress.py ingest_posts.py to jobs dir	2026-04-28 23:01:38 -04:00
Richie	297d9ce89b	Merge pull request 'adding website' (#8 ) from feature/adding-website into main Reviewed-on: #8	2026-04-28 22:51:27 -04:00