From 4b768049c06dbfbe323e3496799ebb1436ac3713 Mon Sep 17 00:00:00 2001 From: Richie Cahill Date: Tue, 21 Apr 2026 21:42:13 -0400 Subject: [PATCH] added summarization metadata to the DB --- ...ext_summarization_metadata_7d15f9b7c8a2.py | 55 +++++++++++++++++++ .../orm/data_science_dev/congress/bill.py | 3 + .../data_science_dev/congress/legislator.py | 6 +- pipelines/tools/summarization_prompts.py | 34 ------------ 4 files changed, 61 insertions(+), 37 deletions(-) create mode 100644 alembic/data_science_dev/versions/2026_04_22-add_bill_text_summarization_metadata_7d15f9b7c8a2.py delete mode 100644 pipelines/tools/summarization_prompts.py diff --git a/alembic/data_science_dev/versions/2026_04_22-add_bill_text_summarization_metadata_7d15f9b7c8a2.py b/alembic/data_science_dev/versions/2026_04_22-add_bill_text_summarization_metadata_7d15f9b7c8a2.py new file mode 100644 index 0000000..30141d2 --- /dev/null +++ b/alembic/data_science_dev/versions/2026_04_22-add_bill_text_summarization_metadata_7d15f9b7c8a2.py @@ -0,0 +1,55 @@ +"""add bill_text summarization metadata. + +Revision ID: 7d15f9b7c8a2 +Revises: ef4bc5411176 +Create Date: 2026-04-22 00:00:00.000000 + +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import sqlalchemy as sa + +from alembic import op +from pipelines.orm import DataScienceDevBase + +if TYPE_CHECKING: + from collections.abc import Sequence + +# revision identifiers, used by Alembic. 
+revision: str = "7d15f9b7c8a2" +down_revision: str | None = "ef4bc5411176" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +schema = DataScienceDevBase.schema_name + + +def upgrade() -> None: + """Add the nullable summarization metadata columns to bill_text.""" + op.add_column( + "bill_text", + sa.Column("summarization_model", sa.String(), nullable=True), + schema=schema, + ) + op.add_column( + "bill_text", + sa.Column("summarization_user_prompt_version", sa.String(), nullable=True), + schema=schema, + ) + op.add_column( + "bill_text", + sa.Column("summarization_system_prompt_version", sa.String(), nullable=True), + schema=schema, + ) + + +def downgrade() -> None: + """Drop the summarization metadata columns from bill_text.""" + op.drop_column( + "bill_text", "summarization_system_prompt_version", schema=schema + ) + op.drop_column("bill_text", "summarization_user_prompt_version", schema=schema) + op.drop_column("bill_text", "summarization_model", schema=schema) diff --git a/pipelines/orm/data_science_dev/congress/bill.py b/pipelines/orm/data_science_dev/congress/bill.py index e9a11f0..0ace4bf 100644 --- a/pipelines/orm/data_science_dev/congress/bill.py +++ b/pipelines/orm/data_science_dev/congress/bill.py @@ -86,6 +86,9 @@ class BillText(DataScienceDevTableBase): version_name: Mapped[str | None] text_content: Mapped[str | None] summary: Mapped[str | None] + summarization_model: Mapped[str | None] + summarization_user_prompt_version: Mapped[str | None] + summarization_system_prompt_version: Mapped[str | None] date: Mapped[date | None] bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts") diff --git a/pipelines/orm/data_science_dev/congress/legislator.py b/pipelines/orm/data_science_dev/congress/legislator.py index 474b8f4..0176f47 100644 --- a/pipelines/orm/data_science_dev/congress/legislator.py +++ b/pipelines/orm/data_science_dev/congress/legislator.py @@ -104,7 +104,6 @@ class LegislatorScore(DataScienceDevTableBase): legislator: Mapped[Legislator] = 
relationship(back_populates="scores") - class LegislatorBillScore(DataScienceDevTableBase): """Per-bill source score used to maintain aggregate legislator scores.""" @@ -136,6 +135,7 @@ class LegislatorBillScore(DataScienceDevTableBase): score: Mapped[float] bill: Mapped[Bill] = relationship(back_populates="legislator_bill_scores") - bill_topic: Mapped[BillTopic] = relationship(back_populates="legislator_bill_scores") + bill_topic: Mapped[BillTopic] = relationship( + back_populates="legislator_bill_scores", + ) legislator: Mapped[Legislator] = relationship(back_populates="bill_scores") - diff --git a/pipelines/tools/summarization_prompts.py b/pipelines/tools/summarization_prompts.py deleted file mode 100644 index bfdd5a5..0000000 --- a/pipelines/tools/summarization_prompts.py +++ /dev/null @@ -1,34 +0,0 @@ -SUMMARIZATION_SYSTEM_PROMPT = """You are a legislative analyst extracting policy substance from Congressional bill text. - -Your job is to compress a bill into a dense, neutral structured summary that captures every distinct policy action — including secondary effects that might be buried in subsections. - -EXTRACTION RULES: -- IGNORE: whereas clauses, congressional findings that are purely political statements, recitals, preambles, citations of existing law by number alone, and procedural boilerplate. -- FOCUS ON: operative verbs — what the bill SHALL do, PROHIBIT, REQUIRE, AUTHORIZE, AMEND, APPROPRIATE, or ESTABLISH. -- SURFACE ALL THREADS: If the bill touches multiple policy areas, list each thread separately. Do not collapse them. -- BE CONCRETE: Name the affected population, the mechanism, and the direction (expands/restricts/maintains). -- STAY NEUTRAL: No political framing. Describe what the text does, not what its sponsors claim it does. 
- -OUTPUT FORMAT — plain structured text, not JSON: - -OPERATIVE ACTIONS: -[Numbered list of what the bill actually does, one action per line, max 20 words each] - -AFFECTED POPULATIONS: -[Who gains something, who loses something, or whose behavior is regulated] - -MECHANISMS: -[How it works: new funding, mandate, prohibition, amendment to existing statute, grant program, study commission, etc.] - -POLICY THREADS: -[List each distinct policy domain this bill touches, even minor ones. Use plain language, not domain codes.] - -SYMBOLIC/PROCEDURAL ONLY: -[Yes or No — is this bill primarily a resolution, designation, or awareness declaration with no operative effect?] - -LENGTH TARGET: 150-250 words total. Be ruthless about cutting. Density over completeness.""" - -SUMMARIZATION_USER_TEMPLATE = """Summarize the following Congressional bill according to your instructions. - -BILL TEXT: -{text_content}"""