Allow multiple summaries per bill text

This commit is contained in:
2026-05-08 18:30:07 -04:00
parent de9e59b5f4
commit d3fe6dba56
9 changed files with 483 additions and 25 deletions
+26 -12
View File
@@ -19,6 +19,7 @@ from pipelines.orm.common import get_postgres_engine
from pipelines.orm.data_science_dev.congress import (
Bill,
BillText,
BillTextSummary,
BillTopic,
BillTopicPosition,
SubjectType,
@@ -72,11 +73,19 @@ class ExtractedBillTopic:
def _select_bill_text_for_topic_extraction(bill: Bill) -> BillText | None:
    """Pick one summarized bill_text row from the already-loaded relationship.

    Walks ``bill.bill_texts`` in order and returns the first row whose
    ``default_summary()`` exists and contains non-whitespace text.

    Args:
        bill: Bill whose ``bill_texts`` collection is iterated.

    Returns:
        The first bill_text row with a usable default summary, or ``None``
        when no row qualifies.
    """
    for bill_text in bill.bill_texts:
        summary_row = bill_text.default_summary()
        # A whitespace-only summary is treated the same as a missing one.
        if summary_row is not None and summary_row.summary.strip():
            return bill_text
    return None
def _bill_text_has_summary_clause() -> ColumnElement[bool]:
    """Build an EXISTS clause matching bill texts that own a summary row.

    The inner select is correlated on ``BillTextSummary.bill_text_id ==
    BillText.id``, so the clause is meant to be used inside a query that
    already references ``BillText``.
    """
    summary_ids = select(BillTextSummary.id).where(
        BillTextSummary.bill_text_id == BillText.id
    )
    return exists(summary_ids)
def normalize_topic_label(value: str) -> str:
"""Normalize a topic label for storage, comparison, and de-duping."""
normalized = value.strip().strip("\"'")
@@ -323,11 +332,7 @@ def create_select_bills_for_topic_extraction(
limit: int | None = None,
) -> Select[tuple[Bill]]:
"""Select bill rows that have summarized bill_text rows for topic extraction."""
has_summary = (BillText.summary.is_not(None), BillText.summary != "")
summarized_text_filters: list[ColumnElement[bool]] = [
BillText.bill_id == Bill.id,
*has_summary,
]
summarized_text_filters: list[ColumnElement[bool]] = [_bill_text_has_summary_clause()]
if with_votes_only:
summarized_text_filters.append(
exists(
@@ -347,11 +352,17 @@ def create_select_bills_for_topic_extraction(
)
)
)
summarized_text_exists = exists(select(BillText.id).where(*summarized_text_filters))
summarized_text_exists = exists(
select(BillText.id).where(BillText.bill_id == Bill.id, *summarized_text_filters)
)
bill_text_loader = selectinload(Bill.bill_texts.and_(*summarized_text_filters))
stmt = (
select(Bill)
.where(summarized_text_exists)
.options(selectinload(Bill.bill_texts.and_(*summarized_text_filters[1:])))
.options(
bill_text_loader.selectinload(BillText.summaries),
bill_text_loader.selectinload(BillText.primary_summary),
)
.order_by(Bill.id)
)
if congress is not None:
@@ -363,7 +374,7 @@ def create_select_bills_for_topic_extraction(
select(BillText.id).where(
BillText.bill_id == Bill.id,
BillText.id.in_(bill_text_ids),
*summarized_text_filters[1:],
*summarized_text_filters,
)
)
stmt = stmt.where(selected_text_exists)
@@ -416,8 +427,7 @@ def collect_topic_extraction_diagnostics(
)
)
has_summary = (BillText.summary.is_not(None), BillText.summary != "")
summary_filters = [*bill_text_filters, *has_summary]
summary_filters = [*bill_text_filters, _bill_text_has_summary_clause()]
bills_with_summaries = session.scalar(
select(func.count(func.distinct(Bill.id)))
@@ -607,7 +617,11 @@ def main(
if bill_text is None:
logger.warning("Skipping bill id=%s: no usable summary", bill.id)
continue
summary = bill_text.summary.strip()
summary_row = bill_text.default_summary()
if summary_row is None:
logger.warning("Skipping bill id=%s: no default summary", bill.id)
continue
summary = summary_row.summary.strip()
try:
extracted_topics = extract_topics_for_bill_text(
+23 -9
View File
@@ -9,7 +9,7 @@ from typing import Annotated, Any
import httpx
import typer
from sqlalchemy import Select, exists, or_, select
from sqlalchemy import Select, exists, select
from sqlalchemy.orm import Session, selectinload
from tiktoken import get_encoding
@@ -20,6 +20,7 @@ from pipelines.orm.common import get_postgres_engine
from pipelines.orm.data_science_dev.congress import (
Bill,
BillText,
BillTextSummary,
SubjectType,
VoteClassification,
VoteRelationship,
@@ -112,7 +113,7 @@ def summarize_bill_text(
model: str,
bill_text: BillText,
summarization_prompts: dict[str, str],
) -> str:
) -> str | None:
"""Generate and return a summary for one bill_text row."""
messages, user_prompt_tokens = build_bill_summary_messages(
bill_text=bill_text,
@@ -136,15 +137,21 @@ def summarize_bill_text(
def store_bill_summary_result(
    *,
    session: Session,
    bill_text: BillText,
    summary: str,
    model: str,
) -> BillTextSummary:
    """Store a generated summary and the prompt/model metadata that produced it.

    A new ``BillTextSummary`` row is created for every call, so earlier
    summaries of the same bill text are left untouched.

    Args:
        session: Session the new row is added to. This function only calls
            ``session.add``; it does not flush or commit.
        bill_text: Bill text the summary belongs to.
        summary: Generated summary text.
        model: Name of the model that produced the summary.

    Returns:
        The newly created summary row.
    """
    summary_row = BillTextSummary(
        bill_text=bill_text,
        summary=summary,
        summarization_model=model,
        summarization_system_prompt_version="v1.2",
        summarization_user_prompt_version="v1",
    )
    session.add(summary_row)
    return summary_row
def create_select_bill_texts_for_summarization(
@@ -154,7 +161,7 @@ def create_select_bill_texts_for_summarization(
with_votes_only: bool = False,
force: bool = False,
limit: int | None = None,
) -> Select:
) -> Select[tuple[BillText]]:
"""Select bill_text rows that have source text and need summaries."""
stmt = (
select(BillText)
@@ -189,7 +196,13 @@ def create_select_bill_texts_for_summarization(
)
)
if not force:
stmt = stmt.where(or_(BillText.summary.is_(None), BillText.summary == ""))
stmt = stmt.where(
~exists(
select(BillTextSummary.id).where(
BillTextSummary.bill_text_id == BillText.id
)
)
)
if limit is not None:
stmt = stmt.limit(limit)
return stmt
@@ -287,6 +300,7 @@ def main(
logger.warning("Skipping bill_text id=%s", bill_text.id)
continue
store_bill_summary_result(
session=session,
bill_text=bill_text,
summary=summary,
model=model,
@@ -6,6 +6,7 @@ from pipelines.orm.data_science_dev.congress.bill import (
BillActionRecordedVote,
BillRelation,
BillText,
BillTextSummary,
BillTopic,
BillTopicPosition,
)
@@ -54,6 +55,7 @@ __all__ = [
"BillActionRecordedVote",
"BillRelation",
"BillText",
"BillTextSummary",
"BillTopic",
"BillTopicPosition",
"ClassificationMethod",
@@ -105,13 +105,12 @@ class BillText(DataScienceDevTableBase):
)
bill_id: Mapped[int] = mapped_column(ForeignKey("main.bill.id", ondelete="CASCADE"))
primary_summary_id: Mapped[int | None] = mapped_column(
ForeignKey("main.bill_text_summary.id", ondelete="SET NULL")
)
version_code: Mapped[str]
version_name: Mapped[str | None]
text_content: Mapped[str | None]
summary: Mapped[str | None]
summarization_model: Mapped[str | None]
summarization_user_prompt_version: Mapped[str | None]
summarization_system_prompt_version: Mapped[str | None]
date: Mapped[date | None]
source_datetime_raw: Mapped[str | None]
text_url_xml: Mapped[str | None]
@@ -122,6 +121,57 @@ class BillText(DataScienceDevTableBase):
)
bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts")
summaries: Mapped[list[BillTextSummary]] = relationship(
"BillTextSummary",
back_populates="bill_text",
cascade="all, delete-orphan",
foreign_keys="BillTextSummary.bill_text_id",
order_by=lambda: (
BillTextSummary.created.desc(),
BillTextSummary.id.desc(),
),
)
primary_summary: Mapped[BillTextSummary | None] = relationship(
"BillTextSummary",
foreign_keys=[primary_summary_id],
post_update=True,
)
def latest_summary(self) -> BillTextSummary | None:
    """Return the newest summary row for this bill text.

    This is the first element of ``self.summaries``, or ``None`` when the
    collection is empty.
    """
    if not self.summaries:
        return None
    return self.summaries[0]
def default_summary(self) -> BillTextSummary | None:
    """Return the primary summary when set, otherwise the newest summary.

    Returns:
        ``self.primary_summary`` when it is not ``None``; otherwise whatever
        ``self.latest_summary()`` returns, which may itself be ``None``.
    """
    primary = self.primary_summary
    # Compare against None explicitly so the choice never depends on the
    # truthiness of the summary object.
    if primary is not None:
        return primary
    return self.latest_summary()
class BillTextSummary(DataScienceDevTableBase):
    """Stores one generated summary for a bill text version.

    Each row points at a single ``BillText`` through ``bill_text_id`` and is
    the other side of the ``BillText.summaries`` relationship.
    """

    __tablename__ = "bill_text_summary"
    __table_args__ = (
        # Index for finding the summaries of one bill text.
        Index("ix_bill_text_summary_bill_text_id", "bill_text_id"),
        # NOTE(review): "created" is not declared in this class; presumably it
        # comes from DataScienceDevTableBase -- confirm.
        Index(
            "ix_bill_text_summary_bill_text_id_created",
            "bill_text_id",
            "created",
        ),
    )

    # Owning bill text; the foreign key is declared with ON DELETE CASCADE.
    bill_text_id: Mapped[int] = mapped_column(
        ForeignKey("main.bill_text.id", ondelete="CASCADE")
    )
    # Summary text; annotated as a plain (non-optional) string.
    summary: Mapped[str]
    # Optional metadata describing the model and prompt versions used.
    summarization_model: Mapped[str | None]
    summarization_user_prompt_version: Mapped[str | None]
    summarization_system_prompt_version: Mapped[str | None]

    # foreign_keys is given explicitly; BillText also has its own foreign key
    # to this table (primary_summary_id), so two FK paths join the tables.
    bill_text: Mapped[BillText] = relationship(
        "BillText",
        back_populates="summaries",
        foreign_keys=[bill_text_id],
    )
class BillAction(DataScienceDevTableBase):
+2
View File
@@ -11,6 +11,7 @@ from pipelines.orm.data_science_dev.congress import (
BillActionRecordedVote,
BillRelation,
BillText,
BillTextSummary,
BillTopic,
BillTopicPosition,
ClassificationMethod,
@@ -51,6 +52,7 @@ __all__ = [
"BillActionRecordedVote",
"BillRelation",
"BillText",
"BillTextSummary",
"BillTopic",
"BillTopicPosition",
"ClassificationMethod",