allowing multiple summaries per bill text

This commit is contained in:
2026-05-08 18:30:07 -04:00
parent de9e59b5f4
commit d3fe6dba56
9 changed files with 483 additions and 25 deletions
+26 -12
View File
@@ -19,6 +19,7 @@ from pipelines.orm.common import get_postgres_engine
from pipelines.orm.data_science_dev.congress import (
Bill,
BillText,
BillTextSummary,
BillTopic,
BillTopicPosition,
SubjectType,
@@ -72,11 +73,19 @@ class ExtractedBillTopic:
def _select_bill_text_for_topic_extraction(bill: Bill) -> BillText | None:
    """Pick the first bill_text row whose default summary has usable text.

    Walks ``bill.bill_texts`` as already loaded on the instance and asks each
    row for its ``default_summary()``. Rows without a default summary, or
    whose summary text is missing or whitespace-only, are passed over.

    Args:
        bill: Bill whose ``bill_texts`` relationship is already populated.

    Returns:
        The first bill text with a non-blank default summary, or ``None`` when
        no row qualifies.
    """
    for bill_text in bill.bill_texts:
        summary_row = bill_text.default_summary()
        if summary_row is None:
            continue
        # A summary row with no text is treated like a blank one instead of
        # raising AttributeError on ``None.strip()``.
        if (summary_row.summary or "").strip():
            return bill_text
    return None
def _bill_text_has_summary_clause() -> ColumnElement[bool]:
    """Build an EXISTS predicate matching bill texts that own a summary row.

    The subquery selects ``BillTextSummary.id`` where the summary's
    ``bill_text_id`` equals ``BillText.id``, so it is meant to be used inside
    a query that already has ``BillText`` in scope.

    Note:
        Only the presence of a summary row is tested; the summary text itself
        is not inspected here.

    Returns:
        A boolean SQL expression suitable for ``where()`` / ``and_()``.
    """
    matching_summary_ids = select(BillTextSummary.id).where(
        BillTextSummary.bill_text_id == BillText.id
    )
    return exists(matching_summary_ids)
def normalize_topic_label(value: str) -> str:
"""Normalize a topic label for storage, comparison, and de-duping."""
normalized = value.strip().strip("\"'")
@@ -323,11 +332,7 @@ def create_select_bills_for_topic_extraction(
limit: int | None = None,
) -> Select[tuple[Bill]]:
"""Select bill rows that have summarized bill_text rows for topic extraction."""
has_summary = (BillText.summary.is_not(None), BillText.summary != "")
summarized_text_filters: list[ColumnElement[bool]] = [
BillText.bill_id == Bill.id,
*has_summary,
]
summarized_text_filters: list[ColumnElement[bool]] = [_bill_text_has_summary_clause()]
if with_votes_only:
summarized_text_filters.append(
exists(
@@ -347,11 +352,17 @@ def create_select_bills_for_topic_extraction(
)
)
)
summarized_text_exists = exists(select(BillText.id).where(*summarized_text_filters))
summarized_text_exists = exists(
select(BillText.id).where(BillText.bill_id == Bill.id, *summarized_text_filters)
)
bill_text_loader = selectinload(Bill.bill_texts.and_(*summarized_text_filters))
stmt = (
select(Bill)
.where(summarized_text_exists)
.options(selectinload(Bill.bill_texts.and_(*summarized_text_filters[1:])))
.options(
bill_text_loader.selectinload(BillText.summaries),
bill_text_loader.selectinload(BillText.primary_summary),
)
.order_by(Bill.id)
)
if congress is not None:
@@ -363,7 +374,7 @@ def create_select_bills_for_topic_extraction(
select(BillText.id).where(
BillText.bill_id == Bill.id,
BillText.id.in_(bill_text_ids),
*summarized_text_filters[1:],
*summarized_text_filters,
)
)
stmt = stmt.where(selected_text_exists)
@@ -416,8 +427,7 @@ def collect_topic_extraction_diagnostics(
)
)
has_summary = (BillText.summary.is_not(None), BillText.summary != "")
summary_filters = [*bill_text_filters, *has_summary]
summary_filters = [*bill_text_filters, _bill_text_has_summary_clause()]
bills_with_summaries = session.scalar(
select(func.count(func.distinct(Bill.id)))
@@ -607,7 +617,11 @@ def main(
if bill_text is None:
logger.warning("Skipping bill id=%s: no usable summary", bill.id)
continue
summary = bill_text.summary.strip()
summary_row = bill_text.default_summary()
if summary_row is None:
logger.warning("Skipping bill id=%s: no default summary", bill.id)
continue
summary = summary_row.summary.strip()
try:
extracted_topics = extract_topics_for_bill_text(