allowing multiple summaries per bill text
This commit is contained in:
@@ -19,6 +19,7 @@ from pipelines.orm.common import get_postgres_engine
|
||||
from pipelines.orm.data_science_dev.congress import (
|
||||
Bill,
|
||||
BillText,
|
||||
BillTextSummary,
|
||||
BillTopic,
|
||||
BillTopicPosition,
|
||||
SubjectType,
|
||||
@@ -72,11 +73,19 @@ class ExtractedBillTopic:
|
||||
def _select_bill_text_for_topic_extraction(bill: Bill) -> BillText | None:
    """Pick one summarized bill_text row from the already-loaded relationship.

    Iterates ``bill.bill_texts`` in order and returns the first bill text whose
    ``default_summary()`` yields a row with non-blank ``summary`` text.

    Args:
        bill: Bill whose ``bill_texts`` relationship is iterated.

    Returns:
        The first bill text with a usable default summary, or ``None`` when no
        bill text qualifies.
    """
    for bill_text in bill.bill_texts:
        # Summaries are read through default_summary() rather than a
        # ``summary`` attribute on the bill text itself.
        summary_row = bill_text.default_summary()
        if summary_row is None:
            continue
        # Treat a missing summary text like an empty one instead of raising.
        if (summary_row.summary or "").strip():
            return bill_text
    return None
|
||||
|
||||
|
||||
def _bill_text_has_summary_clause() -> ColumnElement[bool]:
    """Build an EXISTS clause matching bill texts that own at least one summary row.

    The inner SELECT compares ``BillTextSummary.bill_text_id`` with
    ``BillText.id``, so the clause is meant to be used inside a query that
    already involves ``BillText``.
    """
    summaries_for_text = select(BillTextSummary.id).where(
        BillTextSummary.bill_text_id == BillText.id
    )
    return exists(summaries_for_text)
|
||||
|
||||
|
||||
def normalize_topic_label(value: str) -> str:
|
||||
"""Normalize a topic label for storage, comparison, and de-duping."""
|
||||
normalized = value.strip().strip("\"'")
|
||||
@@ -323,11 +332,7 @@ def create_select_bills_for_topic_extraction(
|
||||
limit: int | None = None,
|
||||
) -> Select[tuple[Bill]]:
|
||||
"""Select bill rows that have summarized bill_text rows for topic extraction."""
|
||||
has_summary = (BillText.summary.is_not(None), BillText.summary != "")
|
||||
summarized_text_filters: list[ColumnElement[bool]] = [
|
||||
BillText.bill_id == Bill.id,
|
||||
*has_summary,
|
||||
]
|
||||
summarized_text_filters: list[ColumnElement[bool]] = [_bill_text_has_summary_clause()]
|
||||
if with_votes_only:
|
||||
summarized_text_filters.append(
|
||||
exists(
|
||||
@@ -347,11 +352,17 @@ def create_select_bills_for_topic_extraction(
|
||||
)
|
||||
)
|
||||
)
|
||||
summarized_text_exists = exists(select(BillText.id).where(*summarized_text_filters))
|
||||
summarized_text_exists = exists(
|
||||
select(BillText.id).where(BillText.bill_id == Bill.id, *summarized_text_filters)
|
||||
)
|
||||
bill_text_loader = selectinload(Bill.bill_texts.and_(*summarized_text_filters))
|
||||
stmt = (
|
||||
select(Bill)
|
||||
.where(summarized_text_exists)
|
||||
.options(selectinload(Bill.bill_texts.and_(*summarized_text_filters[1:])))
|
||||
.options(
|
||||
bill_text_loader.selectinload(BillText.summaries),
|
||||
bill_text_loader.selectinload(BillText.primary_summary),
|
||||
)
|
||||
.order_by(Bill.id)
|
||||
)
|
||||
if congress is not None:
|
||||
@@ -363,7 +374,7 @@ def create_select_bills_for_topic_extraction(
|
||||
select(BillText.id).where(
|
||||
BillText.bill_id == Bill.id,
|
||||
BillText.id.in_(bill_text_ids),
|
||||
*summarized_text_filters[1:],
|
||||
*summarized_text_filters,
|
||||
)
|
||||
)
|
||||
stmt = stmt.where(selected_text_exists)
|
||||
@@ -416,8 +427,7 @@ def collect_topic_extraction_diagnostics(
|
||||
)
|
||||
)
|
||||
|
||||
has_summary = (BillText.summary.is_not(None), BillText.summary != "")
|
||||
summary_filters = [*bill_text_filters, *has_summary]
|
||||
summary_filters = [*bill_text_filters, _bill_text_has_summary_clause()]
|
||||
|
||||
bills_with_summaries = session.scalar(
|
||||
select(func.count(func.distinct(Bill.id)))
|
||||
@@ -607,7 +617,11 @@ def main(
|
||||
if bill_text is None:
|
||||
logger.warning("Skipping bill id=%s: no usable summary", bill.id)
|
||||
continue
|
||||
summary = bill_text.summary.strip()
|
||||
summary_row = bill_text.default_summary()
|
||||
if summary_row is None:
|
||||
logger.warning("Skipping bill id=%s: no default summary", bill.id)
|
||||
continue
|
||||
summary = summary_row.summary.strip()
|
||||
|
||||
try:
|
||||
extracted_topics = extract_topics_for_bill_text(
|
||||
|
||||
Reference in New Issue
Block a user