From 51d6240690a4bf0e605e2698599bddebbda1639c Mon Sep 17 00:00:00 2001 From: Richie Cahill Date: Tue, 21 Apr 2026 22:48:13 -0400 Subject: [PATCH] removed LegislatorBillScore --- ...emoved_legislatorbillscore_b63ed11d6775.py | 146 +++++++ ...ext_summarization_metadata_7d15f9b7c8a2.py | 1 - .../orm/data_science_dev/congress/__init__.py | 2 - .../orm/data_science_dev/congress/bill.py | 12 +- .../data_science_dev/congress/legislator.py | 43 -- .../tools/calculate_legislator_scores.py | 394 ------------------ 6 files changed, 147 insertions(+), 451 deletions(-) create mode 100644 alembic/data_science_dev/versions/2026_04_21-removed_legislatorbillscore_b63ed11d6775.py delete mode 100644 pipelines/tools/calculate_legislator_scores.py diff --git a/alembic/data_science_dev/versions/2026_04_21-removed_legislatorbillscore_b63ed11d6775.py b/alembic/data_science_dev/versions/2026_04_21-removed_legislatorbillscore_b63ed11d6775.py new file mode 100644 index 0000000..35793fd --- /dev/null +++ b/alembic/data_science_dev/versions/2026_04_21-removed_legislatorbillscore_b63ed11d6775.py @@ -0,0 +1,146 @@ +"""removed LegislatorBillScore. + +Revision ID: b63ed11d6775 +Revises: 7d15f9b7c8a2 +Create Date: 2026-04-21 22:46:48.058542 + +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op +from pipelines.orm import DataScienceDevBase + +if TYPE_CHECKING: + from collections.abc import Sequence + +# revision identifiers, used by Alembic. +revision: str = "b63ed11d6775" +down_revision: str | None = "7d15f9b7c8a2" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +schema = DataScienceDevBase.schema_name + + +def upgrade() -> None: + """Upgrade.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index( + op.f("ix_legislator_bill_score_bill_id"), + table_name="legislator_bill_score", + schema=schema, + ) + op.drop_index( + op.f("ix_legislator_bill_score_bill_topic_id"), + table_name="legislator_bill_score", + schema=schema, + ) + op.drop_index( + op.f("ix_legislator_bill_score_legislator_id"), + table_name="legislator_bill_score", + schema=schema, + ) + op.drop_index( + op.f("ix_legislator_bill_score_year_topic"), + table_name="legislator_bill_score", + schema=schema, + ) + op.drop_table("legislator_bill_score", schema=schema) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "legislator_bill_score", + sa.Column("bill_id", sa.INTEGER(), autoincrement=False, nullable=False), + sa.Column("bill_topic_id", sa.INTEGER(), autoincrement=False, nullable=False), + sa.Column("legislator_id", sa.INTEGER(), autoincrement=False, nullable=False), + sa.Column("year", sa.INTEGER(), autoincrement=False, nullable=False), + sa.Column("topic", sa.VARCHAR(), autoincrement=False, nullable=False), + sa.Column( + "score", + sa.DOUBLE_PRECISION(precision=53), + autoincrement=False, + nullable=False, + ), + sa.Column("id", sa.INTEGER(), autoincrement=True, nullable=False), + sa.Column( + "created", + postgresql.TIMESTAMP(timezone=True), + server_default=sa.text("now()"), + autoincrement=False, + nullable=False, + ), + sa.Column( + "updated", + postgresql.TIMESTAMP(timezone=True), + server_default=sa.text("now()"), + autoincrement=False, + nullable=False, + ), + sa.ForeignKeyConstraint( + ["bill_id"], + [f"{schema}.bill.id"], + name=op.f("fk_legislator_bill_score_bill_id_bill"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["bill_topic_id"], + [f"{schema}.bill_topic.id"], + name=op.f("fk_legislator_bill_score_bill_topic_id_bill_topic"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["legislator_id"], + [f"{schema}.legislator.id"], + name=op.f("fk_legislator_bill_score_legislator_id_legislator"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator_bill_score")), + sa.UniqueConstraint( + "bill_topic_id", + "legislator_id", + "year", + name=op.f("uq_legislator_bill_score_bill_topic_id_legislator_id_year"), + postgresql_include=[], + postgresql_nulls_not_distinct=False, + ), + schema=schema, + ) + op.create_index( + op.f("ix_legislator_bill_score_year_topic"), + "legislator_bill_score", + ["year", "topic"], + unique=False, + schema=schema, + ) + op.create_index( + op.f("ix_legislator_bill_score_legislator_id"), + "legislator_bill_score", + ["legislator_id"], + unique=False, + schema=schema, + ) + op.create_index( + op.f("ix_legislator_bill_score_bill_topic_id"), + "legislator_bill_score", + ["bill_topic_id"], + unique=False, + schema=schema, + ) + op.create_index( + op.f("ix_legislator_bill_score_bill_id"), + "legislator_bill_score", + ["bill_id"], + unique=False, + schema=schema, + ) + # ### end Alembic commands ### diff --git a/alembic/data_science_dev/versions/2026_04_22-add_bill_text_summarization_metadata_7d15f9b7c8a2.py b/alembic/data_science_dev/versions/2026_04_22-add_bill_text_summarization_metadata_7d15f9b7c8a2.py index 30141d2..e7a7e4a 100644 --- a/alembic/data_science_dev/versions/2026_04_22-add_bill_text_summarization_metadata_7d15f9b7c8a2.py +++ b/alembic/data_science_dev/versions/2026_04_22-add_bill_text_summarization_metadata_7d15f9b7c8a2.py @@ -26,7 +26,6 @@ depends_on: str | Sequence[str] | None = None schema = DataScienceDevBase.schema_name -summarization_model def upgrade() -> None: """Upgrade.""" op.add_column( diff --git a/pipelines/orm/data_science_dev/congress/__init__.py b/pipelines/orm/data_science_dev/congress/__init__.py index c2f1630..a796637 100644 --- a/pipelines/orm/data_science_dev/congress/__init__.py +++ b/pipelines/orm/data_science_dev/congress/__init__.py @@ -7,7 +7,6 @@ from pipelines.orm.data_science_dev.congress.bill import ( BillTopicPosition, ) from pipelines.orm.data_science_dev.congress.legislator import ( - LegislatorBillScore, Legislator, LegislatorScore, LegislatorSocialMedia, @@ -20,7 +19,6 @@ __all__ = [ "BillTopic", "BillTopicPosition", "Legislator", - "LegislatorBillScore", "LegislatorScore", "LegislatorSocialMedia", "Vote", diff --git a/pipelines/orm/data_science_dev/congress/bill.py b/pipelines/orm/data_science_dev/congress/bill.py index 0ace4bf..4f7331c 100644 --- a/pipelines/orm/data_science_dev/congress/bill.py +++ b/pipelines/orm/data_science_dev/congress/bill.py @@ -12,7 +12,6 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship from pipelines.orm.data_science_dev.base import DataScienceDevTableBase if TYPE_CHECKING: - from pipelines.orm.data_science_dev.congress.legislator import LegislatorBillScore from pipelines.orm.data_science_dev.congress.vote import Vote @@ -64,11 +63,6 @@ class Bill(DataScienceDevTableBase): back_populates="bill", cascade="all, delete-orphan", ) - legislator_bill_scores: Mapped[list[LegislatorBillScore]] = relationship( - "LegislatorBillScore", - back_populates="bill", - cascade="all, delete-orphan", - ) class BillText(DataScienceDevTableBase): @@ -93,6 +87,7 @@ class BillText(DataScienceDevTableBase): bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts") +# suport multipu summary prer bill class BillTopic(DataScienceDevTableBase): """One bill stance on one topic used to score roll-call votes.""" @@ -120,8 +115,3 @@ class BillTopic(DataScienceDevTableBase): ) bill: Mapped[Bill] = relationship("Bill", back_populates="topics") - legislator_bill_scores: Mapped[list[LegislatorBillScore]] = relationship( - "LegislatorBillScore", - back_populates="bill_topic", - cascade="all, delete-orphan", - ) diff --git a/pipelines/orm/data_science_dev/congress/legislator.py b/pipelines/orm/data_science_dev/congress/legislator.py index 0176f47..69683da 100644 --- a/pipelines/orm/data_science_dev/congress/legislator.py +++ b/pipelines/orm/data_science_dev/congress/legislator.py @@ -11,7 +11,6 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship from pipelines.orm.data_science_dev.base import DataScienceDevTableBase if TYPE_CHECKING: - from pipelines.orm.data_science_dev.congress.bill import Bill, BillTopic from pipelines.orm.data_science_dev.congress.vote import VoteRecord @@ -56,11 +55,6 @@ class Legislator(DataScienceDevTableBase): back_populates="legislator", cascade="all, delete-orphan", ) - bill_scores: Mapped[list[LegislatorBillScore]] = relationship( - "LegislatorBillScore", - back_populates="legislator", - cascade="all, delete-orphan", - ) class LegislatorSocialMedia(DataScienceDevTableBase): @@ -102,40 +96,3 @@ class LegislatorScore(DataScienceDevTableBase): score: Mapped[float] legislator: Mapped[Legislator] = relationship(back_populates="scores") - - -class LegislatorBillScore(DataScienceDevTableBase): - """Per-bill source score used to maintain aggregate legislator scores.""" - - __tablename__ = "legislator_bill_score" - __table_args__ = ( - UniqueConstraint( - "bill_topic_id", - "legislator_id", - "year", - name="uq_legislator_bill_score_bill_topic_id_legislator_id_year", - ), - Index("ix_legislator_bill_score_year_topic", "year", "topic"), - ) - - bill_id: Mapped[int] = mapped_column( - ForeignKey("main.bill.id", ondelete="CASCADE"), - index=True, - ) - bill_topic_id: Mapped[int] = mapped_column( - ForeignKey("main.bill_topic.id", ondelete="CASCADE"), - index=True, - ) - legislator_id: Mapped[int] = mapped_column( - ForeignKey("main.legislator.id", ondelete="CASCADE"), - index=True, - ) - year: Mapped[int] - topic: Mapped[str] - score: Mapped[float] - - bill: Mapped[Bill] = relationship(back_populates="legislator_bill_scores") - bill_topic: Mapped[BillTopic] = relationship( - back_populates="legislator_bill_scores", - ) - legislator: Mapped[Legislator] = relationship(back_populates="bill_scores") diff --git a/pipelines/tools/calculate_legislator_scores.py b/pipelines/tools/calculate_legislator_scores.py deleted file mode 100644 index 82d7327..0000000 --- a/pipelines/tools/calculate_legislator_scores.py +++ /dev/null @@ -1,394 +0,0 @@ -"""Calculate legislator topic scores from bill topic metadata and roll-call votes.""" - -from __future__ import annotations - -import argparse -from collections import defaultdict -from dataclasses import dataclass -from datetime import UTC, datetime -from typing import Iterable - -from sqlalchemy import Integer, delete, extract, func, select, tuple_ -from sqlalchemy.orm import Session - -from pipelines.orm.common import get_postgres_engine -from pipelines.orm.data_science_dev.congress import ( - Bill, - BillTopic, - BillTopicPosition, - LegislatorBillScore, - LegislatorScore, - Vote, - VoteRecord, -) - -SUPPORT_VOTES = frozenset({"yea", "aye", "yes"}) -OPPOSE_VOTES = frozenset({"nay", "no"}) -NEUTRAL_SCORE = 50.0 -SUPPORT_SCORE = 100.0 -OPPOSE_SCORE = 1.0 -ScoreKey = tuple[int, int, str] - - -@dataclass(frozen=True) -class VoteScoreInput: - """Raw vote data needed for one bill/topic/legislator scoring event.""" - - bill_id: int - bill_topic_id: int - legislator_id: int - year: int - topic: str - support_position: str | BillTopicPosition - vote_position: str | None - - -@dataclass(frozen=True) -class ComputedBillScore: - """Per-bill source score for one legislator/year/topic.""" - - bill_id: int - bill_topic_id: int - legislator_id: int - year: int - topic: str - score: float - - -@dataclass(frozen=True) -class ScoreRunResult: - """Summary for a scoring job run.""" - - processed_bills: int - bill_score_rows: int - aggregate_score_rows: int - - -def score_vote( - vote_position: str | None, - support_position: str | BillTopicPosition | None, -) -> float | None: - """Return a 1-100 score where 50 is neutral.""" - stance = normalize_support_position(support_position) - if stance is None: - return None - if vote_position is None: - return NEUTRAL_SCORE - - vote = vote_position.strip().casefold() - if vote not in SUPPORT_VOTES | OPPOSE_VOTES: - return NEUTRAL_SCORE - - voted_yes = vote in SUPPORT_VOTES - yes_is_for_topic = stance is BillTopicPosition.FOR - return SUPPORT_SCORE if voted_yes == yes_is_for_topic else OPPOSE_SCORE - - -def normalize_support_position( - support_position: str | BillTopicPosition | None, -) -> BillTopicPosition | None: - """Normalize a DB enum/string stance value.""" - if support_position is None: - return None - if isinstance(support_position, BillTopicPosition): - return support_position - value = support_position.strip().casefold() - try: - return BillTopicPosition(value) - except ValueError: - return None - - -def calculate_bill_score_values( - vote_inputs: Iterable[VoteScoreInput], -) -> list[ComputedBillScore]: - """Aggregate raw vote inputs into per-bill source scores.""" - grouped: dict[tuple[int, int, int, int, str], list[float]] = defaultdict(list) - for vote_input in vote_inputs: - score = score_vote(vote_input.vote_position, vote_input.support_position) - if score is None: - continue - key = ( - vote_input.bill_id, - vote_input.bill_topic_id, - vote_input.legislator_id, - vote_input.year, - vote_input.topic, - ) - grouped[key].append(score) - - return [ - ComputedBillScore( - bill_id=bill_id, - bill_topic_id=bill_topic_id, - legislator_id=legislator_id, - year=year, - topic=topic, - score=sum(scores) / len(scores), - ) - for (bill_id, bill_topic_id, legislator_id, year, topic), scores in sorted( - grouped.items() - ) - ] - - -def calculate_and_store_legislator_scores( - session: Session, - *, - congress: int | None = None, - bill_ids: list[int] | None = None, - topics: list[str] | None = None, - force: bool = False, - limit: int | None = None, -) -> ScoreRunResult: - """Score selected bills and refresh aggregate legislator score rows.""" - selected_bill_ids = select_bill_ids_to_score( - session, - congress=congress, - bill_ids=bill_ids, - topics=topics, - force=force, - limit=limit, - ) - result = ScoreRunResult( - processed_bills=0, - bill_score_rows=0, - aggregate_score_rows=0, - ) - for bill_id in selected_bill_ids: - bill_score_rows, aggregate_score_rows = score_bill( - session, - bill_id=bill_id, - topics=topics, - mark_processed=topics is None, - ) - result = ScoreRunResult( - processed_bills=result.processed_bills + 1, - bill_score_rows=result.bill_score_rows + bill_score_rows, - aggregate_score_rows=result.aggregate_score_rows + aggregate_score_rows, - ) - session.commit() - return result - - -def select_bill_ids_to_score( - session: Session, - *, - congress: int | None = None, - bill_ids: list[int] | None = None, - topics: list[str] | None = None, - force: bool = False, - limit: int | None = None, -) -> list[int]: - """Select bills with topic metadata and votes that should be scored.""" - stmt = ( - select(Bill.id) - .join(BillTopic, BillTopic.bill_id == Bill.id) - .join(Vote, Vote.bill_id == Bill.id) - .distinct() - .order_by(Bill.id) - ) - if not force: - stmt = stmt.where(Bill.score_processed_at.is_(None)) - if congress is not None: - stmt = stmt.where(Bill.congress == congress) - if bill_ids: - stmt = stmt.where(Bill.id.in_(bill_ids)) - if topics: - stmt = stmt.where(BillTopic.topic.in_(topics)) - if limit is not None: - stmt = stmt.limit(limit) - return list(session.scalars(stmt)) - - -def score_bill( - session: Session, - *, - bill_id: int, - topics: list[str] | None = None, - mark_processed: bool = True, -) -> tuple[int, int]: - """Score all selected vote records for one bill and refresh aggregates.""" - prior_keys = _existing_score_keys_for_bill(session, bill_id=bill_id, topics=topics) - session.execute(_delete_bill_scores_statement(bill_id=bill_id, topics=topics)) - session.flush() - - scores = calculate_bill_score_values( - _load_bill_vote_score_inputs(session, bill_id=bill_id, topics=topics) - ) - session.add_all( - LegislatorBillScore( - bill_id=score.bill_id, - bill_topic_id=score.bill_topic_id, - legislator_id=score.legislator_id, - year=score.year, - topic=score.topic, - score=score.score, - ) - for score in scores - ) - if mark_processed: - bill = session.get(Bill, bill_id) - if bill is not None: - bill.score_processed_at = datetime.now(tz=UTC) - session.flush() - - affected_keys = prior_keys | { - (score.legislator_id, score.year, score.topic) for score in scores - } - aggregate_rows = refresh_aggregate_scores(session, affected_keys) - return len(scores), aggregate_rows - - -def refresh_aggregate_scores(session: Session, keys: set[ScoreKey]) -> int: - """Refresh aggregate legislator_score rows from per-bill source scores.""" - if not keys: - return 0 - - key_tuple = tuple_( - LegislatorScore.legislator_id, - LegislatorScore.year, - LegislatorScore.topic, - ) - session.execute(delete(LegislatorScore).where(key_tuple.in_(list(keys)))) - session.flush() - - source_key_tuple = tuple_( - LegislatorBillScore.legislator_id, - LegislatorBillScore.year, - LegislatorBillScore.topic, - ) - rows = session.execute( - select( - LegislatorBillScore.legislator_id, - LegislatorBillScore.year, - LegislatorBillScore.topic, - func.avg(LegislatorBillScore.score).label("score"), - ) - .where(source_key_tuple.in_(list(keys))) - .group_by( - LegislatorBillScore.legislator_id, - LegislatorBillScore.year, - LegislatorBillScore.topic, - ) - ).all() - session.add_all( - LegislatorScore( - legislator_id=row.legislator_id, - year=row.year, - topic=row.topic, - score=float(row.score), - ) - for row in rows - ) - session.flush() - return len(rows) - - -def _load_bill_vote_score_inputs( - session: Session, - *, - bill_id: int, - topics: list[str] | None, -) -> list[VoteScoreInput]: - year = extract("year", Vote.vote_date).cast(Integer).label("year") - stmt = ( - select( - Vote.bill_id, - BillTopic.id.label("bill_topic_id"), - VoteRecord.legislator_id, - year, - BillTopic.topic, - BillTopic.support_position, - VoteRecord.position, - ) - .join(Vote, Vote.id == VoteRecord.vote_id) - .join(BillTopic, BillTopic.bill_id == Vote.bill_id) - .where(Vote.bill_id == bill_id) - ) - if topics: - stmt = stmt.where(BillTopic.topic.in_(topics)) - - return [ - VoteScoreInput( - bill_id=row.bill_id, - bill_topic_id=row.bill_topic_id, - legislator_id=row.legislator_id, - year=int(row.year), - topic=row.topic, - support_position=row.support_position, - vote_position=row.position, - ) - for row in session.execute(stmt) - ] - - -def _existing_score_keys_for_bill( - session: Session, - *, - bill_id: int, - topics: list[str] | None, -) -> set[ScoreKey]: - stmt = select( - LegislatorBillScore.legislator_id, - LegislatorBillScore.year, - LegislatorBillScore.topic, - ).where(LegislatorBillScore.bill_id == bill_id) - if topics: - stmt = stmt.where(LegislatorBillScore.topic.in_(topics)) - return {(row.legislator_id, row.year, row.topic) for row in session.execute(stmt)} - - -def _delete_bill_scores_statement(*, bill_id: int, topics: list[str] | None): - stmt = delete(LegislatorBillScore).where(LegislatorBillScore.bill_id == bill_id) - if topics: - stmt = stmt.where(LegislatorBillScore.topic.in_(topics)) - return stmt - - -def main() -> None: - """CLI entrypoint.""" - parser = argparse.ArgumentParser( - description="Calculate legislator_score rows from bill_topic and vote_record data." - ) - parser.add_argument("--congress", type=int, help="Only score bills from one Congress.") - parser.add_argument( - "--bill-id", - action="append", - dest="bill_ids", - type=int, - help="Only score one bill id. Repeat for multiple bills.", - ) - parser.add_argument( - "--topic", - action="append", - dest="topics", - help="Only calculate one topic. Repeat for multiple topics.", - ) - parser.add_argument( - "--force", - action="store_true", - help="Reprocess bills even when bill.score_processed_at is already set.", - ) - parser.add_argument("--limit", type=int, help="Maximum number of bills to process.") - args = parser.parse_args() - - engine = get_postgres_engine(name="DATA_SCIENCE_DEV") - with Session(engine) as session: - result = calculate_and_store_legislator_scores( - session, - congress=args.congress, - bill_ids=args.bill_ids, - topics=args.topics, - force=args.force, - limit=args.limit, - ) - print( - "Processed " - f"{result.processed_bills} bills; stored {result.bill_score_rows} bill score rows; " - f"refreshed {result.aggregate_score_rows} aggregate score rows." - ) - - -if __name__ == "__main__": - main()