created scoring tables and basic logic

This commit is contained in:
2026-04-21 11:44:53 -04:00
parent be4b473a3c
commit 674edafe94
9 changed files with 843 additions and 38 deletions

View File

@@ -0,0 +1,245 @@
"""Add BillTopic, LegislatorScore, and LegislatorBillScore tables plus bill scoring columns.
Revision ID: ef4bc5411176
Revises: 5cd7eee3549d
Create Date: 2026-04-21 11:35:18.977213
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from pipelines.orm import DataScienceDevBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "ef4bc5411176"
down_revision: str | None = "5cd7eee3549d"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = DataScienceDevBase.schema_name
def upgrade() -> None:
    """Create the topic/score tables and add the scoring-related columns.

    Creates ``bill_topic``, ``legislator_score`` and ``legislator_bill_score``
    together with their indexes inside ``schema``, then adds the nullable
    columns ``bill.score_processed_at`` and ``bill_text.summary``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # bill_topic is created first because legislator_bill_score has a
    # foreign key to bill_topic.id further down.
    op.create_table(
        "bill_topic",
        sa.Column("bill_id", sa.Integer(), nullable=False),
        sa.Column("topic", sa.String(), nullable=False),
        sa.Column(
            "support_position",
            # native_enum=False: no database-level ENUM type is created for
            # this column.
            sa.Enum("for", "against", name="bill_topic_position", native_enum=False),
            nullable=False,
        ),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column(
            "created",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.Column(
            "updated",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.ForeignKeyConstraint(
            ["bill_id"],
            [f"{schema}.bill.id"],
            name=op.f("fk_bill_topic_bill_id_bill"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_bill_topic")),
        sa.UniqueConstraint(
            "bill_id",
            "topic",
            "support_position",
            name="uq_bill_topic_bill_id_topic_support_position",
        ),
        schema=schema,
    )
    op.create_index(
        "ix_bill_topic_topic", "bill_topic", ["topic"], unique=False, schema=schema
    )
    # legislator_score: at most one row per (legislator_id, year, topic).
    op.create_table(
        "legislator_score",
        sa.Column("legislator_id", sa.Integer(), nullable=False),
        sa.Column("year", sa.Integer(), nullable=False),
        sa.Column("topic", sa.String(), nullable=False),
        sa.Column("score", sa.Float(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column(
            "created",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.Column(
            "updated",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.ForeignKeyConstraint(
            ["legislator_id"],
            [f"{schema}.legislator.id"],
            name=op.f("fk_legislator_score_legislator_id_legislator"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator_score")),
        sa.UniqueConstraint(
            "legislator_id",
            "year",
            "topic",
            name="uq_legislator_score_legislator_id_year_topic",
        ),
        schema=schema,
    )
    op.create_index(
        op.f("ix_legislator_score_legislator_id"),
        "legislator_score",
        ["legislator_id"],
        unique=False,
        schema=schema,
    )
    op.create_index(
        "ix_legislator_score_year_topic",
        "legislator_score",
        ["year", "topic"],
        unique=False,
        schema=schema,
    )
    # legislator_bill_score: at most one row per
    # (bill_topic_id, legislator_id, year); references bill, bill_topic and
    # legislator, all with ON DELETE CASCADE.
    op.create_table(
        "legislator_bill_score",
        sa.Column("bill_id", sa.Integer(), nullable=False),
        sa.Column("bill_topic_id", sa.Integer(), nullable=False),
        sa.Column("legislator_id", sa.Integer(), nullable=False),
        sa.Column("year", sa.Integer(), nullable=False),
        sa.Column("topic", sa.String(), nullable=False),
        sa.Column("score", sa.Float(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column(
            "created",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.Column(
            "updated",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        ),
        sa.ForeignKeyConstraint(
            ["bill_id"],
            [f"{schema}.bill.id"],
            name=op.f("fk_legislator_bill_score_bill_id_bill"),
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["bill_topic_id"],
            [f"{schema}.bill_topic.id"],
            name=op.f("fk_legislator_bill_score_bill_topic_id_bill_topic"),
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["legislator_id"],
            [f"{schema}.legislator.id"],
            name=op.f("fk_legislator_bill_score_legislator_id_legislator"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator_bill_score")),
        sa.UniqueConstraint(
            "bill_topic_id",
            "legislator_id",
            "year",
            name="uq_legislator_bill_score_bill_topic_id_legislator_id_year",
        ),
        schema=schema,
    )
    op.create_index(
        op.f("ix_legislator_bill_score_bill_id"),
        "legislator_bill_score",
        ["bill_id"],
        unique=False,
        schema=schema,
    )
    op.create_index(
        op.f("ix_legislator_bill_score_bill_topic_id"),
        "legislator_bill_score",
        ["bill_topic_id"],
        unique=False,
        schema=schema,
    )
    op.create_index(
        op.f("ix_legislator_bill_score_legislator_id"),
        "legislator_bill_score",
        ["legislator_id"],
        unique=False,
        schema=schema,
    )
    op.create_index(
        "ix_legislator_bill_score_year_topic",
        "legislator_bill_score",
        ["year", "topic"],
        unique=False,
        schema=schema,
    )
    # New nullable columns on existing tables; no default is supplied, so
    # existing rows get NULL.
    op.add_column(
        "bill",
        sa.Column("score_processed_at", sa.DateTime(timezone=True), nullable=True),
        schema=schema,
    )
    op.add_column(
        "bill_text", sa.Column("summary", sa.String(), nullable=True), schema=schema
    )
    # ### end Alembic commands ###
def downgrade() -> None:
    """Remove everything ``upgrade`` created, in reverse order.

    Drops ``bill_text.summary`` and ``bill.score_processed_at``, then drops
    ``legislator_bill_score``, ``legislator_score`` and ``bill_topic`` along
    with their indexes. Data stored in those columns and tables is lost.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column("bill_text", "summary", schema=schema)
    op.drop_column("bill", "score_processed_at", schema=schema)
    # legislator_bill_score goes first: it holds a foreign key to bill_topic,
    # which is dropped last.
    op.drop_index(
        "ix_legislator_bill_score_year_topic",
        table_name="legislator_bill_score",
        schema=schema,
    )
    op.drop_index(
        op.f("ix_legislator_bill_score_legislator_id"),
        table_name="legislator_bill_score",
        schema=schema,
    )
    op.drop_index(
        op.f("ix_legislator_bill_score_bill_topic_id"),
        table_name="legislator_bill_score",
        schema=schema,
    )
    op.drop_index(
        op.f("ix_legislator_bill_score_bill_id"),
        table_name="legislator_bill_score",
        schema=schema,
    )
    op.drop_table("legislator_bill_score", schema=schema)
    op.drop_index(
        "ix_legislator_score_year_topic", table_name="legislator_score", schema=schema
    )
    op.drop_index(
        op.f("ix_legislator_score_legislator_id"),
        table_name="legislator_score",
        schema=schema,
    )
    op.drop_table("legislator_score", schema=schema)
    op.drop_index("ix_bill_topic_topic", table_name="bill_topic", schema=schema)
    op.drop_table("bill_topic", schema=schema)
    # ### end Alembic commands ###

View File

@@ -40,8 +40,12 @@ def dynamic_schema(filename: str, _options: dict[Any, Any]) -> None:
"""Dynamic schema.""" """Dynamic schema."""
original_file = Path(filename).read_text() original_file = Path(filename).read_text()
schema_name = base_class.schema_name schema_name = base_class.schema_name
dynamic_schema_file_part1 = original_file.replace(f"schema='{schema_name}'", "schema=schema") dynamic_schema_file_part1 = original_file.replace(
dynamic_schema_file = dynamic_schema_file_part1.replace(f"'{schema_name}.", "f'{schema}.") f"schema='{schema_name}'", "schema=schema"
)
dynamic_schema_file = dynamic_schema_file_part1.replace(
f"'{schema_name}.", "f'{schema}."
)
Path(filename).write_text(dynamic_schema_file) Path(filename).write_text(dynamic_schema_file)
@@ -49,7 +53,10 @@ def dynamic_schema(filename: str, _options: dict[Any, Any]) -> None:
def import_postgresql(filename: str, _options: dict[Any, Any]) -> None: def import_postgresql(filename: str, _options: dict[Any, Any]) -> None:
"""Add postgresql dialect import when postgresql types are used.""" """Add postgresql dialect import when postgresql types are used."""
content = Path(filename).read_text() content = Path(filename).read_text()
if "postgresql." in content and "from sqlalchemy.dialects import postgresql" not in content: if (
"postgresql." in content
and "from sqlalchemy.dialects import postgresql" not in content
):
content = content.replace( content = content.replace(
"import sqlalchemy as sa\n", "import sqlalchemy as sa\n",
"import sqlalchemy as sa\nfrom sqlalchemy.dialects import postgresql\n", "import sqlalchemy as sa\nfrom sqlalchemy.dialects import postgresql\n",
@@ -66,8 +73,17 @@ def ruff_check_and_format(filename: str, _options: dict[Any, Any]) -> None:
def include_name( def include_name(
name: str | None, name: str | None,
type_: Literal["schema", "table", "column", "index", "unique_constraint", "foreign_key_constraint"], type_: Literal[
_parent_names: MutableMapping[Literal["schema_name", "table_name", "schema_qualified_table_name"], str | None], "schema",
"table",
"column",
"index",
"unique_constraint",
"foreign_key_constraint",
],
_parent_names: MutableMapping[
Literal["schema_name", "table_name", "schema_qualified_table_name"], str | None
],
) -> bool: ) -> bool:
"""Filter tables to be included in the migration. """Filter tables to be included in the migration.

View File

@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING
import sqlalchemy as sa import sqlalchemy as sa
from alembic import op from alembic import op
from python.orm import ${config.attributes["base"].__name__} from pipelines.orm import ${config.attributes["base"].__name__}
if TYPE_CHECKING: if TYPE_CHECKING:
from collections.abc import Sequence from collections.abc import Sequence

View File

@@ -1,8 +1,15 @@
"""init.""" """Congress ORM models."""
from pipelines.orm.data_science_dev.congress.bill import Bill, BillText from pipelines.orm.data_science_dev.congress.bill import (
Bill,
BillText,
BillTopic,
BillTopicPosition,
)
from pipelines.orm.data_science_dev.congress.legislator import ( from pipelines.orm.data_science_dev.congress.legislator import (
LegislatorBillScore,
Legislator, Legislator,
LegislatorScore,
LegislatorSocialMedia, LegislatorSocialMedia,
) )
from pipelines.orm.data_science_dev.congress.vote import Vote, VoteRecord from pipelines.orm.data_science_dev.congress.vote import Vote, VoteRecord
@@ -10,7 +17,11 @@ from pipelines.orm.data_science_dev.congress.vote import Vote, VoteRecord
__all__ = [ __all__ = [
"Bill", "Bill",
"BillText", "BillText",
"BillTopic",
"BillTopicPosition",
"Legislator", "Legislator",
"LegislatorBillScore",
"LegislatorScore",
"LegislatorSocialMedia", "LegislatorSocialMedia",
"Vote", "Vote",
"VoteRecord", "VoteRecord",

View File

@@ -2,22 +2,37 @@
from __future__ import annotations from __future__ import annotations
from datetime import date from datetime import date, datetime
from enum import StrEnum
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from sqlalchemy import ForeignKey, Index, UniqueConstraint from sqlalchemy import DateTime, Enum, ForeignKey, Index, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from pipelines.orm.data_science_dev.base import DataScienceDevTableBase from pipelines.orm.data_science_dev.base import DataScienceDevTableBase
if TYPE_CHECKING: if TYPE_CHECKING:
from pipelines.orm.data_science_dev.congress.legislator import LegislatorBillScore
from pipelines.orm.data_science_dev.congress.vote import Vote from pipelines.orm.data_science_dev.congress.vote import Vote
class BillTopicPosition(StrEnum):
    """Whether a yes vote on a bill is for or against a topic."""

    # A yes vote on the bill counts in favor of the topic.
    FOR = "for"
    # A yes vote on the bill counts against the topic.
    AGAINST = "against"
class Bill(DataScienceDevTableBase): class Bill(DataScienceDevTableBase):
"""Legislation with congress number, type, titles, status, and sponsor.""" """Legislation with congress number, type, titles, status, and sponsor."""
__tablename__ = "bill" __tablename__ = "bill"
__table_args__ = (
UniqueConstraint(
"congress", "bill_type", "number", name="uq_bill_congress_type_number"
),
Index("ix_bill_congress", "congress"),
)
congress: Mapped[int] congress: Mapped[int]
bill_type: Mapped[str] bill_type: Mapped[str]
@@ -33,6 +48,7 @@ class Bill(DataScienceDevTableBase):
sponsor_bioguide_id: Mapped[str | None] sponsor_bioguide_id: Mapped[str | None]
subjects_top_term: Mapped[str | None] subjects_top_term: Mapped[str | None]
score_processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
votes: Mapped[list[Vote]] = relationship( votes: Mapped[list[Vote]] = relationship(
"Vote", "Vote",
@@ -43,12 +59,15 @@ class Bill(DataScienceDevTableBase):
back_populates="bill", back_populates="bill",
cascade="all, delete-orphan", cascade="all, delete-orphan",
) )
topics: Mapped[list[BillTopic]] = relationship(
__table_args__ = ( "BillTopic",
UniqueConstraint( back_populates="bill",
"congress", "bill_type", "number", name="uq_bill_congress_type_number" cascade="all, delete-orphan",
), )
Index("ix_bill_congress", "congress"), legislator_bill_scores: Mapped[list[LegislatorBillScore]] = relationship(
"LegislatorBillScore",
back_populates="bill",
cascade="all, delete-orphan",
) )
@@ -56,17 +75,50 @@ class BillText(DataScienceDevTableBase):
"""Stores different text versions of a bill (introduced, enrolled, etc.).""" """Stores different text versions of a bill (introduced, enrolled, etc.)."""
__tablename__ = "bill_text" __tablename__ = "bill_text"
bill_id: Mapped[int] = mapped_column(ForeignKey("main.bill.id", ondelete="CASCADE"))
version_code: Mapped[str]
version_name: Mapped[str | None]
text_content: Mapped[str | None]
date: Mapped[date | None]
bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts")
__table_args__ = ( __table_args__ = (
UniqueConstraint( UniqueConstraint(
"bill_id", "version_code", name="uq_bill_text_bill_id_version_code" "bill_id", "version_code", name="uq_bill_text_bill_id_version_code"
), ),
) )
bill_id: Mapped[int] = mapped_column(ForeignKey("main.bill.id", ondelete="CASCADE"))
version_code: Mapped[str]
version_name: Mapped[str | None]
text_content: Mapped[str | None]
summary: Mapped[str | None]
date: Mapped[date | None]
bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts")
class BillTopic(DataScienceDevTableBase):
    """One bill stance on one topic used to score roll-call votes.

    A (bill_id, topic, support_position) combination is unique, and rows are
    indexed by ``topic``.
    """

    __tablename__ = "bill_topic"
    __table_args__ = (
        UniqueConstraint(
            "bill_id",
            "topic",
            "support_position",
            name="uq_bill_topic_bill_id_topic_support_position",
        ),
        Index("ix_bill_topic_topic", "topic"),
    )

    # Owning bill; the foreign key is declared with ON DELETE CASCADE.
    bill_id: Mapped[int] = mapped_column(ForeignKey("main.bill.id", ondelete="CASCADE"))
    topic: Mapped[str]
    # Persisted using the enum member values ("for"/"against") rather than
    # the member names, and without a native database ENUM type.
    support_position: Mapped[BillTopicPosition] = mapped_column(
        Enum(
            BillTopicPosition,
            values_callable=lambda enum_cls: [member.value for member in enum_cls],
            native_enum=False,
            name="bill_topic_position",
        )
    )

    bill: Mapped[Bill] = relationship("Bill", back_populates="topics")
    # Per-legislator scores derived from this stance; cascade removes them
    # (including orphans) along with the stance.
    legislator_bill_scores: Mapped[list[LegislatorBillScore]] = relationship(
        "LegislatorBillScore",
        back_populates="bill_topic",
        cascade="all, delete-orphan",
    )

View File

@@ -5,12 +5,13 @@ from __future__ import annotations
from datetime import date from datetime import date
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from sqlalchemy import ForeignKey, Text from sqlalchemy import ForeignKey, Index, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from pipelines.orm.data_science_dev.base import DataScienceDevTableBase from pipelines.orm.data_science_dev.base import DataScienceDevTableBase
if TYPE_CHECKING: if TYPE_CHECKING:
from pipelines.orm.data_science_dev.congress.bill import Bill, BillTopic
from pipelines.orm.data_science_dev.congress.vote import VoteRecord from pipelines.orm.data_science_dev.congress.vote import VoteRecord
@@ -50,6 +51,16 @@ class Legislator(DataScienceDevTableBase):
back_populates="legislator", back_populates="legislator",
cascade="all, delete-orphan", cascade="all, delete-orphan",
) )
scores: Mapped[list[LegislatorScore]] = relationship(
"LegislatorScore",
back_populates="legislator",
cascade="all, delete-orphan",
)
bill_scores: Mapped[list[LegislatorBillScore]] = relationship(
"LegislatorBillScore",
back_populates="legislator",
cascade="all, delete-orphan",
)
class LegislatorSocialMedia(DataScienceDevTableBase): class LegislatorSocialMedia(DataScienceDevTableBase):
@@ -66,3 +77,65 @@ class LegislatorSocialMedia(DataScienceDevTableBase):
legislator: Mapped[Legislator] = relationship( legislator: Mapped[Legislator] = relationship(
back_populates="social_media_accounts" back_populates="social_media_accounts"
) )
class LegislatorScore(DataScienceDevTableBase):
    """Computed topic score for a legislator in one calendar year.

    A (legislator_id, year, topic) combination is unique; an additional
    index covers (year, topic).
    """

    __tablename__ = "legislator_score"
    __table_args__ = (
        UniqueConstraint(
            "legislator_id",
            "year",
            "topic",
            name="uq_legislator_score_legislator_id_year_topic",
        ),
        Index("ix_legislator_score_year_topic", "year", "topic"),
    )

    # Indexed foreign key to the legislator, declared with ON DELETE CASCADE.
    legislator_id: Mapped[int] = mapped_column(
        ForeignKey("main.legislator.id", ondelete="CASCADE"),
        index=True,
    )
    year: Mapped[int]
    topic: Mapped[str]
    score: Mapped[float]

    legislator: Mapped[Legislator] = relationship(back_populates="scores")
class LegislatorBillScore(DataScienceDevTableBase):
    """Per-bill source score used to maintain aggregate legislator scores.

    A (bill_topic_id, legislator_id, year) combination is unique; an
    additional index covers (year, topic).
    """

    __tablename__ = "legislator_bill_score"
    __table_args__ = (
        UniqueConstraint(
            "bill_topic_id",
            "legislator_id",
            "year",
            name="uq_legislator_bill_score_bill_topic_id_legislator_id_year",
        ),
        Index("ix_legislator_bill_score_year_topic", "year", "topic"),
    )

    # All three foreign keys are indexed and declared with ON DELETE CASCADE.
    bill_id: Mapped[int] = mapped_column(
        ForeignKey("main.bill.id", ondelete="CASCADE"),
        index=True,
    )
    bill_topic_id: Mapped[int] = mapped_column(
        ForeignKey("main.bill_topic.id", ondelete="CASCADE"),
        index=True,
    )
    legislator_id: Mapped[int] = mapped_column(
        ForeignKey("main.legislator.id", ondelete="CASCADE"),
        index=True,
    )
    year: Mapped[int]
    # NOTE(review): topic is stored here in addition to bill_topic_id;
    # presumably it mirrors the referenced bill_topic row — confirm.
    topic: Mapped[str]
    score: Mapped[float]

    bill: Mapped[Bill] = relationship(back_populates="legislator_bill_scores")
    bill_topic: Mapped[BillTopic] = relationship(back_populates="legislator_bill_scores")
    legislator: Mapped[Legislator] = relationship(back_populates="bill_scores")

View File

@@ -44,6 +44,17 @@ class Vote(DataScienceDevTableBase):
"""Roll call votes with counts and optional bill linkage.""" """Roll call votes with counts and optional bill linkage."""
__tablename__ = "vote" __tablename__ = "vote"
__table_args__ = (
UniqueConstraint(
"congress",
"chamber",
"session",
"number",
name="uq_vote_congress_chamber_session_number",
),
Index("ix_vote_date", "vote_date"),
Index("ix_vote_congress_chamber", "congress", "chamber"),
)
congress: Mapped[int] congress: Mapped[int]
chamber: Mapped[str] chamber: Mapped[str]
@@ -71,14 +82,3 @@ class Vote(DataScienceDevTableBase):
cascade="all, delete-orphan", cascade="all, delete-orphan",
) )
__table_args__ = (
UniqueConstraint(
"congress",
"chamber",
"session",
"number",
name="uq_vote_congress_chamber_session_number",
),
Index("ix_vote_date", "vote_date"),
Index("ix_vote_congress_chamber", "congress", "chamber"),
)

View File

@@ -2,14 +2,28 @@
from __future__ import annotations from __future__ import annotations
from pipelines.orm.data_science_dev.congress import Bill, BillText, Legislator, Vote, VoteRecord from pipelines.orm.data_science_dev.congress import (
Bill,
BillText,
BillTopic,
BillTopicPosition,
Legislator,
LegislatorBillScore,
LegislatorScore,
Vote,
VoteRecord,
)
from pipelines.orm.data_science_dev.posts import partitions # noqa: F401 — registers partition classes in metadata from pipelines.orm.data_science_dev.posts import partitions # noqa: F401 — registers partition classes in metadata
from pipelines.orm.data_science_dev.posts.tables import Posts from pipelines.orm.data_science_dev.posts.tables import Posts
__all__ = [ __all__ = [
"Bill", "Bill",
"BillText", "BillText",
"BillTopic",
"BillTopicPosition",
"Legislator", "Legislator",
"LegislatorBillScore",
"LegislatorScore",
"Posts", "Posts",
"Vote", "Vote",
"VoteRecord", "VoteRecord",

View File

@@ -0,0 +1,394 @@
"""Calculate legislator topic scores from bill topic metadata and roll-call votes."""
from __future__ import annotations
import argparse
from collections import defaultdict
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import Iterable
from sqlalchemy import Integer, delete, extract, func, select, tuple_
from sqlalchemy.orm import Session
from pipelines.orm.common import get_postgres_engine
from pipelines.orm.data_science_dev.congress import (
Bill,
BillTopic,
BillTopicPosition,
LegislatorBillScore,
LegislatorScore,
Vote,
VoteRecord,
)
# Vote positions (compared after strip + casefold in score_vote) that count
# as a yes vote.
SUPPORT_VOTES = frozenset({"yea", "aye", "yes"})
# Vote positions that count as a no vote.
OPPOSE_VOTES = frozenset({"nay", "no"})
# Returned by score_vote when the position is missing or is neither a yes
# nor a no vote.
NEUTRAL_SCORE = 50.0
# Returned when the vote lines up with the bill's stance on the topic.
SUPPORT_SCORE = 100.0
# Returned when the vote runs against the stance; the scale bottoms out at 1.
OPPOSE_SCORE = 1.0
# (legislator_id, year, topic) — identifies one aggregate legislator_score row.
ScoreKey = tuple[int, int, str]
@dataclass(frozen=True)
class VoteScoreInput:
    """Raw vote data needed for one bill/topic/legislator scoring event.

    Immutable record pairing one recorded vote position with one topic
    stance of the bill that was voted on.
    """

    bill_id: int
    bill_topic_id: int
    legislator_id: int
    year: int
    topic: str
    # Stance of the bill on ``topic``; either the enum or its raw string value.
    support_position: str | BillTopicPosition
    # Recorded vote position as stored; may be None.
    vote_position: str | None
@dataclass(frozen=True)
class ComputedBillScore:
    """Per-bill source score for one legislator/year/topic.

    Immutable value produced by ``calculate_bill_score_values``.
    """

    bill_id: int
    bill_topic_id: int
    legislator_id: int
    year: int
    topic: str
    # Mean of the individual vote scores that share the fields above.
    score: float
@dataclass(frozen=True)
class ScoreRunResult:
    """Summary for a scoring job run."""

    # Number of bills passed through score_bill.
    processed_bills: int
    # Total per-bill score rows stored across those bills.
    bill_score_rows: int
    # Total aggregate rows refreshed, summed per bill (a key touched by
    # several bills is counted once per bill).
    aggregate_score_rows: int
def score_vote(
    vote_position: str | None,
    support_position: str | BillTopicPosition | None,
) -> float | None:
    """Score a single vote against a bill's stance on a topic.

    Args:
        vote_position: Recorded vote position; compared after stripping
            whitespace and case-folding.
        support_position: Whether a yes vote is for or against the topic.

    Returns:
        ``None`` when the stance cannot be normalized. ``NEUTRAL_SCORE`` when
        the vote is missing or is neither a yes nor a no vote.
        ``SUPPORT_SCORE`` when the vote favors the topic and ``OPPOSE_SCORE``
        when it goes against it.
    """
    topic_stance = normalize_support_position(support_position)
    if topic_stance is None:
        return None

    normalized_vote = (
        None if vote_position is None else vote_position.strip().casefold()
    )
    if normalized_vote in SUPPORT_VOTES:
        cast_yes = True
    elif normalized_vote in OPPOSE_VOTES:
        cast_yes = False
    else:
        # Missing, "present", "not voting", or any other unrecognized value.
        return NEUTRAL_SCORE

    # A yes on a "for" bill and a no on an "against" bill both favor the topic.
    yes_helps_topic = topic_stance is BillTopicPosition.FOR
    if cast_yes == yes_helps_topic:
        return SUPPORT_SCORE
    return OPPOSE_SCORE
def normalize_support_position(
    support_position: str | BillTopicPosition | None,
) -> BillTopicPosition | None:
    """Coerce a stance given as enum, string, or ``None`` to the enum.

    Strings are stripped and case-folded before being matched against the
    enum values. ``None`` and unrecognized strings yield ``None``.
    """
    if support_position is None or isinstance(support_position, BillTopicPosition):
        return support_position

    cleaned = support_position.strip().casefold()
    for member in BillTopicPosition:
        if member.value == cleaned:
            return member
    return None
def calculate_bill_score_values(
    vote_inputs: Iterable[VoteScoreInput],
) -> list[ComputedBillScore]:
    """Collapse raw vote inputs into one averaged score per bill/topic/legislator/year.

    Inputs whose stance cannot be scored (``score_vote`` returns ``None``)
    are ignored. The result is ordered by
    (bill_id, bill_topic_id, legislator_id, year, topic).
    """
    scores_by_key: dict[tuple[int, int, int, int, str], list[float]] = defaultdict(
        list
    )
    for item in vote_inputs:
        value = score_vote(item.vote_position, item.support_position)
        if value is not None:
            group = (
                item.bill_id,
                item.bill_topic_id,
                item.legislator_id,
                item.year,
                item.topic,
            )
            scores_by_key[group].append(value)

    results: list[ComputedBillScore] = []
    for group in sorted(scores_by_key):
        bill_id, bill_topic_id, legislator_id, year, topic = group
        values = scores_by_key[group]
        results.append(
            ComputedBillScore(
                bill_id=bill_id,
                bill_topic_id=bill_topic_id,
                legislator_id=legislator_id,
                year=year,
                topic=topic,
                # Several votes on the same bill in the same year are averaged.
                score=sum(values) / len(values),
            )
        )
    return results
def calculate_and_store_legislator_scores(
    session: Session,
    *,
    congress: int | None = None,
    bill_ids: list[int] | None = None,
    topics: list[str] | None = None,
    force: bool = False,
    limit: int | None = None,
) -> ScoreRunResult:
    """Score selected bills and refresh aggregate legislator score rows.

    Args:
        session: Open SQLAlchemy session; committed once after all selected
            bills have been scored.
        congress: Restrict to bills from this Congress.
        bill_ids: Restrict to these bill ids.
        topics: Restrict scoring to these topics. ``None`` or an empty list
            means every topic.
        force: Also reprocess bills whose ``score_processed_at`` is set.
        limit: Maximum number of bills to process.

    Returns:
        Totals for processed bills, stored per-bill score rows, and
        refreshed aggregate rows.
    """
    selected_bill_ids = select_bill_ids_to_score(
        session,
        congress=congress,
        bill_ids=bill_ids,
        topics=topics,
        force=force,
        limit=limit,
    )

    # A bill is only marked processed when every topic was scored. The
    # selection and scoring helpers treat an empty topic list as "no filter",
    # so it has to count as a full run here as well; otherwise such bills
    # would be fully scored yet re-selected on every later run.
    scores_all_topics = not topics

    processed_bills = 0
    bill_score_rows = 0
    aggregate_score_rows = 0
    for bill_id in selected_bill_ids:
        stored_rows, refreshed_rows = score_bill(
            session,
            bill_id=bill_id,
            topics=topics,
            mark_processed=scores_all_topics,
        )
        processed_bills += 1
        bill_score_rows += stored_rows
        aggregate_score_rows += refreshed_rows

    session.commit()
    return ScoreRunResult(
        processed_bills=processed_bills,
        bill_score_rows=bill_score_rows,
        aggregate_score_rows=aggregate_score_rows,
    )
def select_bill_ids_to_score(
    session: Session,
    *,
    congress: int | None = None,
    bill_ids: list[int] | None = None,
    topics: list[str] | None = None,
    force: bool = False,
    limit: int | None = None,
) -> list[int]:
    """Return ids of bills that have both topic metadata and votes, in id order.

    Unless ``force`` is set, bills with ``score_processed_at`` already
    populated are excluded. ``congress``, ``bill_ids`` and ``topics`` narrow
    the selection when given (empty lists are ignored); ``limit`` caps the
    number of ids returned.
    """
    filters = []
    if not force:
        filters.append(Bill.score_processed_at.is_(None))
    if congress is not None:
        filters.append(Bill.congress == congress)
    if bill_ids:
        filters.append(Bill.id.in_(bill_ids))
    if topics:
        filters.append(BillTopic.topic.in_(topics))

    # Inner joins keep only bills with at least one topic row and one vote;
    # DISTINCT collapses the row fan-out those joins create.
    query = (
        select(Bill.id)
        .join(BillTopic, BillTopic.bill_id == Bill.id)
        .join(Vote, Vote.bill_id == Bill.id)
    )
    if filters:
        query = query.where(*filters)
    query = query.distinct().order_by(Bill.id)
    if limit is not None:
        query = query.limit(limit)

    return list(session.scalars(query))
def score_bill(
    session: Session,
    *,
    bill_id: int,
    topics: list[str] | None = None,
    mark_processed: bool = True,
) -> tuple[int, int]:
    """Rebuild the per-bill score rows for one bill and refresh affected aggregates.

    Args:
        session: Open SQLAlchemy session; flushed but not committed here.
        bill_id: Bill to score.
        topics: Only rebuild rows for these topics when non-empty.
        mark_processed: Stamp ``Bill.score_processed_at`` with the current
            UTC time when true.

    Returns:
        ``(per-bill rows stored, aggregate rows refreshed)``.
    """
    # Remember which aggregates the old rows fed before deleting them, so
    # aggregates that lose their last source row are refreshed as well.
    stale_keys = _existing_score_keys_for_bill(session, bill_id=bill_id, topics=topics)
    purge_stmt = _delete_bill_scores_statement(bill_id=bill_id, topics=topics)
    session.execute(purge_stmt)
    session.flush()

    vote_inputs = _load_bill_vote_score_inputs(
        session, bill_id=bill_id, topics=topics
    )
    computed = calculate_bill_score_values(vote_inputs)
    new_rows = [
        LegislatorBillScore(
            bill_id=item.bill_id,
            bill_topic_id=item.bill_topic_id,
            legislator_id=item.legislator_id,
            year=item.year,
            topic=item.topic,
            score=item.score,
        )
        for item in computed
    ]
    session.add_all(new_rows)

    scored_bill = session.get(Bill, bill_id) if mark_processed else None
    if scored_bill is not None:
        scored_bill.score_processed_at = datetime.now(tz=UTC)
    session.flush()

    fresh_keys = {(item.legislator_id, item.year, item.topic) for item in computed}
    refreshed = refresh_aggregate_scores(session, stale_keys | fresh_keys)
    return len(computed), refreshed
def refresh_aggregate_scores(session: Session, keys: set[ScoreKey]) -> int:
    """Recompute ``legislator_score`` rows for the given keys.

    Existing aggregate rows for ``keys`` are deleted and replaced by the
    average of the matching ``legislator_bill_score`` rows. Keys with no
    remaining source rows end up with no aggregate row.

    Args:
        session: Open SQLAlchemy session; flushed but not committed here.
        keys: ``(legislator_id, year, topic)`` tuples to refresh.

    Returns:
        Number of aggregate rows written (0 when ``keys`` is empty).
    """
    if not keys:
        return 0
    wanted = list(keys)

    purge_stmt = delete(LegislatorScore).where(
        tuple_(
            LegislatorScore.legislator_id,
            LegislatorScore.year,
            LegislatorScore.topic,
        ).in_(wanted)
    )
    session.execute(purge_stmt)
    session.flush()

    group_columns = (
        LegislatorBillScore.legislator_id,
        LegislatorBillScore.year,
        LegislatorBillScore.topic,
    )
    averages_stmt = (
        select(*group_columns, func.avg(LegislatorBillScore.score).label("score"))
        .where(tuple_(*group_columns).in_(wanted))
        .group_by(*group_columns)
    )
    averages = session.execute(averages_stmt).all()

    replacements = [
        LegislatorScore(
            legislator_id=avg_row.legislator_id,
            year=avg_row.year,
            topic=avg_row.topic,
            score=float(avg_row.score),
        )
        for avg_row in averages
    ]
    session.add_all(replacements)
    session.flush()
    return len(averages)
def _load_bill_vote_score_inputs(
    session: Session,
    *,
    bill_id: int,
    topics: list[str] | None,
) -> list[VoteScoreInput]:
    """Load one scoring input per (vote record, bill topic) pair for a bill.

    Each vote record on the bill is paired with every topic stance of that
    bill, restricted to ``topics`` when non-empty. The year is taken from
    the vote date.
    """
    vote_year = extract("year", Vote.vote_date).cast(Integer).label("year")
    criteria = [Vote.bill_id == bill_id]
    if topics:
        criteria.append(BillTopic.topic.in_(topics))

    query = (
        select(
            Vote.bill_id,
            BillTopic.id.label("bill_topic_id"),
            VoteRecord.legislator_id,
            vote_year,
            BillTopic.topic,
            BillTopic.support_position,
            VoteRecord.position,
        )
        .join(Vote, Vote.id == VoteRecord.vote_id)
        .join(BillTopic, BillTopic.bill_id == Vote.bill_id)
        .where(*criteria)
    )

    inputs: list[VoteScoreInput] = []
    for record in session.execute(query):
        inputs.append(
            VoteScoreInput(
                bill_id=record.bill_id,
                bill_topic_id=record.bill_topic_id,
                legislator_id=record.legislator_id,
                # NOTE(review): int() raises if the year is NULL; assumes
                # vote_date is always populated — confirm against the schema.
                year=int(record.year),
                topic=record.topic,
                support_position=record.support_position,
                vote_position=record.position,
            )
        )
    return inputs
def _existing_score_keys_for_bill(
    session: Session,
    *,
    bill_id: int,
    topics: list[str] | None,
) -> set[ScoreKey]:
    """Return the (legislator_id, year, topic) keys of a bill's stored score rows.

    Only rows for ``topics`` are considered when that list is non-empty.
    """
    criteria = [LegislatorBillScore.bill_id == bill_id]
    if topics:
        criteria.append(LegislatorBillScore.topic.in_(topics))

    query = select(
        LegislatorBillScore.legislator_id,
        LegislatorBillScore.year,
        LegislatorBillScore.topic,
    ).where(*criteria)

    found: set[ScoreKey] = set()
    for legislator_id, year, topic in session.execute(query):
        found.add((legislator_id, year, topic))
    return found
def _delete_bill_scores_statement(*, bill_id: int, topics: list[str] | None):
    """Build (without executing) a DELETE for a bill's per-bill score rows.

    The statement is limited to ``topics`` when that list is non-empty.
    """
    criteria = [LegislatorBillScore.bill_id == bill_id]
    if topics:
        criteria.append(LegislatorBillScore.topic.in_(topics))
    return delete(LegislatorBillScore).where(*criteria)
def main() -> None:
    """Parse command-line options, run the scoring job, and print a summary."""
    cli = argparse.ArgumentParser(
        description="Calculate legislator_score rows from bill_topic and vote_record data."
    )
    cli.add_argument("--congress", type=int, help="Only score bills from one Congress.")
    # --bill-id and --topic are repeatable; each use appends to the list.
    cli.add_argument(
        "--bill-id",
        dest="bill_ids",
        action="append",
        type=int,
        help="Only score one bill id. Repeat for multiple bills.",
    )
    cli.add_argument(
        "--topic",
        dest="topics",
        action="append",
        help="Only calculate one topic. Repeat for multiple topics.",
    )
    cli.add_argument(
        "--force",
        action="store_true",
        help="Reprocess bills even when bill.score_processed_at is already set.",
    )
    cli.add_argument("--limit", type=int, help="Maximum number of bills to process.")
    options = cli.parse_args()

    with Session(get_postgres_engine(name="DATA_SCIENCE_DEV")) as session:
        outcome = calculate_and_store_legislator_scores(
            session,
            congress=options.congress,
            bill_ids=options.bill_ids,
            topics=options.topics,
            force=options.force,
            limit=options.limit,
        )

    summary = (
        f"Processed {outcome.processed_bills} bills; "
        f"stored {outcome.bill_score_rows} bill score rows; "
        f"refreshed {outcome.aggregate_score_rows} aggregate score rows."
    )
    print(summary)
if __name__ == "__main__":
main()