added orm code

2026-04-14 18:17:35 -04:00
parent 7979dc3328
commit 2abd61d3b1
14 changed files with 527 additions and 0 deletions
--- a/pipelines/orm/__init__.py
+++ b/pipelines/orm/__init__.py
@@ -0,0 +1,7 @@
 """ORM package exports.

 Re-exports ``DataScienceDevBase`` from ``pipelines.orm.data_science_dev.base``.
 """
 from pipelines.orm.data_science_dev.base import DataScienceDevBase
 # Public names of this package.
 __all__ = [
    "DataScienceDevBase",
 ]
--- a/pipelines/orm/common.py
+++ b/pipelines/orm/common.py
@@ -0,0 +1,51 @@
 """Shared ORM definitions."""
 from __future__ import annotations
 from os import getenv
 from typing import cast
 from sqlalchemy import create_engine
 from sqlalchemy.engine import URL, Engine
 # Name templates for generated constraints and indexes, keyed by SQLAlchemy's
 # convention codes: ix = index, uq = unique constraint, ck = check constraint,
 # fk = foreign key, pk = primary key. Intended to be passed to
 # ``MetaData(naming_convention=...)``.
 NAMING_CONVENTION = {
    "ix": "ix_%(table_name)s_%(column_0_name)s",
    "uq": "uq_%(table_name)s_%(column_0_name)s",
    "ck": "ck_%(table_name)s_%(constraint_name)s",
    "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
    "pk": "pk_%(table_name)s",
 }
 def get_connection_info(name: str) -> tuple[str, str, str, str, str | None]:
    """Read Postgres connection settings from ``{name}_*`` environment variables.

    Args:
        name: Prefix of the variables to read. ``"POSTGRES"`` reads
            ``POSTGRES_DB``, ``POSTGRES_HOST``, ``POSTGRES_PORT``,
            ``POSTGRES_USER`` and ``POSTGRES_PASSWORD``.

    Returns:
        ``(database, host, port, username, password)``. ``port`` is returned
        as the raw string from the environment; ``password`` is ``None`` when
        ``{name}_PASSWORD`` is unset.

    Raises:
        ValueError: If any of the DB, HOST, PORT or USER variables is unset.
            The message names the missing variables.
    """
    database = getenv(f"{name}_DB")
    host = getenv(f"{name}_HOST")
    port = getenv(f"{name}_PORT")
    username = getenv(f"{name}_USER")
    # The password is optional, so it is read but never validated.
    password = getenv(f"{name}_PASSWORD")
    required = {
        f"{name}_DB": database,
        f"{name}_HOST": host,
        f"{name}_PORT": port,
        f"{name}_USER": username,
    }
    missing = [variable for variable, value in required.items() if value is None]
    if missing:
        # Name the actual environment variables so the caller knows what to set,
        # then keep the resolved values for debugging (password is never printed).
        error = (
            "Missing environment variables for Postgres connection: "
            f"{', '.join(missing)}.\n{database=}\n{host=}\n{port=}\n{username=}\n"
        )
        raise ValueError(error)
    return cast("tuple[str, str, str, str, str | None]", (database, host, port, username, password))
 def get_postgres_engine(*, name: str = "POSTGRES", pool_pre_ping: bool = True) -> Engine:
    """Build a SQLAlchemy engine from the ``{name}_*`` environment variables.

    Args:
        name: Environment-variable prefix handed to ``get_connection_info``.
        pool_pre_ping: Forwarded unchanged to ``create_engine``.

    Returns:
        An ``Engine`` using the ``postgresql+psycopg`` driver with
        ``pool_recycle=1800``.

    Raises:
        ValueError: Propagated from ``get_connection_info`` when a required
            variable is unset, or from ``int()`` when the port is not numeric.
    """
    db_name, db_host, db_port, db_user, db_password = get_connection_info(name)
    connection_url = URL.create(
        "postgresql+psycopg",
        username=db_user,
        password=db_password,
        host=db_host,
        # The environment supplies the port as text; URL.create expects an int.
        port=int(db_port),
        database=db_name,
    )
    return create_engine(connection_url, pool_pre_ping=pool_pre_ping, pool_recycle=1800)
--- a/pipelines/orm/data_science_dev/__init__.py
+++ b/pipelines/orm/data_science_dev/__init__.py
@@ -0,0 +1,15 @@
 """Data science dev database ORM exports.

 Re-exports the declarative base and the two abstract table bases defined in
 ``pipelines.orm.data_science_dev.base``.
 """
 from __future__ import annotations
 from pipelines.orm.data_science_dev.base import (
    DataScienceDevBase,
    DataScienceDevTableBase,
    DataScienceDevTableBaseBig,
 )
 # Public names of this package.
 __all__ = [
    "DataScienceDevBase",
    "DataScienceDevTableBase",
    "DataScienceDevTableBaseBig",
 ]
--- a/pipelines/orm/data_science_dev/base.py
+++ b/pipelines/orm/data_science_dev/base.py
@@ -0,0 +1,52 @@
 """Data science dev database ORM base."""
 from __future__ import annotations
 from datetime import datetime
 from sqlalchemy import BigInteger, DateTime, MetaData, func
 from sqlalchemy.ext.declarative import AbstractConcreteBase
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
 from pipelines.orm.common import NAMING_CONVENTION
 class DataScienceDevBase(DeclarativeBase):
    """Declarative base for data_science_dev database ORM models.

    All models deriving from this base share one ``MetaData`` object whose
    default schema is ``schema_name`` and whose constraint/index names follow
    ``NAMING_CONVENTION``.
    """
    # Default schema for every table registered on this base's metadata.
    schema_name = "main"
    # Shared metadata; overriding ``metadata`` on the base is how the schema and
    # naming convention are applied to all subclasses.
    metadata = MetaData(
        schema=schema_name,
        naming_convention=NAMING_CONVENTION,
    )
 class _TableMixin:
    """Mixin adding timezone-aware ``created`` and ``updated`` timestamp columns.

    Both columns default to ``now()`` on the database side when a row is
    inserted without an explicit value.
    """
    # Set by the database default at insert time.
    created: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )
    # ``onupdate`` is applied by SQLAlchemy in the UPDATE statements it emits;
    # it is not a database trigger, so updates issued outside SQLAlchemy do
    # not refresh this column.
    updated: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
    )
 class DataScienceDevTableBase(_TableMixin, AbstractConcreteBase, DataScienceDevBase):
    """Abstract base for tables with an ``id`` primary key plus ``created``/``updated``.

    ``id`` has no explicit column type, so it is derived from the ``int``
    annotation. Subclasses supply ``__tablename__`` and their own columns.
    """
    # No table is mapped for this class itself; only subclasses get tables.
    # NOTE(review): ``AbstractConcreteBase`` is SQLAlchemy's helper for
    # concrete-table inheritance; confirm it is needed in addition to
    # ``__abstract__ = True`` for these independent tables.
    __abstract__ = True
    id: Mapped[int] = mapped_column(primary_key=True)
 class DataScienceDevTableBaseBig(_TableMixin, AbstractConcreteBase, DataScienceDevBase):
    """Abstract base for tables with a ``BigInteger`` ``id`` primary key.

    Otherwise identical to the Integer-keyed table base: it also contributes
    the ``created``/``updated`` timestamp columns from ``_TableMixin``.
    """
    # No table is mapped for this class itself; only subclasses get tables.
    __abstract__ = True
    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
--- a/pipelines/orm/data_science_dev/congress/__init__.py
+++ b/pipelines/orm/data_science_dev/congress/__init__.py
@@ -0,0 +1,17 @@
 """Congress ORM models: bills, legislators, and roll call votes."""
 from pipelines.orm.data_science_dev.congress.bill import Bill, BillText
 from pipelines.orm.data_science_dev.congress.legislator import (
    Legislator,
    LegislatorSocialMedia,
 )
 from pipelines.orm.data_science_dev.congress.vote import Vote, VoteRecord
 # Public names of this package.
 __all__ = [
    "Bill",
    "BillText",
    "Legislator",
    "LegislatorSocialMedia",
    "Vote",
    "VoteRecord",
 ]
--- a/pipelines/orm/data_science_dev/congress/bill.py
+++ b/pipelines/orm/data_science_dev/congress/bill.py
@@ -0,0 +1,72 @@
 """Bill model - legislation introduced in Congress."""
 from __future__ import annotations
 from datetime import date
 from typing import TYPE_CHECKING
 from sqlalchemy import ForeignKey, Index, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from pipelines.orm.data_science_dev.base import DataScienceDevTableBase
 if TYPE_CHECKING:
    from pipelines.orm.data_science_dev.congress.vote import Vote
 class Bill(DataScienceDevTableBase):
    """Legislation with congress number, type, titles, status, and sponsor.

    Stored in table ``bill``. Inherits ``id``, ``created`` and ``updated``
    from ``DataScienceDevTableBase``. A bill is unique per
    ``(congress, bill_type, number)``.
    """
    __tablename__ = "bill"
    # Natural key components (see the unique constraint below).
    congress: Mapped[int]
    bill_type: Mapped[str]
    number: Mapped[int]
    # Optional descriptive fields.
    title: Mapped[str | None]
    title_short: Mapped[str | None]
    official_title: Mapped[str | None]
    status: Mapped[str | None]
    status_at: Mapped[date | None]
    # Plain nullable string; no ForeignKey is declared on this column.
    # NOTE(review): presumably matches ``Legislator.bioguide_id`` - confirm.
    sponsor_bioguide_id: Mapped[str | None]
    subjects_top_term: Mapped[str | None]
    # Votes referencing this bill; no cascade is configured on this side.
    votes: Mapped[list[Vote]] = relationship(
        "Vote",
        back_populates="bill",
    )
    # Text versions owned by this bill; removed along with it or when dropped
    # from the collection (delete-orphan).
    bill_texts: Mapped[list[BillText]] = relationship(
        "BillText",
        back_populates="bill",
        cascade="all, delete-orphan",
    )
    __table_args__ = (
        # Explicitly named uniqueness on the natural key.
        UniqueConstraint(
            "congress", "bill_type", "number", name="uq_bill_congress_type_number"
        ),
        Index("ix_bill_congress", "congress"),
    )
 class BillText(DataScienceDevTableBase):
    """Stores different text versions of a bill (introduced, enrolled, etc.).

    Stored in table ``bill_text``. Each ``(bill_id, version_code)`` pair is
    unique. Inherits ``id``, ``created`` and ``updated`` from
    ``DataScienceDevTableBase``.
    """
    __tablename__ = "bill_text"
    # The target is schema-qualified ("main"); the database deletes these rows
    # when the referenced bill row is deleted (ON DELETE CASCADE).
    bill_id: Mapped[int] = mapped_column(ForeignKey("main.bill.id", ondelete="CASCADE"))
    version_code: Mapped[str]
    version_name: Mapped[str | None]
    text_content: Mapped[str | None]
    # NOTE(review): the attribute name ``date`` shadows the imported ``date``
    # type inside this class body; static type checkers may flag the
    # annotation - confirm.
    date: Mapped[date | None]
    bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts")
    __table_args__ = (
        UniqueConstraint(
            "bill_id", "version_code", name="uq_bill_text_bill_id_version_code"
        ),
    )
--- a/pipelines/orm/data_science_dev/congress/legislator.py
+++ b/pipelines/orm/data_science_dev/congress/legislator.py
@@ -0,0 +1,68 @@
 """Legislator model - members of Congress."""
 from __future__ import annotations
 from datetime import date
 from typing import TYPE_CHECKING
 from sqlalchemy import ForeignKey, Text
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from pipelines.orm.data_science_dev.base import DataScienceDevTableBase
 if TYPE_CHECKING:
    from pipelines.orm.data_science_dev.congress.vote import VoteRecord
 class Legislator(DataScienceDevTableBase):
    """Members of Congress with identification and current term info.

    Stored in table ``legislator``. Inherits ``id``, ``created`` and
    ``updated`` from ``DataScienceDevTableBase``. ``bioguide_id`` is unique
    and indexed.
    """
    __tablename__ = "legislator"
    # External identifiers; only ``bioguide_id`` is required.
    bioguide_id: Mapped[str] = mapped_column(Text, unique=True, index=True)
    thomas_id: Mapped[str | None]
    lis_id: Mapped[str | None]
    govtrack_id: Mapped[int | None]
    opensecrets_id: Mapped[str | None]
    # Stored as a single nullable string.
    # NOTE(review): the encoding of multiple FEC ids is not visible here - confirm.
    fec_ids: Mapped[str | None]
    # Personal details.
    first_name: Mapped[str]
    last_name: Mapped[str]
    official_full_name: Mapped[str | None]
    nickname: Mapped[str | None]
    birthday: Mapped[date | None]
    gender: Mapped[str | None]
    # Current-term attributes, all nullable.
    current_party: Mapped[str | None]
    current_state: Mapped[str | None]
    current_district: Mapped[int | None]
    current_chamber: Mapped[str | None]
    # Owned collections: children are deleted with the legislator or when
    # removed from the collection (delete-orphan).
    social_media_accounts: Mapped[list[LegislatorSocialMedia]] = relationship(
        "LegislatorSocialMedia",
        back_populates="legislator",
        cascade="all, delete-orphan",
    )
    vote_records: Mapped[list[VoteRecord]] = relationship(
        "VoteRecord",
        back_populates="legislator",
        cascade="all, delete-orphan",
    )
 class LegislatorSocialMedia(DataScienceDevTableBase):
    """Social media account linked to a legislator.

    Stored in table ``legislator_social_media``. Inherits ``id``, ``created``
    and ``updated`` from ``DataScienceDevTableBase``.
    """
    __tablename__ = "legislator_social_media"
    # NOTE(review): unlike the foreign keys on ``BillText`` and ``VoteRecord``,
    # this one declares no ``ondelete``; deletes issued outside the ORM would
    # not cascade - confirm this is intended.
    legislator_id: Mapped[int] = mapped_column(ForeignKey("main.legislator.id"))
    platform: Mapped[str]
    account_name: Mapped[str]
    url: Mapped[str | None]
    source: Mapped[str]
    # The target class is taken from the ``Mapped[Legislator]`` annotation.
    legislator: Mapped[Legislator] = relationship(
        back_populates="social_media_accounts"
    )
--- a/pipelines/orm/data_science_dev/congress/vote.py
+++ b/pipelines/orm/data_science_dev/congress/vote.py
@@ -0,0 +1,84 @@
 """Vote model - roll call votes in Congress."""
 from __future__ import annotations
 from datetime import date
 from typing import TYPE_CHECKING
 from sqlalchemy import ForeignKey, Index, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from pipelines.orm.data_science_dev.base import (
    DataScienceDevBase,
    DataScienceDevTableBase,
 )
 if TYPE_CHECKING:
    from pipelines.orm.data_science_dev.congress.bill import Bill
    from pipelines.orm.data_science_dev.congress.legislator import Legislator
    from pipelines.orm.data_science_dev.congress.vote import Vote
 class VoteRecord(DataScienceDevBase):
    """Links a vote to a legislator with their position (Yea, Nay, etc.).

    Stored in table ``vote_record`` with the composite primary key
    ``(vote_id, legislator_id)``. It derives from ``DataScienceDevBase``
    directly, so it has no surrogate ``id`` and no ``created``/``updated``
    columns.
    """
    __tablename__ = "vote_record"
    # Both halves of the primary key are foreign keys with ON DELETE CASCADE.
    vote_id: Mapped[int] = mapped_column(
        ForeignKey("main.vote.id", ondelete="CASCADE"),
        primary_key=True,
    )
    legislator_id: Mapped[int] = mapped_column(
        ForeignKey("main.legislator.id", ondelete="CASCADE"),
        primary_key=True,
    )
    position: Mapped[str]
    # "Vote" is referenced by name because the class is defined later in this module.
    vote: Mapped[Vote] = relationship("Vote", back_populates="vote_records")
    legislator: Mapped[Legislator] = relationship(
        "Legislator", back_populates="vote_records"
    )
 class Vote(DataScienceDevTableBase):
    """Roll call votes with counts and optional bill linkage.

    Stored in table ``vote``. Inherits ``id``, ``created`` and ``updated``
    from ``DataScienceDevTableBase``. A vote is unique per
    ``(congress, chamber, session, number)``.
    """
    __tablename__ = "vote"
    # Natural key components (see the unique constraint below).
    congress: Mapped[int]
    chamber: Mapped[str]
    session: Mapped[int]
    number: Mapped[int]
    # Optional descriptive fields.
    vote_type: Mapped[str | None]
    question: Mapped[str | None]
    result: Mapped[str | None]
    result_text: Mapped[str | None]
    vote_date: Mapped[date]
    # Tallies, all nullable.
    yea_count: Mapped[int | None]
    nay_count: Mapped[int | None]
    not_voting_count: Mapped[int | None]
    present_count: Mapped[int | None]
    # Nullable link to a bill; no ``ondelete`` is declared on this foreign key.
    bill_id: Mapped[int | None] = mapped_column(ForeignKey("main.bill.id"))
    bill: Mapped[Bill | None] = relationship("Bill", back_populates="votes")
    # Per-legislator positions owned by this vote; deleted with it or when
    # removed from the collection (delete-orphan).
    vote_records: Mapped[list[VoteRecord]] = relationship(
        "VoteRecord",
        back_populates="vote",
        cascade="all, delete-orphan",
    )
    __table_args__ = (
        # Explicitly named uniqueness on the natural key.
        UniqueConstraint(
            "congress",
            "chamber",
            "session",
            "number",
            name="uq_vote_congress_chamber_session_number",
        ),
        Index("ix_vote_date", "vote_date"),
        Index("ix_vote_congress_chamber", "congress", "chamber"),
    )
--- a/pipelines/orm/data_science_dev/models.py
+++ b/pipelines/orm/data_science_dev/models.py
@@ -0,0 +1,16 @@
 """Data science dev database ORM models.

 Importing this module imports every model module of the data_science_dev
 database (congress tables, the posts parent table, the generated weekly posts
 partitions, and the failed-ingestion table), and re-exports the model classes.
 """
 from __future__ import annotations
 from pipelines.orm.data_science_dev.congress import (
    Bill,
    BillText,
    Legislator,
    LegislatorSocialMedia,
    Vote,
    VoteRecord,
 )
 from pipelines.orm.data_science_dev.posts import partitions  # noqa: F401 — registers partition classes in metadata
 from pipelines.orm.data_science_dev.posts.failed_ingestion import FailedIngestion
 from pipelines.orm.data_science_dev.posts.tables import Posts
 # Mirrors what the congress and posts packages export, so every model class is
 # reachable from this one module.
 __all__ = [
    "Bill",
    "BillText",
    "FailedIngestion",
    "Legislator",
    "LegislatorSocialMedia",
    "Posts",
    "Vote",
    "VoteRecord",
 ]
--- a/pipelines/orm/data_science_dev/posts/__init__.py
+++ b/pipelines/orm/data_science_dev/posts/__init__.py
@@ -0,0 +1,11 @@
 """Posts package: the partitioned ``Posts`` parent table and ``FailedIngestion``.

 The weekly partition ORM classes live in the ``partitions`` submodule and are
 not imported or re-exported here.
 """
 from __future__ import annotations
 from pipelines.orm.data_science_dev.posts.failed_ingestion import FailedIngestion
 from pipelines.orm.data_science_dev.posts.tables import Posts
 # Public names of this package.
 __all__ = [
    "FailedIngestion",
    "Posts",
 ]
--- a/pipelines/orm/data_science_dev/posts/columns.py
+++ b/pipelines/orm/data_science_dev/posts/columns.py
@@ -0,0 +1,33 @@
 """Shared column definitions for the posts partitioned table family."""
 from __future__ import annotations
 from datetime import datetime
 from sqlalchemy import BigInteger, SmallInteger, Text
 from sqlalchemy.orm import Mapped, mapped_column
 class PostsColumns:
    """Mixin providing all posts columns. Used by both the parent table and partitions.

    The primary key is the composite ``(post_id, date)``.
    """
    post_id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    user_id: Mapped[int] = mapped_column(BigInteger)
    instance: Mapped[str]
    # Second half of the composite primary key.
    # NOTE(review): no ``DateTime(timezone=True)`` is given here, unlike the
    # ``created``/``updated`` columns of the table bases; confirm whether a
    # timezone-naive timestamp is intended.
    date: Mapped[datetime] = mapped_column(primary_key=True)
    text: Mapped[str] = mapped_column(Text)
    langs: Mapped[str | None]
    # Required counters (non-nullable).
    like_count: Mapped[int]
    reply_count: Mapped[int]
    repost_count: Mapped[int]
    # Nullable BigInteger columns; no foreign keys are declared.
    # NOTE(review): presumably ids of related posts and their authors - confirm
    # against the ingestion code.
    reply_to: Mapped[int | None] = mapped_column(BigInteger)
    replied_author: Mapped[int | None] = mapped_column(BigInteger)
    thread_root: Mapped[int | None] = mapped_column(BigInteger)
    thread_root_author: Mapped[int | None] = mapped_column(BigInteger)
    repost_from: Mapped[int | None] = mapped_column(BigInteger)
    reposted_author: Mapped[int | None] = mapped_column(BigInteger)
    quotes: Mapped[int | None] = mapped_column(BigInteger)
    quoted_author: Mapped[int | None] = mapped_column(BigInteger)
    labels: Mapped[str | None]
    # NOTE(review): presumably sentiment label/score - confirm value ranges.
    sent_label: Mapped[int | None] = mapped_column(SmallInteger)
    sent_score: Mapped[float | None]
--- a/pipelines/orm/data_science_dev/posts/failed_ingestion.py
+++ b/pipelines/orm/data_science_dev/posts/failed_ingestion.py
@@ -0,0 +1,17 @@
 """Table for storing JSONL lines that failed during post ingestion."""
 from __future__ import annotations
 from sqlalchemy import Text
 from sqlalchemy.orm import Mapped, mapped_column
 from pipelines.orm.data_science_dev.base import DataScienceDevTableBase
 class FailedIngestion(DataScienceDevTableBase):
    """Stores raw JSONL lines and their error messages when ingestion fails.

    Stored in table ``failed_ingestion``. Inherits ``id``, ``created`` and
    ``updated`` from ``DataScienceDevTableBase``. Both columns are required
    ``Text`` columns.
    """
    __tablename__ = "failed_ingestion"
    # The input line exactly as read.
    raw_line: Mapped[str] = mapped_column(Text)
    # Error message recorded for that line.
    error: Mapped[str] = mapped_column(Text)
--- a/pipelines/orm/data_science_dev/posts/partitions.py
+++ b/pipelines/orm/data_science_dev/posts/partitions.py
@@ -0,0 +1,71 @@
 """Dynamically generated ORM classes for each weekly partition of the posts table.
 Each class maps to a PostgreSQL partition table (e.g. posts_2024_01).
 These are real ORM models tracked by Alembic autogenerate.
 Uses ISO week numbering (datetime.isocalendar().week). ISO years can have
 52 or 53 weeks, and week boundaries are always Monday to Monday.
 """
 from __future__ import annotations
 import sys
 from datetime import UTC, datetime
 from pipelines.orm.data_science_dev.base import DataScienceDevBase
 from pipelines.orm.data_science_dev.posts.columns import PostsColumns
 # Inclusive range of ISO years for which weekly partition classes are generated.
 PARTITION_START_YEAR = 2023
 PARTITION_END_YEAR = 2026
 # Handle to this module object, used to attach the generated classes as attributes.
 _current_module = sys.modules[__name__]
 def iso_weeks_in_year(year: int) -> int:
    """Return the number of ISO weeks in a given year (52 or 53)."""
    dec_28 = datetime(year, 12, 28, tzinfo=UTC)
    return dec_28.isocalendar().week
 def week_bounds(year: int, week: int) -> tuple[datetime, datetime]:
    """Return the half-open ``(start, end)`` UTC interval covering an ISO week.

    ``start`` is Monday 00:00:00 UTC of ISO week ``week`` in ISO year ``year``;
    ``end`` is Monday 00:00:00 UTC of the ISO week that follows it, which is
    week 1 of ``year + 1`` when ``week`` is the last week of ``year``.
    """
    opening = datetime.fromisocalendar(year, week, 1).replace(tzinfo=UTC)
    # The last week of the year rolls over into week 1 of the next ISO year.
    rolls_over = week >= iso_weeks_in_year(year)
    next_year, next_week = (year + 1, 1) if rolls_over else (year, week + 1)
    closing = datetime.fromisocalendar(next_year, next_week, 1).replace(tzinfo=UTC)
    return opening, closing
 def _build_partition_classes() -> dict[str, type]:
    """Create one ORM class for every ISO week of the configured year range.

    Years run from ``PARTITION_START_YEAR`` through ``PARTITION_END_YEAR``
    inclusive. Each class is named ``PostsWeek<year>W<week>`` and mapped to the
    table ``posts_<year>_<week>``, with the week zero-padded to two digits.

    Returns:
        Mapping of generated class name to class, in (year, week) order.
    """
    generated: dict[str, type] = {}
    year_week_pairs = (
        (year, week)
        for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1)
        for week in range(1, iso_weeks_in_year(year) + 1)
    )
    for year, week in year_week_pairs:
        name = f"PostsWeek{year}W{week:02d}"
        namespace = {
            "__tablename__": f"posts_{year}_{week:02d}",
            "__table_args__": ({"implicit_returning": False},),
        }
        # Same bases as a hand-written model: the column mixin first, then the declarative base.
        generated[name] = type(name, (PostsColumns, DataScienceDevBase), namespace)
    return generated
 # Build every partition class once at import time. Creating the classes
 # registers their tables on the declarative base's metadata.
 _partition_classes = _build_partition_classes()
 # Expose each generated class as a module attribute so it can be imported by name.
 for _name, _cls in _partition_classes.items():
    setattr(_current_module, _name, _cls)
 # The module's public names are exactly the generated partition class names.
 __all__ = list(_partition_classes.keys())
--- a/pipelines/orm/data_science_dev/posts/tables.py
+++ b/pipelines/orm/data_science_dev/posts/tables.py
@@ -0,0 +1,13 @@
 """Posts parent table with PostgreSQL weekly range partitioning on date column."""
 from __future__ import annotations
 from pipelines.orm.data_science_dev.base import DataScienceDevBase
 from pipelines.orm.data_science_dev.posts.columns import PostsColumns
 class Posts(PostsColumns, DataScienceDevBase):
    """Parent partitioned table for posts, partitioned by week on `date`.

    Stored in table ``posts``; all columns come from ``PostsColumns``.
    """
    __tablename__ = "posts"
    # PostgreSQL dialect option: declares the table as range-partitioned on
    # the ``date`` column.
    __table_args__ = ({"postgresql_partition_by": "RANGE (date)"},)