updated ingest_congress to use congress-legislators for legislator info

2026-03-29 12:02:29 -04:00
parent e368402eea
commit ba8ff35109
1 changed files with 179 additions and 64 deletions
@@ -1,11 +1,12 @@
 """Ingestion pipeline for loading congress data from unitedstates/congress JSON files.
 Loads legislators, bills, votes, vote records, and bill text into the data_science_dev database.
 Expects the parent directory to contain congress-tracker/ and congress-legislators/ as siblings.
 Usage:
-    ingest-congress /path/to/congress/data/
+    ingest-congress /path/to/parent/
-    ingest-congress /path/to/congress/data/ --congress 118
+    ingest-congress /path/to/parent/ --congress 118
-    ingest-congress /path/to/congress/data/ --congress 118 --only bills
+    ingest-congress /path/to/parent/ --congress 118 --only bills
 """
 from __future__ import annotations
@@ -16,12 +17,13 @@ from typing import TYPE_CHECKING, Annotated
 import orjson
 import typer
 import yaml
 from sqlalchemy import select
 from sqlalchemy.orm import Session
 from python.common import configure_logger
 from python.orm.common import get_postgres_engine
-from python.orm.data_science_dev.congress import Bill, BillText, Legislator, Vote, VoteRecord
+from python.orm.data_science_dev.congress import Bill, BillText, Legislator, LegislatorSocialMedia, Vote, VoteRecord
 if TYPE_CHECKING:
    from collections.abc import Iterator
@@ -37,18 +39,28 @@ app = typer.Typer(help="Ingest unitedstates/congress data into data_science_dev.
@app.command()
 def main(
-    data_dir: Annotated[Path, typer.Argument(help="Path to the congress data/ directory")],
+    parent_dir: Annotated[
        Path,
        typer.Argument(help="Parent directory containing congress-tracker/ and congress-legislators/"),
    ],
    congress: Annotated[int | None, typer.Option(help="Only ingest a specific congress number")] = None,
    only: Annotated[
        str | None,
-        typer.Option(help="Only run a specific step: legislators, bills, votes, bill-text"),
+        typer.Option(help="Only run a specific step: legislators, social-media, bills, votes, bill-text"),
    ] = None,
 ) -> None:
    """Ingest congress data from unitedstates/congress JSON files."""
    configure_logger(level="INFO")
    data_dir = parent_dir / "congress-tracker/congress/data/"
    legislators_dir = parent_dir / "congress-legislators"
    if not data_dir.is_dir():
-        typer.echo(f"Data directory does not exist: {data_dir}", err=True)
+        typer.echo(f"Expected congress-tracker/ directory: {data_dir}", err=True)
        raise typer.Exit(code=1)
    if not legislators_dir.is_dir():
        typer.echo(f"Expected congress-legislators/ directory: {legislators_dir}", err=True)
        raise typer.Exit(code=1)
    engine = get_postgres_engine(name="DATA_SCIENCE_DEV")
@@ -60,11 +72,12 @@ def main(
    logger.info("Found %d congress directories to process", len(congress_dirs))
-    steps = {
+    steps: dict[str, tuple] = {
-        "legislators": ingest_legislators,
+        "legislators": (ingest_legislators, (engine, legislators_dir)),
-        "bills": ingest_bills,
+        "legislators-social-media": (ingest_social_media, (engine, legislators_dir)),
-        "votes": ingest_votes,
+        "bills": (ingest_bills, (engine, congress_dirs)),
-        "bill-text": ingest_bill_text,
+        "votes": (ingest_votes, (engine, congress_dirs)),
        "bill-text": (ingest_bill_text, (engine, congress_dirs)),
    }
    if only:
@@ -73,9 +86,9 @@ def main(
            raise typer.Exit(code=1)
        steps = {only: steps[only]}
-    for step_name, step_func in steps.items():
+    for step_name, (step_func, step_args) in steps.items():
        logger.info("=== Starting step: %s ===", step_name)
-        step_func(engine, congress_dirs)
+        step_func(*step_args)
        logger.info("=== Finished step: %s ===", step_name)
    logger.info("ingest-congress done")
@@ -102,33 +115,166 @@ def _flush_batch(session: Session, batch: list[object], label: str) -> int:
 # ---------------------------------------------------------------------------
-# Legislators — extracted from vote JSON files (voter records include bioguide_id, name, party, state)
+# Legislators — loaded from congress-legislators YAML files
 # ---------------------------------------------------------------------------
-def ingest_legislators(engine: Engine, congress_dirs: list[Path]) -> None:
+def ingest_legislators(engine: Engine, legislators_dir: Path) -> None:
-    """Extract unique legislators from vote data and insert them."""
+    """Load legislators from congress-legislators YAML files."""
    legislators_data = _load_legislators_yaml(legislators_dir)
    logger.info("Loaded %d legislators from YAML files", len(legislators_data))
    with Session(engine) as session:
-        seen_bioguide_ids = set(session.scalars(select(Legislator.bioguide_id)).all())
+        existing_legislators = {
-        logger.info("Found %d existing legislators in DB", len(seen_bioguide_ids))
+            legislator.bioguide_id: legislator for legislator in session.scalars(select(Legislator)).all()
        }
        logger.info("Found %d existing legislators in DB", len(existing_legislators))
        total_inserted = 0
-        batch: list[Legislator] = []
+        total_updated = 0
-        for congress_dir in congress_dirs:
+        for entry in legislators_data:
-            votes_dir = congress_dir / "votes"
+            bioguide_id = entry.get("id", {}).get("bioguide")
-            if not votes_dir.is_dir():
+            if not bioguide_id:
                continue
            logger.info("Scanning legislators from %s", congress_dir.name)
            for vote_file in votes_dir.rglob("data.json"):
                data = _read_json(vote_file)
                if data is None:
                    continue
                _extract_legislators(data, seen_bioguide_ids, batch)
                if len(batch) >= BATCH_SIZE:
                    total_inserted += _flush_batch(session, batch, "legislators")
-        total_inserted += _flush_batch(session, batch, "legislators")
+            fields = _parse_legislator(entry)
-    logger.info("Inserted %d new legislators total", total_inserted)
+            if existing := existing_legislators.get(bioguide_id):
                changed = False
                for field, value in fields.items():
                    if value is not None and getattr(existing, field) != value:
                        setattr(existing, field, value)
                        changed = True
                if changed:
                    total_updated += 1
            else:
                session.add(Legislator(bioguide_id=bioguide_id, **fields))
                total_inserted += 1
        session.commit()
    logger.info("Inserted %d new legislators, updated %d existing", total_inserted, total_updated)
 def _load_legislators_yaml(legislators_dir: Path) -> list[dict]:
    """Load and combine legislators-current.yaml and legislators-historical.yaml."""
    legislators: list[dict] = []
    for filename in ("legislators-current.yaml", "legislators-historical.yaml"):
        path = legislators_dir / filename
        if not path.exists():
            logger.warning("Legislators file not found: %s", path)
            continue
        with path.open() as file:
            data = yaml.safe_load(file)
            if isinstance(data, list):
                legislators.extend(data)
    return legislators
 def _parse_legislator(entry: dict) -> dict:
    """Extract Legislator fields from a congress-legislators YAML entry."""
    ids = entry.get("id", {})
    name = entry.get("name", {})
    bio = entry.get("bio", {})
    terms = entry.get("terms", [])
    latest_term = terms[-1] if terms else {}
    fec_ids = ids.get("fec")
    fec_ids_joined = ",".join(fec_ids) if isinstance(fec_ids, list) else fec_ids
    chamber = latest_term.get("type")
    chamber_normalized = {"rep": "House", "sen": "Senate"}.get(chamber, chamber)
    return {
        "thomas_id": ids.get("thomas"),
        "lis_id": ids.get("lis"),
        "govtrack_id": ids.get("govtrack"),
        "opensecrets_id": ids.get("opensecrets"),
        "fec_ids": fec_ids_joined,
        "first_name": name.get("first"),
        "last_name": name.get("last"),
        "official_full_name": name.get("official_full"),
        "nickname": name.get("nickname"),
        "birthday": bio.get("birthday"),
        "gender": bio.get("gender"),
        "current_party": latest_term.get("party"),
        "current_state": latest_term.get("state"),
        "current_district": latest_term.get("district"),
        "current_chamber": chamber_normalized,
    }
 # ---------------------------------------------------------------------------
 # Social Media — loaded from legislators-social-media.yaml
 # ---------------------------------------------------------------------------
 SOCIAL_MEDIA_PLATFORMS = {
    "twitter": "https://twitter.com/{account}",
    "facebook": "https://facebook.com/{account}",
    "youtube": "https://youtube.com/{account}",
    "instagram": "https://instagram.com/{account}",
    "mastodon": None,
 }
 def ingest_social_media(engine: Engine, legislators_dir: Path) -> None:
    """Load social media accounts from legislators-social-media.yaml."""
    social_media_path = legislators_dir / "legislators-social-media.yaml"
    if not social_media_path.exists():
        logger.warning("Social media file not found: %s", social_media_path)
        return
    with social_media_path.open() as file:
        social_media_data = yaml.safe_load(file)
    if not isinstance(social_media_data, list):
        logger.warning("Unexpected format in %s", social_media_path)
        return
    logger.info("Loaded %d entries from legislators-social-media.yaml", len(social_media_data))
    with Session(engine) as session:
        legislator_map = _build_legislator_map(session)
        existing_accounts = {
            (account.legislator_id, account.platform)
            for account in session.scalars(select(LegislatorSocialMedia)).all()
        }
        logger.info("Found %d existing social media accounts in DB", len(existing_accounts))
        total_inserted = 0
        total_updated = 0
        for entry in social_media_data:
            bioguide_id = entry.get("id", {}).get("bioguide")
            if not bioguide_id:
                continue
            legislator_id = legislator_map.get(bioguide_id)
            if legislator_id is None:
                continue
            social = entry.get("social", {})
            for platform, url_template in SOCIAL_MEDIA_PLATFORMS.items():
                account_name = social.get(platform)
                if not account_name:
                    continue
                url = url_template.format(account=account_name) if url_template else None
                if (legislator_id, platform) in existing_accounts:
                    total_updated += 1
                else:
                    session.add(
                        LegislatorSocialMedia(
                            legislator_id=legislator_id,
                            platform=platform,
                            account_name=str(account_name),
                            url=url,
                            source="https://github.com/unitedstates/congress-legislators",
                        )
                    )
                    existing_accounts.add((legislator_id, platform))
                    total_inserted += 1
        session.commit()
    logger.info("Inserted %d new social media accounts, updated %d existing", total_inserted, total_updated)
 def _iter_voters(position_group: object) -> Iterator[dict]:
@@ -141,37 +287,6 @@ def _iter_voters(position_group: object) -> Iterator[dict]:
                yield voter
 def _extract_legislators(data: dict, seen_bioguide_ids: set[str], batch: list[Legislator]) -> None:
    """Extract unique legislators from a single vote data.json."""
    for position_group in data.get("votes", {}).values():
        for voter in _iter_voters(position_group):
            bioguide_id = voter.get("id")
            if not bioguide_id or bioguide_id in seen_bioguide_ids:
                continue
            seen_bioguide_ids.add(bioguide_id)
            first_name, last_name = _split_name(voter.get("display_name", ""))
            batch.append(
                Legislator(
                    bioguide_id=bioguide_id,
                    first_name=first_name,
                    last_name=last_name,
                    current_party=voter.get("party"),
                    current_state=voter.get("state"),
                )
            )
 def _split_name(display_name: str) -> tuple[str, str]:
    """Split 'Last, First' or 'Name' into (first, last)."""
    if "," in display_name:
        parts = display_name.split(",", 1)
        return parts[1].strip(), parts[0].strip()
    parts = display_name.rsplit(" ", 1)
    if len(parts) > 1:
        return parts[0].strip(), parts[1].strip()
    return display_name, ""
 # ---------------------------------------------------------------------------
 # Bills
 # ---------------------------------------------------------------------------