updated ingest_congress to use congress-legislators for legislator info

This commit is contained in:
2026-03-29 12:02:29 -04:00
parent e368402eea
commit ba8ff35109

View File

@@ -1,11 +1,12 @@
"""Ingestion pipeline for loading congress data from unitedstates/congress JSON files. """Ingestion pipeline for loading congress data from unitedstates/congress JSON files.
Loads legislators, bills, votes, vote records, and bill text into the data_science_dev database. Loads legislators, bills, votes, vote records, and bill text into the data_science_dev database.
Expects the parent directory to contain congress-tracker/ and congress-legislators/ as siblings.
Usage: Usage:
ingest-congress /path/to/congress/data/ ingest-congress /path/to/parent/
ingest-congress /path/to/congress/data/ --congress 118 ingest-congress /path/to/parent/ --congress 118
ingest-congress /path/to/congress/data/ --congress 118 --only bills ingest-congress /path/to/parent/ --congress 118 --only bills
""" """
from __future__ import annotations from __future__ import annotations
@@ -16,12 +17,13 @@ from typing import TYPE_CHECKING, Annotated
import orjson import orjson
import typer import typer
import yaml
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from python.common import configure_logger from python.common import configure_logger
from python.orm.common import get_postgres_engine from python.orm.common import get_postgres_engine
from python.orm.data_science_dev.congress import Bill, BillText, Legislator, Vote, VoteRecord from python.orm.data_science_dev.congress import Bill, BillText, Legislator, LegislatorSocialMedia, Vote, VoteRecord
if TYPE_CHECKING: if TYPE_CHECKING:
from collections.abc import Iterator from collections.abc import Iterator
@@ -37,18 +39,28 @@ app = typer.Typer(help="Ingest unitedstates/congress data into data_science_dev.
@app.command() @app.command()
def main( def main(
data_dir: Annotated[Path, typer.Argument(help="Path to the congress data/ directory")], parent_dir: Annotated[
Path,
typer.Argument(help="Parent directory containing congress-tracker/ and congress-legislators/"),
],
congress: Annotated[int | None, typer.Option(help="Only ingest a specific congress number")] = None, congress: Annotated[int | None, typer.Option(help="Only ingest a specific congress number")] = None,
only: Annotated[ only: Annotated[
str | None, str | None,
typer.Option(help="Only run a specific step: legislators, bills, votes, bill-text"), typer.Option(help="Only run a specific step: legislators, social-media, bills, votes, bill-text"),
] = None, ] = None,
) -> None: ) -> None:
"""Ingest congress data from unitedstates/congress JSON files.""" """Ingest congress data from unitedstates/congress JSON files."""
configure_logger(level="INFO") configure_logger(level="INFO")
data_dir = parent_dir / "congress-tracker/congress/data/"
legislators_dir = parent_dir / "congress-legislators"
if not data_dir.is_dir(): if not data_dir.is_dir():
typer.echo(f"Data directory does not exist: {data_dir}", err=True) typer.echo(f"Expected congress-tracker/ directory: {data_dir}", err=True)
raise typer.Exit(code=1)
if not legislators_dir.is_dir():
typer.echo(f"Expected congress-legislators/ directory: {legislators_dir}", err=True)
raise typer.Exit(code=1) raise typer.Exit(code=1)
engine = get_postgres_engine(name="DATA_SCIENCE_DEV") engine = get_postgres_engine(name="DATA_SCIENCE_DEV")
@@ -60,11 +72,12 @@ def main(
logger.info("Found %d congress directories to process", len(congress_dirs)) logger.info("Found %d congress directories to process", len(congress_dirs))
steps = { steps: dict[str, tuple] = {
"legislators": ingest_legislators, "legislators": (ingest_legislators, (engine, legislators_dir)),
"bills": ingest_bills, "legislators-social-media": (ingest_social_media, (engine, legislators_dir)),
"votes": ingest_votes, "bills": (ingest_bills, (engine, congress_dirs)),
"bill-text": ingest_bill_text, "votes": (ingest_votes, (engine, congress_dirs)),
"bill-text": (ingest_bill_text, (engine, congress_dirs)),
} }
if only: if only:
@@ -73,9 +86,9 @@ def main(
raise typer.Exit(code=1) raise typer.Exit(code=1)
steps = {only: steps[only]} steps = {only: steps[only]}
for step_name, step_func in steps.items(): for step_name, (step_func, step_args) in steps.items():
logger.info("=== Starting step: %s ===", step_name) logger.info("=== Starting step: %s ===", step_name)
step_func(engine, congress_dirs) step_func(*step_args)
logger.info("=== Finished step: %s ===", step_name) logger.info("=== Finished step: %s ===", step_name)
logger.info("ingest-congress done") logger.info("ingest-congress done")
@@ -102,33 +115,166 @@ def _flush_batch(session: Session, batch: list[object], label: str) -> int:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Legislators — extracted from vote JSON files (voter records include bioguide_id, name, party, state) # Legislators — loaded from congress-legislators YAML files
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def ingest_legislators(engine: Engine, legislators_dir: Path) -> None:
    """Load legislators from congress-legislators YAML files.

    Inserts rows for legislators not yet in the DB and updates existing rows
    in place when a parsed field differs. ``None`` values never overwrite an
    existing value, so fields are only ever enriched, not cleared.

    Args:
        engine: Engine bound to the data_science_dev database.
        legislators_dir: Checkout of the congress-legislators repository.
    """
    legislators_data = _load_legislators_yaml(legislators_dir)
    logger.info("Loaded %d legislators from YAML files", len(legislators_data))
    with Session(engine) as session:
        # Map bioguide_id -> ORM row so entries can be updated in place.
        existing_legislators = {
            row.bioguide_id: row for row in session.scalars(select(Legislator)).all()
        }
        logger.info("Found %d existing legislators in DB", len(existing_legislators))
        total_inserted = 0
        total_updated = 0
        for entry in legislators_data:
            bioguide_id = entry.get("id", {}).get("bioguide")
            if not bioguide_id:
                # Rows are keyed by bioguide id; skip entries without one.
                continue
            fields = _parse_legislator(entry)
            existing = existing_legislators.get(bioguide_id)
            if existing is None:
                session.add(Legislator(bioguide_id=bioguide_id, **fields))
                total_inserted += 1
                continue
            changed = False
            for attr, value in fields.items():
                # Only concrete values overwrite; never clear a field to None.
                if value is not None and getattr(existing, attr) != value:
                    setattr(existing, attr, value)
                    changed = True
            if changed:
                total_updated += 1
        session.commit()
    logger.info("Inserted %d new legislators, updated %d existing", total_inserted, total_updated)
def _load_legislators_yaml(legislators_dir: Path) -> list[dict]:
    """Load and combine legislators-current.yaml and legislators-historical.yaml.

    Args:
        legislators_dir: Checkout of the congress-legislators repository.

    Returns:
        Concatenated list of legislator entries from both files. Missing files
        are skipped with a warning; files that do not parse to a list are
        silently ignored.
    """
    legislators: list[dict] = []
    for filename in ("legislators-current.yaml", "legislators-historical.yaml"):
        path = legislators_dir / filename
        if not path.exists():
            logger.warning("Legislators file not found: %s", path)
            continue
        # The upstream YAML is UTF-8 (legislator names contain accented
        # characters); pin the encoding rather than relying on the platform
        # default, which breaks on Windows (cp1252).
        with path.open(encoding="utf-8") as file:
            data = yaml.safe_load(file)
        if isinstance(data, list):
            legislators.extend(data)
    return legislators
def _parse_legislator(entry: dict) -> dict:
"""Extract Legislator fields from a congress-legislators YAML entry."""
ids = entry.get("id", {})
name = entry.get("name", {})
bio = entry.get("bio", {})
terms = entry.get("terms", [])
latest_term = terms[-1] if terms else {}
fec_ids = ids.get("fec")
fec_ids_joined = ",".join(fec_ids) if isinstance(fec_ids, list) else fec_ids
chamber = latest_term.get("type")
chamber_normalized = {"rep": "House", "sen": "Senate"}.get(chamber, chamber)
return {
"thomas_id": ids.get("thomas"),
"lis_id": ids.get("lis"),
"govtrack_id": ids.get("govtrack"),
"opensecrets_id": ids.get("opensecrets"),
"fec_ids": fec_ids_joined,
"first_name": name.get("first"),
"last_name": name.get("last"),
"official_full_name": name.get("official_full"),
"nickname": name.get("nickname"),
"birthday": bio.get("birthday"),
"gender": bio.get("gender"),
"current_party": latest_term.get("party"),
"current_state": latest_term.get("state"),
"current_district": latest_term.get("district"),
"current_chamber": chamber_normalized,
}
# ---------------------------------------------------------------------------
# Social Media — loaded from legislators-social-media.yaml
# ---------------------------------------------------------------------------
# Platforms ingested from legislators-social-media.yaml, mapped to a URL
# template for turning an account handle into a profile link.
# A value of None means no URL template is available for the platform, so
# only the account name is stored and the url column is left None.
SOCIAL_MEDIA_PLATFORMS = {
    "twitter": "https://twitter.com/{account}",
    "facebook": "https://facebook.com/{account}",
    "youtube": "https://youtube.com/{account}",
    "instagram": "https://instagram.com/{account}",
    "mastodon": None,
}
def ingest_social_media(engine: Engine, legislators_dir: Path) -> None:
    """Load social media accounts from legislators-social-media.yaml.

    Inserts one LegislatorSocialMedia row per (legislator, platform) pair and
    refreshes the account name / URL of existing rows when the YAML changed.
    Entries without a bioguide id, or whose legislator is not in the DB, are
    skipped.

    Args:
        engine: Engine bound to the data_science_dev database.
        legislators_dir: Checkout of the congress-legislators repository.
    """
    social_media_path = legislators_dir / "legislators-social-media.yaml"
    if not social_media_path.exists():
        logger.warning("Social media file not found: %s", social_media_path)
        return
    with social_media_path.open() as file:
        social_media_data = yaml.safe_load(file)
    if not isinstance(social_media_data, list):
        logger.warning("Unexpected format in %s", social_media_path)
        return
    logger.info("Loaded %d entries from legislators-social-media.yaml", len(social_media_data))
    with Session(engine) as session:
        legislator_map = _build_legislator_map(session)
        # Map (legislator_id, platform) -> ORM row so existing accounts can be
        # updated in place. Previously existing accounts were only *counted*
        # as updated while nothing was actually written back.
        existing_accounts = {
            (account.legislator_id, account.platform): account
            for account in session.scalars(select(LegislatorSocialMedia)).all()
        }
        logger.info("Found %d existing social media accounts in DB", len(existing_accounts))
        total_inserted = 0
        total_updated = 0
        for entry in social_media_data:
            bioguide_id = entry.get("id", {}).get("bioguide")
            if not bioguide_id:
                continue
            legislator_id = legislator_map.get(bioguide_id)
            if legislator_id is None:
                # Social media entry for a legislator we never ingested.
                continue
            social = entry.get("social", {})
            for platform, url_template in SOCIAL_MEDIA_PLATFORMS.items():
                account_name = social.get(platform)
                if not account_name:
                    continue
                # YAML may yield non-string handles (e.g. numeric ids).
                account_name = str(account_name)
                url = url_template.format(account=account_name) if url_template else None
                existing = existing_accounts.get((legislator_id, platform))
                if existing is not None:
                    # Refresh mutable fields; count only rows that changed.
                    if existing.account_name != account_name or existing.url != url:
                        existing.account_name = account_name
                        existing.url = url
                        total_updated += 1
                    continue
                new_account = LegislatorSocialMedia(
                    legislator_id=legislator_id,
                    platform=platform,
                    account_name=account_name,
                    url=url,
                    source="https://github.com/unitedstates/congress-legislators",
                )
                session.add(new_account)
                # Track the new row so duplicate YAML entries are not re-added.
                existing_accounts[(legislator_id, platform)] = new_account
                total_inserted += 1
        session.commit()
    logger.info("Inserted %d new social media accounts, updated %d existing", total_inserted, total_updated)
def _iter_voters(position_group: object) -> Iterator[dict]: def _iter_voters(position_group: object) -> Iterator[dict]:
@@ -141,37 +287,6 @@ def _iter_voters(position_group: object) -> Iterator[dict]:
yield voter yield voter
def _extract_legislators(data: dict, seen_bioguide_ids: set[str], batch: list[Legislator]) -> None:
    """Extract unique legislators from a single vote data.json.

    Appends a new Legislator to ``batch`` for every voter whose bioguide id
    has not been seen yet, and records the id in ``seen_bioguide_ids``.
    """
    for position_group in data.get("votes", {}).values():
        for voter in _iter_voters(position_group):
            voter_id = voter.get("id")
            if not voter_id or voter_id in seen_bioguide_ids:
                continue
            seen_bioguide_ids.add(voter_id)
            first, last = _split_name(voter.get("display_name", ""))
            legislator = Legislator(
                bioguide_id=voter_id,
                first_name=first,
                last_name=last,
                current_party=voter.get("party"),
                current_state=voter.get("state"),
            )
            batch.append(legislator)
def _split_name(display_name: str) -> tuple[str, str]:
"""Split 'Last, First' or 'Name' into (first, last)."""
if "," in display_name:
parts = display_name.split(",", 1)
return parts[1].strip(), parts[0].strip()
parts = display_name.rsplit(" ", 1)
if len(parts) > 1:
return parts[0].strip(), parts[1].strip()
return display_name, ""
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Bills # Bills
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------