mirror of
https://github.com/RichieCahill/dotfiles.git
synced 2026-04-17 21:18:18 -04:00
updated ingest_congress to use congress-legislators for legislator info
This commit is contained in:
@@ -1,11 +1,12 @@
|
|||||||
"""Ingestion pipeline for loading congress data from unitedstates/congress JSON files.
|
"""Ingestion pipeline for loading congress data from unitedstates/congress JSON files.
|
||||||
|
|
||||||
Loads legislators, bills, votes, vote records, and bill text into the data_science_dev database.
|
Loads legislators, bills, votes, vote records, and bill text into the data_science_dev database.
|
||||||
|
Expects the parent directory to contain congress-tracker/ and congress-legislators/ as siblings.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
ingest-congress /path/to/congress/data/
|
ingest-congress /path/to/parent/
|
||||||
ingest-congress /path/to/congress/data/ --congress 118
|
ingest-congress /path/to/parent/ --congress 118
|
||||||
ingest-congress /path/to/congress/data/ --congress 118 --only bills
|
ingest-congress /path/to/parent/ --congress 118 --only bills
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -16,12 +17,13 @@ from typing import TYPE_CHECKING, Annotated
|
|||||||
|
|
||||||
import orjson
|
import orjson
|
||||||
import typer
|
import typer
|
||||||
|
import yaml
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
from python.common import configure_logger
|
from python.common import configure_logger
|
||||||
from python.orm.common import get_postgres_engine
|
from python.orm.common import get_postgres_engine
|
||||||
from python.orm.data_science_dev.congress import Bill, BillText, Legislator, Vote, VoteRecord
|
from python.orm.data_science_dev.congress import Bill, BillText, Legislator, LegislatorSocialMedia, Vote, VoteRecord
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Iterator
|
from collections.abc import Iterator
|
||||||
@@ -37,18 +39,28 @@ app = typer.Typer(help="Ingest unitedstates/congress data into data_science_dev.
|
|||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def main(
|
def main(
|
||||||
data_dir: Annotated[Path, typer.Argument(help="Path to the congress data/ directory")],
|
parent_dir: Annotated[
|
||||||
|
Path,
|
||||||
|
typer.Argument(help="Parent directory containing congress-tracker/ and congress-legislators/"),
|
||||||
|
],
|
||||||
congress: Annotated[int | None, typer.Option(help="Only ingest a specific congress number")] = None,
|
congress: Annotated[int | None, typer.Option(help="Only ingest a specific congress number")] = None,
|
||||||
only: Annotated[
|
only: Annotated[
|
||||||
str | None,
|
str | None,
|
||||||
typer.Option(help="Only run a specific step: legislators, bills, votes, bill-text"),
|
typer.Option(help="Only run a specific step: legislators, social-media, bills, votes, bill-text"),
|
||||||
] = None,
|
] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Ingest congress data from unitedstates/congress JSON files."""
|
"""Ingest congress data from unitedstates/congress JSON files."""
|
||||||
configure_logger(level="INFO")
|
configure_logger(level="INFO")
|
||||||
|
|
||||||
|
data_dir = parent_dir / "congress-tracker/congress/data/"
|
||||||
|
legislators_dir = parent_dir / "congress-legislators"
|
||||||
|
|
||||||
if not data_dir.is_dir():
|
if not data_dir.is_dir():
|
||||||
typer.echo(f"Data directory does not exist: {data_dir}", err=True)
|
typer.echo(f"Expected congress-tracker/ directory: {data_dir}", err=True)
|
||||||
|
raise typer.Exit(code=1)
|
||||||
|
|
||||||
|
if not legislators_dir.is_dir():
|
||||||
|
typer.echo(f"Expected congress-legislators/ directory: {legislators_dir}", err=True)
|
||||||
raise typer.Exit(code=1)
|
raise typer.Exit(code=1)
|
||||||
|
|
||||||
engine = get_postgres_engine(name="DATA_SCIENCE_DEV")
|
engine = get_postgres_engine(name="DATA_SCIENCE_DEV")
|
||||||
@@ -60,11 +72,12 @@ def main(
|
|||||||
|
|
||||||
logger.info("Found %d congress directories to process", len(congress_dirs))
|
logger.info("Found %d congress directories to process", len(congress_dirs))
|
||||||
|
|
||||||
steps = {
|
steps: dict[str, tuple] = {
|
||||||
"legislators": ingest_legislators,
|
"legislators": (ingest_legislators, (engine, legislators_dir)),
|
||||||
"bills": ingest_bills,
|
"legislators-social-media": (ingest_social_media, (engine, legislators_dir)),
|
||||||
"votes": ingest_votes,
|
"bills": (ingest_bills, (engine, congress_dirs)),
|
||||||
"bill-text": ingest_bill_text,
|
"votes": (ingest_votes, (engine, congress_dirs)),
|
||||||
|
"bill-text": (ingest_bill_text, (engine, congress_dirs)),
|
||||||
}
|
}
|
||||||
|
|
||||||
if only:
|
if only:
|
||||||
@@ -73,9 +86,9 @@ def main(
|
|||||||
raise typer.Exit(code=1)
|
raise typer.Exit(code=1)
|
||||||
steps = {only: steps[only]}
|
steps = {only: steps[only]}
|
||||||
|
|
||||||
for step_name, step_func in steps.items():
|
for step_name, (step_func, step_args) in steps.items():
|
||||||
logger.info("=== Starting step: %s ===", step_name)
|
logger.info("=== Starting step: %s ===", step_name)
|
||||||
step_func(engine, congress_dirs)
|
step_func(*step_args)
|
||||||
logger.info("=== Finished step: %s ===", step_name)
|
logger.info("=== Finished step: %s ===", step_name)
|
||||||
|
|
||||||
logger.info("ingest-congress done")
|
logger.info("ingest-congress done")
|
||||||
@@ -102,33 +115,166 @@ def _flush_batch(session: Session, batch: list[object], label: str) -> int:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Legislators — extracted from vote JSON files (voter records include bioguide_id, name, party, state)
|
# Legislators — loaded from congress-legislators YAML files
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def ingest_legislators(engine: Engine, congress_dirs: list[Path]) -> None:
|
def ingest_legislators(engine: Engine, legislators_dir: Path) -> None:
|
||||||
"""Extract unique legislators from vote data and insert them."""
|
"""Load legislators from congress-legislators YAML files."""
|
||||||
|
legislators_data = _load_legislators_yaml(legislators_dir)
|
||||||
|
logger.info("Loaded %d legislators from YAML files", len(legislators_data))
|
||||||
|
|
||||||
with Session(engine) as session:
|
with Session(engine) as session:
|
||||||
seen_bioguide_ids = set(session.scalars(select(Legislator.bioguide_id)).all())
|
existing_legislators = {
|
||||||
logger.info("Found %d existing legislators in DB", len(seen_bioguide_ids))
|
legislator.bioguide_id: legislator for legislator in session.scalars(select(Legislator)).all()
|
||||||
|
}
|
||||||
|
logger.info("Found %d existing legislators in DB", len(existing_legislators))
|
||||||
|
|
||||||
total_inserted = 0
|
total_inserted = 0
|
||||||
batch: list[Legislator] = []
|
total_updated = 0
|
||||||
for congress_dir in congress_dirs:
|
for entry in legislators_data:
|
||||||
votes_dir = congress_dir / "votes"
|
bioguide_id = entry.get("id", {}).get("bioguide")
|
||||||
if not votes_dir.is_dir():
|
if not bioguide_id:
|
||||||
continue
|
continue
|
||||||
logger.info("Scanning legislators from %s", congress_dir.name)
|
|
||||||
for vote_file in votes_dir.rglob("data.json"):
|
|
||||||
data = _read_json(vote_file)
|
|
||||||
if data is None:
|
|
||||||
continue
|
|
||||||
_extract_legislators(data, seen_bioguide_ids, batch)
|
|
||||||
if len(batch) >= BATCH_SIZE:
|
|
||||||
total_inserted += _flush_batch(session, batch, "legislators")
|
|
||||||
|
|
||||||
total_inserted += _flush_batch(session, batch, "legislators")
|
fields = _parse_legislator(entry)
|
||||||
logger.info("Inserted %d new legislators total", total_inserted)
|
if existing := existing_legislators.get(bioguide_id):
|
||||||
|
changed = False
|
||||||
|
for field, value in fields.items():
|
||||||
|
if value is not None and getattr(existing, field) != value:
|
||||||
|
setattr(existing, field, value)
|
||||||
|
changed = True
|
||||||
|
if changed:
|
||||||
|
total_updated += 1
|
||||||
|
else:
|
||||||
|
session.add(Legislator(bioguide_id=bioguide_id, **fields))
|
||||||
|
total_inserted += 1
|
||||||
|
|
||||||
|
session.commit()
|
||||||
|
logger.info("Inserted %d new legislators, updated %d existing", total_inserted, total_updated)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_legislators_yaml(legislators_dir: Path) -> list[dict]:
|
||||||
|
"""Load and combine legislators-current.yaml and legislators-historical.yaml."""
|
||||||
|
legislators: list[dict] = []
|
||||||
|
for filename in ("legislators-current.yaml", "legislators-historical.yaml"):
|
||||||
|
path = legislators_dir / filename
|
||||||
|
if not path.exists():
|
||||||
|
logger.warning("Legislators file not found: %s", path)
|
||||||
|
continue
|
||||||
|
with path.open() as file:
|
||||||
|
data = yaml.safe_load(file)
|
||||||
|
if isinstance(data, list):
|
||||||
|
legislators.extend(data)
|
||||||
|
return legislators
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_legislator(entry: dict) -> dict:
|
||||||
|
"""Extract Legislator fields from a congress-legislators YAML entry."""
|
||||||
|
ids = entry.get("id", {})
|
||||||
|
name = entry.get("name", {})
|
||||||
|
bio = entry.get("bio", {})
|
||||||
|
terms = entry.get("terms", [])
|
||||||
|
latest_term = terms[-1] if terms else {}
|
||||||
|
|
||||||
|
fec_ids = ids.get("fec")
|
||||||
|
fec_ids_joined = ",".join(fec_ids) if isinstance(fec_ids, list) else fec_ids
|
||||||
|
|
||||||
|
chamber = latest_term.get("type")
|
||||||
|
chamber_normalized = {"rep": "House", "sen": "Senate"}.get(chamber, chamber)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"thomas_id": ids.get("thomas"),
|
||||||
|
"lis_id": ids.get("lis"),
|
||||||
|
"govtrack_id": ids.get("govtrack"),
|
||||||
|
"opensecrets_id": ids.get("opensecrets"),
|
||||||
|
"fec_ids": fec_ids_joined,
|
||||||
|
"first_name": name.get("first"),
|
||||||
|
"last_name": name.get("last"),
|
||||||
|
"official_full_name": name.get("official_full"),
|
||||||
|
"nickname": name.get("nickname"),
|
||||||
|
"birthday": bio.get("birthday"),
|
||||||
|
"gender": bio.get("gender"),
|
||||||
|
"current_party": latest_term.get("party"),
|
||||||
|
"current_state": latest_term.get("state"),
|
||||||
|
"current_district": latest_term.get("district"),
|
||||||
|
"current_chamber": chamber_normalized,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Social Media — loaded from legislators-social-media.yaml
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
SOCIAL_MEDIA_PLATFORMS = {
|
||||||
|
"twitter": "https://twitter.com/{account}",
|
||||||
|
"facebook": "https://facebook.com/{account}",
|
||||||
|
"youtube": "https://youtube.com/{account}",
|
||||||
|
"instagram": "https://instagram.com/{account}",
|
||||||
|
"mastodon": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_social_media(engine: Engine, legislators_dir: Path) -> None:
|
||||||
|
"""Load social media accounts from legislators-social-media.yaml."""
|
||||||
|
social_media_path = legislators_dir / "legislators-social-media.yaml"
|
||||||
|
if not social_media_path.exists():
|
||||||
|
logger.warning("Social media file not found: %s", social_media_path)
|
||||||
|
return
|
||||||
|
|
||||||
|
with social_media_path.open() as file:
|
||||||
|
social_media_data = yaml.safe_load(file)
|
||||||
|
|
||||||
|
if not isinstance(social_media_data, list):
|
||||||
|
logger.warning("Unexpected format in %s", social_media_path)
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("Loaded %d entries from legislators-social-media.yaml", len(social_media_data))
|
||||||
|
|
||||||
|
with Session(engine) as session:
|
||||||
|
legislator_map = _build_legislator_map(session)
|
||||||
|
existing_accounts = {
|
||||||
|
(account.legislator_id, account.platform)
|
||||||
|
for account in session.scalars(select(LegislatorSocialMedia)).all()
|
||||||
|
}
|
||||||
|
logger.info("Found %d existing social media accounts in DB", len(existing_accounts))
|
||||||
|
|
||||||
|
total_inserted = 0
|
||||||
|
total_updated = 0
|
||||||
|
for entry in social_media_data:
|
||||||
|
bioguide_id = entry.get("id", {}).get("bioguide")
|
||||||
|
if not bioguide_id:
|
||||||
|
continue
|
||||||
|
|
||||||
|
legislator_id = legislator_map.get(bioguide_id)
|
||||||
|
if legislator_id is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
social = entry.get("social", {})
|
||||||
|
for platform, url_template in SOCIAL_MEDIA_PLATFORMS.items():
|
||||||
|
account_name = social.get(platform)
|
||||||
|
if not account_name:
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = url_template.format(account=account_name) if url_template else None
|
||||||
|
|
||||||
|
if (legislator_id, platform) in existing_accounts:
|
||||||
|
total_updated += 1
|
||||||
|
else:
|
||||||
|
session.add(
|
||||||
|
LegislatorSocialMedia(
|
||||||
|
legislator_id=legislator_id,
|
||||||
|
platform=platform,
|
||||||
|
account_name=str(account_name),
|
||||||
|
url=url,
|
||||||
|
source="https://github.com/unitedstates/congress-legislators",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
existing_accounts.add((legislator_id, platform))
|
||||||
|
total_inserted += 1
|
||||||
|
|
||||||
|
session.commit()
|
||||||
|
logger.info("Inserted %d new social media accounts, updated %d existing", total_inserted, total_updated)
|
||||||
|
|
||||||
|
|
||||||
def _iter_voters(position_group: object) -> Iterator[dict]:
|
def _iter_voters(position_group: object) -> Iterator[dict]:
|
||||||
@@ -141,37 +287,6 @@ def _iter_voters(position_group: object) -> Iterator[dict]:
|
|||||||
yield voter
|
yield voter
|
||||||
|
|
||||||
|
|
||||||
def _extract_legislators(data: dict, seen_bioguide_ids: set[str], batch: list[Legislator]) -> None:
|
|
||||||
"""Extract unique legislators from a single vote data.json."""
|
|
||||||
for position_group in data.get("votes", {}).values():
|
|
||||||
for voter in _iter_voters(position_group):
|
|
||||||
bioguide_id = voter.get("id")
|
|
||||||
if not bioguide_id or bioguide_id in seen_bioguide_ids:
|
|
||||||
continue
|
|
||||||
seen_bioguide_ids.add(bioguide_id)
|
|
||||||
first_name, last_name = _split_name(voter.get("display_name", ""))
|
|
||||||
batch.append(
|
|
||||||
Legislator(
|
|
||||||
bioguide_id=bioguide_id,
|
|
||||||
first_name=first_name,
|
|
||||||
last_name=last_name,
|
|
||||||
current_party=voter.get("party"),
|
|
||||||
current_state=voter.get("state"),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _split_name(display_name: str) -> tuple[str, str]:
|
|
||||||
"""Split 'Last, First' or 'Name' into (first, last)."""
|
|
||||||
if "," in display_name:
|
|
||||||
parts = display_name.split(",", 1)
|
|
||||||
return parts[1].strip(), parts[0].strip()
|
|
||||||
parts = display_name.rsplit(" ", 1)
|
|
||||||
if len(parts) > 1:
|
|
||||||
return parts[0].strip(), parts[1].strip()
|
|
||||||
return display_name, ""
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Bills
|
# Bills
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
Reference in New Issue
Block a user