Files
weave/pipelines/jobs/congress_vote_context.py
T

1985 lines
72 KiB
Python

"""Offline canonical vote-context parsing, matching, classification, and scoring helpers."""
from __future__ import annotations
import hashlib
import logging
import re
import subprocess
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from datetime import UTC, date, datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any
from zoneinfo import ZoneInfo
from sqlalchemy import delete, select
from sqlalchemy.orm import Session, joinedload, selectinload
from pipelines.parallelize import parallelize_thread
from pipelines.orm.data_science_dev.congress import (
Amendment,
AmendmentAction,
AmendmentActionRecordedVote,
Bill,
BillAction,
BillActionRecordedVote,
BillRelation,
BillText,
ClassificationMethod,
ConfidenceLevel,
IngestRun,
MeasureFunction,
MeasureSubtype,
ScoreRun,
SourceArtifact,
SubjectType,
TextResolutionMethod,
TextTargetBasis,
TextTargetType,
Vote,
VoteActionMatch,
VoteActionScope,
VoteClassification,
VoteContextAudit,
VoteEffect,
VoteMeasureLink,
VoteMeasureRole,
VotePositionMeaning,
VoteRelationship,
VoteTextTarget,
)
if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
logger = logging.getLogger(__name__)
# Version tag stored as ``classifier_version`` on every IngestRun created by start_ingest_run.
CLASSIFICATION_VERSION = "canonical_vote_context_v3"
# Version tag for scoring output.
# NOTE(review): not referenced in the visible part of the file; presumably stamped on ScoreRun rows - confirm.
SCORING_VERSION = "canonical_vote_scores_v3"
# Timezone used by legislative_date_for_comparison to turn a vote datetime into a calendar day.
EASTERN_TIMEZONE = ZoneInfo("America/New_York")
# Congress directories numbered below this are skipped by filter_context_supported_congress_dirs.
OFFLINE_BILLSTATUS_MIN_CONGRESS = 108
# Number of file paths handed to a single parallelize_thread call.
PARALLEL_FILE_CHUNK_SIZE = 500
# Forwarded as ``progress_tracker`` to parallelize_thread.
# NOTE(review): presumably a progress-reporting interval - confirm against parallelize_thread.
PARALLEL_PROGRESS_TRACKER = 250
# All pattern tuples below hold lower-case substrings that are tested with ``in``
# against the output of normalized_text (casefolded, whitespace-collapsed).
# Phrases that mark a vote directly on the text of a bill/resolution.
DIRECT_TEXT_PATTERNS = (
    "on passage",
    "on passage of the bill",
    "on the bill",
    "on the joint resolution",
    "on agreeing to the resolution",
    "on motion to suspend the rules and pass",
    "on motion to suspend the rules and agree",
    "on motion to suspend the rules and concur",
    "on agreeing to the conference report",
    "on motion to concur",
    "passed house",
    "passed senate",
)
# Phrases that mark a vote directly on adopting an amendment.
AMENDMENT_DIRECT_PATTERNS = (
    "on the amendment",
    "on agreeing to the amendment",
    "agreeing to the amendment",
)
# Phrases that mark procedural motions.
# NOTE(review): "rule" is a bare substring, so it also matches text such as
# "suspend the rules"; callers must order their checks with that in mind.
PROCEDURAL_PATTERNS = (
    "cloture",
    "motion to proceed",
    "motion to recommit",
    "motion to reconsider",
    "motion to table",
    "previous question",
    "ordering the previous question",
    "rule",
)
# Phrases that mark votes which are not about legislation at all.
NON_LEGISLATIVE_PATTERNS = (
    "nomination",
    "treaty",
    "speaker",
    "quorum",
    "journal",
    "adjourn",
)
# Phrases used by is_special_rule_measure to spot simple resolutions that govern another measure.
SPECIAL_RULE_PATTERNS = (
    "providing for consideration of",
    "providing for the consideration of",
)
# Case-insensitive matcher for measure citations such as "H.R. 1234" or "S. Con. Res. 5".
# The named groups ``type`` and ``number`` are consumed by parse_measure_references.
# Every alternative requires the trailing period of the abbreviation.
MEASURE_REF_RE = re.compile(
    r"\b(?P<type>H\.?\s*R\.|S\.|H\.?\s*J\.?\s*Res\.|S\.?\s*J\.?\s*Res\."
    r"|H\.?\s*Con\.?\s*Res\.|S\.?\s*Con\.?\s*Res\.|H\.?\s*Res\.|S\.?\s*Res\.)"
    r"\s*(?P<number>\d+)\b",
    flags=re.IGNORECASE,
)
@dataclass(frozen=True)
class ParsedRecordedVote:
    """Immutable recorded-vote reference attached to a parsed bill action."""

    congress: int
    # Chamber label after normalize_chamber ("House"/"Senate" for recognised inputs).
    chamber: str
    session_number: int
    roll_number: int
    # Produced by parse_datetime_like, so timezone-aware (UTC) when present.
    vote_datetime: datetime | None
    vote_url: str | None
@dataclass(frozen=True)
class ParsedAction:
    """Immutable bill action parsed from a bill status file."""

    # 1-based position of the action element in the file; actions that are
    # skipped during parsing still consume a number, so gaps are possible.
    sequence: int
    action_date: date
    # Raw time string as found in the source; not parsed.
    action_time: str | None
    action_text: str
    action_type: str | None
    action_code: str | None
    source_system_code: str | None
    source_system_name: str | None
    # Recorded votes listed under this action (empty tuple when none).
    recorded_votes: tuple[ParsedRecordedVote, ...]
@dataclass(frozen=True)
class ParsedBillRelation:
    """Immutable link from the parsed bill to one related bill."""

    # (congress, lower-cased bill type, bill number) of the related bill.
    related_key: tuple[int, str, int]
    # Relationship label from the source; the parser falls back to "related".
    relationship_type: str
    identified_by: str | None
    latest_action_date: date | None
    latest_action_text: str | None
@dataclass(frozen=True)
class ParsedTextVersion:
    """Immutable metadata for one text version of a bill (no text content)."""

    # Lower-cased version code; the parser substitutes the lower-cased version
    # name when the source provides no code.
    version_code: str
    version_name: str | None
    version_date: date | None
    # Unparsed date string exactly as it appeared in the source.
    source_datetime_raw: str | None
    text_url_xml: str | None
    text_url_pdf: str | None
    text_url_html: str | None
@dataclass(frozen=True)
class ParsedBillStatus:
    """Immutable result of parsing one bill status file."""

    # (congress, lower-cased bill type, bill number) identifying the bill.
    bill_key: tuple[int, str, int]
    actions: tuple[ParsedAction, ...]
    relations: tuple[ParsedBillRelation, ...]
    text_versions: tuple[ParsedTextVersion, ...]
@dataclass(frozen=True)
class ActionCandidate:
    """One official action considered as the match for a vote, plus match metadata."""

    scope: VoteActionScope
    bill_action: BillAction | None
    amendment_action: AmendmentAction | None
    score: int
    match_method: str
    match_reason: str
    match_confidence: ConfidenceLevel

    @property
    def selected_action_text(self) -> str:
        """Return the bill action's text if set, else the amendment action's, else ""."""
        # The bill action takes precedence when both are present.
        for action in (self.bill_action, self.amendment_action):
            if action is not None:
                return action.action_text
        return ""
def _chunked[T](items: Sequence[T], chunk_size: int) -> Iterable[Sequence[T]]:
"""Yield fixed-size slices from a sequence."""
for start in range(0, len(items), chunk_size):
yield items[start : start + chunk_size]
def get_git_sha(repo_root: Path | None = None) -> str | None:
    """Return the HEAD commit SHA seen from ``repo_root``, or None when git cannot supply one.

    Any failure to launch git, or a non-zero exit, yields None instead of raising.
    """
    command = ["git", "rev-parse", "HEAD"]
    try:
        result = subprocess.run(
            command,
            cwd=repo_root,
            check=True,
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    sha = result.stdout.strip()
    return sha if sha else None
def start_ingest_run(
    session: Session,
    *,
    source_snapshot_label: str,
    repo_root: Path | None = None,
) -> IngestRun:
    """Insert a new IngestRun in the "running" state, commit it, and return it."""
    started = datetime.now(UTC)
    sha = get_git_sha(repo_root)
    run = IngestRun(
        started_at=started,
        git_sha=sha,
        classifier_version=CLASSIFICATION_VERSION,
        source_snapshot_label=source_snapshot_label,
        status="running",
    )
    session.add(run)
    session.commit()
    return run
def finish_ingest_run(
    session: Session,
    ingest_run_id: int,
    *,
    status: str,
) -> None:
    """Stamp the completion time and final ``status`` on an ingest run.

    Does nothing when no run with ``ingest_run_id`` exists.
    """
    run = session.get(IngestRun, ingest_run_id)
    if run is not None:
        run.completed_at = datetime.now(UTC)
        run.status = status
        session.commit()
def register_source_artifact(
    session: Session,
    *,
    path: Path,
    source_kind: str,
    congress: int,
    chamber: str | None,
    ingest_run_id: int | None,
    source_url: str | None = None,
) -> SourceArtifact:
    """Return the SourceArtifact row describing ``path``, creating it when needed.

    A row is reused only when local path, content SHA-256 and ingest run all
    match; otherwise a new row is added and flushed so its ``id`` is populated.
    """
    local_path = str(path)
    content = path.read_bytes()
    digest = hashlib.sha256(content).hexdigest()
    file_mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=UTC)
    lookup = select(SourceArtifact).where(
        SourceArtifact.local_path == local_path,
        SourceArtifact.sha256 == digest,
        SourceArtifact.ingest_run_id == ingest_run_id,
    )
    record = session.scalar(lookup)
    if record is None:
        record = SourceArtifact(
            source_kind=source_kind,
            congress=congress,
            chamber=chamber,
            local_path=local_path,
            source_url=source_url,
            sha256=digest,
            byte_size=len(content),
            modified_at=file_mtime,
            ingested_at=datetime.now(UTC),
            ingest_run_id=ingest_run_id,
        )
        session.add(record)
        session.flush()
    return record
def derive_session_number(congress: int, session_year: int) -> int:
    """Return the session number of ``session_year`` within ``congress`` (first year -> 1)."""
    # Congress N began in 1789 + 2 * (N - 1).
    first_year = 1789 + 2 * (congress - 1)
    years_elapsed = session_year - first_year
    return years_elapsed + 1
def normalize_chamber(raw: str | None) -> str | None:
    """Map chamber abbreviations/names to "House" or "Senate".

    None stays None; unrecognised labels are returned exactly as given.
    """
    if raw is None:
        return None
    key = raw.strip().lower()
    if key in ("h", "house"):
        return "House"
    if key in ("s", "senate"):
        return "Senate"
    return raw
def parse_date_like(value: Any) -> date | None:
    """Best-effort conversion of ``value`` to a ``date``.

    Plain ``date`` objects pass through. Anything else is stringified and its
    first ten characters are parsed as an ISO date. Returns None when the value
    is None, blank, or not parseable.
    """
    if value is None:
        return None
    # datetime is a date subclass; it goes through the string path so it is truncated to a date.
    if not isinstance(value, datetime) and isinstance(value, date):
        return value
    leading = str(value).strip()[:10]
    if not leading:
        return None
    try:
        result = date.fromisoformat(leading)
    except ValueError:
        return None
    return result
def parse_datetime_like(
value: Any,
*,
fallback_time: str | None = None,
) -> datetime | None:
"""Parse ISO-ish datetime strings from local vote and billstatus sources."""
if value is None:
return None
if isinstance(value, datetime):
return value.astimezone(UTC) if value.tzinfo else value.replace(tzinfo=UTC)
text = str(value).strip()
if not text:
return None
normalized = text.replace("Z", "+00:00")
try:
parsed = datetime.fromisoformat(normalized)
except ValueError:
if fallback_time:
fallback = f"{text[:10]}T{fallback_time}"
try:
parsed = datetime.fromisoformat(fallback)
except ValueError:
return None
else:
try:
parsed = datetime.fromisoformat(f"{text[:10]}T00:00:00")
except ValueError:
return None
if parsed.tzinfo is None:
return parsed.replace(tzinfo=UTC)
return parsed.astimezone(UTC)
def legislative_date_for_comparison(
    *,
    vote_datetime: datetime | None,
    fallback_date: date | None,
) -> date | None:
    """Return the US-Eastern calendar day of ``vote_datetime``, else ``fallback_date``."""
    if vote_datetime is None:
        return fallback_date
    eastern_moment = vote_datetime.astimezone(EASTERN_TIMEZONE)
    return eastern_moment.date()
def measure_subtype_for_bill_type(bill_type: str | None) -> MeasureSubtype | None:
    """Translate a bill type code (case/whitespace-insensitive) to its MeasureSubtype.

    Returns None for a missing or unrecognised code.
    """
    if bill_type is None:
        return None
    subtype_by_code = {
        "hr": MeasureSubtype.BILL,
        "s": MeasureSubtype.BILL,
        "hjres": MeasureSubtype.JOINT_RESOLUTION,
        "sjres": MeasureSubtype.JOINT_RESOLUTION,
        "hconres": MeasureSubtype.CONCURRENT_RESOLUTION,
        "sconres": MeasureSubtype.CONCURRENT_RESOLUTION,
        "hres": MeasureSubtype.SIMPLE_RESOLUTION,
        "sres": MeasureSubtype.SIMPLE_RESOLUTION,
    }
    return subtype_by_code.get(bill_type.strip().lower())
def measure_type_value(bill_type: str | None) -> str | None:
    """Return ``bill_type`` upper-cased, or None when it is None."""
    return None if bill_type is None else bill_type.upper()
def is_house_origin_measure(bill: Bill) -> bool:
    """Report whether the bill's type code begins with "h" (case-insensitive)."""
    type_code = bill.bill_type.lower()
    return type_code[:1] == "h"
def is_senate_origin_measure(bill: Bill) -> bool:
    """Report whether the bill's type code begins with "s" (case-insensitive)."""
    type_code = bill.bill_type.lower()
    return type_code[:1] == "s"
def normalized_text(*parts: str | None) -> str:
    """Join the non-empty parts, casefold them, and collapse whitespace runs to single spaces."""
    present = [part for part in parts if part]
    folded = " ".join(present).casefold()
    words = folded.split()
    return " ".join(words)
def has_amendment_signal(*parts: str | None, raw_amendment_ref: dict | None = None) -> bool:
    """Report whether a vote looks amendment-related.

    True when a non-empty raw amendment reference is supplied, or when the
    normalized text of ``parts`` contains "amendment".
    """
    return bool(raw_amendment_ref) or "amendment" in normalized_text(*parts)
def is_non_legislative_question(*parts: str | None) -> bool:
    """Report whether the normalized text contains any NON_LEGISLATIVE_PATTERNS phrase."""
    haystack = normalized_text(*parts)
    for phrase in NON_LEGISLATIVE_PATTERNS:
        if phrase in haystack:
            return True
    return False
def is_procedural_question(*parts: str | None) -> bool:
    """Report whether the normalized text contains any PROCEDURAL_PATTERNS phrase."""
    haystack = normalized_text(*parts)
    for phrase in PROCEDURAL_PATTERNS:
        if phrase in haystack:
            return True
    return False
def is_direct_measure_text_question(*parts: str | None) -> bool:
    """Report whether the text reads like a vote directly on a measure's text.

    Matches any DIRECT_TEXT_PATTERNS phrase, plus a few looser phrases that do
    not require the leading "on ..." wording.
    """
    haystack = normalized_text(*parts)
    looser_phrases = (
        "agreeing to the resolution",
        "suspend the rules",
        "conference report",
        "passed house",
        "passed senate",
    )
    for phrase in (*DIRECT_TEXT_PATTERNS, *looser_phrases):
        if phrase in haystack:
            return True
    return False
def is_direct_amendment_text_question(*parts: str | None) -> bool:
    """Report whether the text reads like a vote on adopting an amendment.

    A motion to table the amendment is never treated as a direct vote.
    """
    haystack = normalized_text(*parts)
    is_tabling_motion = "motion to table the amendment" in haystack
    return not is_tabling_motion and any(
        phrase in haystack for phrase in AMENDMENT_DIRECT_PATTERNS
    )
def is_special_rule_measure(
    *,
    bill: Bill | None,
    question: str | None,
    action_text: str | None,
) -> bool:
    """Report whether ``bill`` is a simple resolution providing for consideration of another measure.

    The SPECIAL_RULE_PATTERNS phrases are searched in the question, the action
    text and the bill's three title fields.
    """
    if bill is None:
        return False
    if measure_subtype_for_bill_type(bill.bill_type) is not MeasureSubtype.SIMPLE_RESOLUTION:
        return False
    haystack = normalized_text(
        question,
        action_text,
        bill.title,
        bill.title_short,
        bill.official_title,
    )
    for phrase in SPECIAL_RULE_PATTERNS:
        if phrase in haystack:
            return True
    return False
def measure_function_for_vote(
    *,
    bill: Bill | None,
    question: str | None,
    action_text: str | None,
) -> MeasureFunction | None:
    """Classify what kind of measure a vote is about.

    Returns None when there is no bill or its type is unrecognised. Otherwise
    the checks run in priority order: special rule, budget resolution,
    chamber-internal simple resolution, commemorative / sense-of, and finally
    substantive measure as the default.
    """
    if not bill:
        return None
    subtype = measure_subtype_for_bill_type(bill.bill_type)
    if subtype is None:
        return None
    if is_special_rule_measure(bill=bill, question=question, action_text=action_text):
        return MeasureFunction.SPECIAL_RULE
    haystack = normalized_text(
        question,
        action_text,
        bill.title,
        bill.title_short,
        bill.official_title,
    )
    if "budget resolution" in haystack:
        return MeasureFunction.BUDGET_RESOLUTION
    internal_phrases = ("rules of the house", "electing the speaker", "authorizing the speaker")
    if subtype is MeasureSubtype.SIMPLE_RESOLUTION and any(
        phrase in haystack for phrase in internal_phrases
    ):
        return MeasureFunction.CHAMBER_INTERNAL
    # "commemorat" is a stem so it covers commemorate / commemorating / commemoration.
    symbolic_phrases = ("sense of", "commemorat", "congratulating")
    if any(phrase in haystack for phrase in symbolic_phrases):
        return MeasureFunction.COMMEMORATIVE_OR_SENSE_OF
    return MeasureFunction.SUBSTANTIVE_MEASURE
def parse_measure_references(
    text: str | None,
    *,
    congress: int,
) -> list[tuple[int, str, int]]:
    """Extract referenced measures from question/title/action text.

    Args:
        text: Free text that may cite measures such as "H.R. 1234".
        congress: Congress number attached to every extracted reference; the
            text itself is not inspected for a congress.

    Returns:
        ``(congress, bill_type, number)`` tuples in order of appearance, where
        ``bill_type`` is the lower-case code without punctuation or whitespace
        (e.g. "hr", "sconres"). Empty list when ``text`` is empty or None.
    """
    if not text:
        return []
    refs: list[tuple[int, str, int]] = []
    for match in MEASURE_REF_RE.finditer(text):
        # The pattern allows any whitespace (tabs, newlines, NBSP) between the
        # abbreviation parts, so drop all of it - not just ASCII spaces.
        compact_type = "".join(match.group("type").split())
        bill_type = compact_type.replace(".", "").casefold()
        refs.append((congress, bill_type, int(match.group("number"))))
    return refs
def require_billstatus_artifacts(congress_dirs: Sequence[Path]) -> None:
    """Raise RuntimeError unless every congress directory holds a bill status file.

    A directory qualifies when at least one ``fdsys_billstatus.xml`` exists
    anywhere beneath its ``bills`` subdirectory.
    """
    lacking: list[Path] = []
    for directory in congress_dirs:
        first_hit = next(iter((directory / "bills").rglob("fdsys_billstatus.xml")), None)
        if first_hit is None:
            lacking.append(directory)
    if not lacking:
        return
    congress_list = ", ".join(directory.name for directory in lacking)
    raise RuntimeError(
        "Canonical offline vote-context resolution requires local BILLSTATUS "
        f"artifacts. Missing fdsys_billstatus.xml under congress directories: {congress_list}"
    )
def filter_context_supported_congress_dirs(
    congress_dirs: Sequence[Path],
) -> list[Path]:
    """Keep the directories whose numeric name is at least OFFLINE_BILLSTATUS_MIN_CONGRESS.

    Directories below the threshold are dropped and reported in one info log
    line. Directory names must be integers; a non-numeric name raises ValueError.
    """
    supported: list[Path] = []
    skipped: list[Path] = []
    for directory in congress_dirs:
        too_old = int(directory.name) < OFFLINE_BILLSTATUS_MIN_CONGRESS
        bucket = skipped if too_old else supported
        bucket.append(directory)
    if skipped:
        skipped_names = ", ".join(directory.name for directory in skipped)
        logger.info(
            "Skipping canonical vote-context steps for pre-%sth Congress directories: %s",
            OFFLINE_BILLSTATUS_MIN_CONGRESS,
            skipped_names,
        )
    return supported
def _xml_local_name(tag: str) -> str:
return tag.rsplit("}", 1)[-1]
def _xml_text(element: ET.Element | None, *names: str) -> str | None:
    """Return the first non-blank text of ``element`` or any descendant named in ``names``.

    Nodes are visited in document order, starting with ``element`` itself.
    Matching nodes whose text is empty or whitespace are passed over. Returns
    None when ``element`` is None or nothing qualifies.
    """
    if element is None:
        return None
    for node in element.iter():
        if _xml_local_name(node.tag) not in names:
            continue
        raw = node.text
        stripped = raw.strip() if raw else None
        if stripped:
            return stripped
    return None
def _xml_direct_children(element: ET.Element, *names: str) -> list[ET.Element]:
    """Return the immediate children of ``element`` whose local tag name is in ``names``."""
    matches: list[ET.Element] = []
    for child in element:
        if _xml_local_name(child.tag) in names:
            matches.append(child)
    return matches
def _xml_direct_child(element: ET.Element, *names: str) -> ET.Element | None:
    """Return the first immediate child whose local tag name is in ``names``, or None."""
    return next(
        (child for child in element if _xml_local_name(child.tag) in names),
        None,
    )
def _billstatus_recorded_votes(action_item: ET.Element) -> tuple[ParsedRecordedVote, ...]:
    """Collect the recorded-vote references listed under one action element."""
    votes_parent = _xml_direct_child(action_item, "recordedVotes")
    if votes_parent is None:
        return ()
    recorded_votes: list[ParsedRecordedVote] = []
    for vote_item in _xml_direct_children(votes_parent, "recordedVote", "item"):
        roll_number = _xml_text(vote_item, "rollNumber", "roll-number")
        chamber = normalize_chamber(_xml_text(vote_item, "chamber"))
        congress = _xml_text(vote_item, "congress")
        session_number = _xml_text(vote_item, "sessionNumber", "session-number")
        # A reference is only usable with the full (congress, chamber, session, roll) tuple.
        if not roll_number or chamber is None or not congress or not session_number:
            continue
        recorded_votes.append(
            ParsedRecordedVote(
                congress=int(congress),
                chamber=chamber,
                session_number=int(session_number),
                roll_number=int(roll_number),
                vote_datetime=parse_datetime_like(_xml_text(vote_item, "date")),
                vote_url=_xml_text(vote_item, "url"),
            )
        )
    return tuple(recorded_votes)


def _billstatus_actions(bill_node: ET.Element) -> tuple[ParsedAction, ...]:
    """Parse the bill's action list, skipping actions without a date or text."""
    actions_parent = _xml_direct_child(bill_node, "actions")
    if actions_parent is None:
        return ()
    actions: list[ParsedAction] = []
    # ``sequence`` is the position in the file, so skipped actions leave gaps.
    for index, item in enumerate(_xml_direct_children(actions_parent, "item", "action"), start=1):
        action_date = parse_date_like(_xml_text(item, "actionDate", "actedAt", "action-date", "acted_at"))
        action_text = _xml_text(item, "text") or ""
        if action_date is None or not action_text:
            continue
        source_system = _xml_direct_child(item, "sourceSystem")
        actions.append(
            ParsedAction(
                sequence=index,
                action_date=action_date,
                action_time=_xml_text(item, "actionTime", "action-time"),
                action_text=action_text,
                action_type=_xml_text(item, "type"),
                action_code=_xml_text(item, "actionCode", "action-code"),
                source_system_code=_xml_text(source_system, "code"),
                source_system_name=_xml_text(source_system, "name"),
                recorded_votes=_billstatus_recorded_votes(item),
            )
        )
    return tuple(actions)


def _billstatus_relations(bill_node: ET.Element) -> tuple[ParsedBillRelation, ...]:
    """Parse the related-bill list, skipping entries that do not identify a bill."""
    related_parent = _xml_direct_child(bill_node, "relatedBills", "relatedBillDetails")
    if related_parent is None:
        return ()
    relations: list[ParsedBillRelation] = []
    for item in _xml_direct_children(related_parent, "item", "relatedBill", "relatedBillDetail"):
        relation_congress = _xml_text(item, "congress")
        relation_type = _xml_text(item, "type", "billType")
        relation_number = _xml_text(item, "number", "billNumber")
        if not relation_congress or not relation_type or not relation_number:
            continue
        relationship_details = _xml_direct_child(item, "relationshipDetails")
        relationship_item = (
            _xml_direct_child(relationship_details, "item")
            if relationship_details is not None
            else None
        )
        # BILLSTATUS nests the related bill's latest action as a latestAction
        # container holding actionDate and text children. The container itself
        # has no text, so the flat tag lookups alone never find these values.
        latest_action = _xml_direct_child(item, "latestAction")
        latest_action_date = parse_date_like(
            _xml_text(item, "latestActionDate", "latest-action-date")
            or _xml_text(latest_action, "actionDate")
        )
        latest_action_text = (
            _xml_text(item, "latestActionText", "latest-action-text", "latestAction")
            or _xml_text(latest_action, "text")
        )
        relations.append(
            ParsedBillRelation(
                related_key=(
                    int(relation_congress),
                    relation_type.strip().lower(),
                    int(relation_number),
                ),
                # Prefer the label inside relationshipDetails; "type" is only
                # consulted there because on the item it means the bill type.
                relationship_type=(
                    _xml_text(
                        relationship_item,
                        "relationshipType",
                        "relationship-type",
                        "typeOfRelationship",
                        "type",
                    )
                    or _xml_text(
                        item,
                        "relationshipType",
                        "relationship-type",
                        "typeOfRelationship",
                    )
                    or "related"
                ),
                identified_by=_xml_text(
                    relationship_item,
                    "identifiedBy",
                    "identified-by",
                )
                or _xml_text(item, "identifiedBy", "identified-by"),
                latest_action_date=latest_action_date,
                latest_action_text=latest_action_text,
            )
        )
    return tuple(relations)


def _billstatus_text_versions(bill_node: ET.Element) -> tuple[ParsedTextVersion, ...]:
    """Parse text-version metadata, borrowing version codes from the titles list when absent."""
    # Titles may carry a (version name, version code) pair; index them so a text
    # version that only has a name can still be given a code.
    title_version_name_to_code: dict[str, str] = {}
    titles_parent = _xml_direct_child(bill_node, "titles")
    if titles_parent is not None:
        for item in _xml_direct_children(titles_parent, "item", "title"):
            version_name = _xml_text(item, "billTextVersionName")
            version_code = _xml_text(item, "billTextVersionCode")
            if version_name and version_code:
                title_version_name_to_code.setdefault(
                    normalized_text(version_name),
                    version_code.lower(),
                )
    text_versions_parent = _xml_direct_child(bill_node, "textVersions")
    if text_versions_parent is None:
        return ()
    text_versions: list[ParsedTextVersion] = []
    for item in _xml_direct_children(text_versions_parent, "item", "textVersion"):
        version_name = _xml_text(item, "type", "versionName")
        version_code = _xml_text(item, "billTextVersionCode", "versionCode", "typeCode")
        if version_code is None and version_name is not None:
            version_code = title_version_name_to_code.get(normalized_text(version_name))
        raw_date = _xml_text(item, "date")
        if not version_code and not version_name:
            continue
        xml_url = None
        pdf_url = None
        html_url = None
        formats_parent = _xml_direct_child(item, "formats")
        if formats_parent is not None:
            for format_item in _xml_direct_children(formats_parent, "item", "format"):
                format_type = normalized_text(_xml_text(format_item, "type"), _xml_text(format_item, "name"))
                url = _xml_text(format_item, "url")
                if not url:
                    continue
                # When several entries share a format, the last one wins.
                if "xml" in format_type:
                    xml_url = url
                elif "pdf" in format_type:
                    pdf_url = url
                elif "html" in format_type or "formatted text" in format_type:
                    html_url = url
        text_versions.append(
            ParsedTextVersion(
                # Fall back to the name so every version still has a usable key.
                version_code=(version_code or version_name or "").lower(),
                version_name=version_name,
                version_date=parse_date_like(raw_date),
                source_datetime_raw=raw_date,
                text_url_xml=xml_url,
                text_url_pdf=pdf_url,
                text_url_html=html_url,
            )
        )
    return tuple(text_versions)


def parse_billstatus_file(path: Path) -> ParsedBillStatus | None:
    """Parse the official Bill Status XML needed for actions, relations, and text versions.

    Args:
        path: Location of a bill status XML file.

    Returns:
        The parsed bill status, or None when the XML is malformed (logged) or
        the congress / bill type / bill number cannot be found.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If a numeric field (congress, bill number, roll number, ...)
            is present but not an integer.
    """
    try:
        root = ET.fromstring(path.read_bytes())
    except ET.ParseError:
        logger.exception("Failed to parse bill status XML: %s", path)
        return None
    # Use the "bill" child when present; otherwise treat the root as the bill node.
    bill_node = _xml_direct_child(root, "bill")
    if bill_node is None:
        bill_node = root
    congress_text = _xml_text(bill_node, "congress")
    bill_type_text = _xml_text(bill_node, "billType", "bill-type", "type")
    bill_number_text = _xml_text(bill_node, "billNumber", "bill-number", "number")
    if not congress_text or not bill_type_text or not bill_number_text:
        return None
    bill_key = (int(congress_text), bill_type_text.strip().lower(), int(bill_number_text))
    return ParsedBillStatus(
        bill_key=bill_key,
        actions=_billstatus_actions(bill_node),
        relations=_billstatus_relations(bill_node),
        text_versions=_billstatus_text_versions(bill_node),
    )
def _parse_billstatus_path(*, path: Path) -> ParsedBillStatus | None:
    """Keyword-only adapter around parse_billstatus_file.

    Used as the worker for parallelize_thread, which is given a list of
    ``{"path": ...}`` dicts.
    NOTE(review): presumably each dict is passed as keyword arguments - confirm
    against parallelize_thread.
    """
    return parse_billstatus_file(path)
def _read_json_path(*, path: Path) -> dict[str, Any] | None:
    """Keyword-only adapter around _read_json.

    Used as the worker for parallelize_thread, which is given a list of
    ``{"path": ...}`` dicts when amendment files are loaded.
    NOTE(review): presumably each dict is passed as keyword arguments - confirm
    against parallelize_thread.
    """
    return _read_json(path)
def merge_billstatus_text_versions_for_bill(
    *,
    bill_id: int,
    parsed_text_versions: Sequence[ParsedTextVersion],
    source_artifact_id: int | None,
    existing_bill_texts: dict[tuple[int, str], BillText],
) -> list[BillText]:
    """Reconcile parsed text-version metadata with the known BillText rows of one bill.

    For a version not yet in ``existing_bill_texts`` a metadata-only BillText
    (``text_content=None``) is created, registered in that dict, and included
    in the return value. For a known version only fields that are currently
    None are filled in; populated fields are never overwritten.

    Returns the newly created rows; the caller is responsible for adding them
    to a session. ``existing_bill_texts`` is mutated in place.
    """
    new_rows: list[BillText] = []
    for version in parsed_text_versions:
        code = version.version_code.lower()
        lookup_key = (bill_id, code)
        row = existing_bill_texts.get(lookup_key)
        if row is not None:
            # (BillText attribute, incoming value) pairs eligible for backfill.
            backfill = (
                ("version_name", version.version_name),
                ("date", version.version_date),
                ("source_datetime_raw", version.source_datetime_raw),
                ("text_url_xml", version.text_url_xml),
                ("text_url_pdf", version.text_url_pdf),
                ("text_url_html", version.text_url_html),
                ("source_artifact_id", source_artifact_id),
            )
            for attribute, incoming in backfill:
                if getattr(row, attribute) is None and incoming is not None:
                    setattr(row, attribute, incoming)
            continue
        row = BillText(
            bill_id=bill_id,
            version_code=code,
            version_name=version.version_name,
            text_content=None,
            date=version.version_date,
            source_datetime_raw=version.source_datetime_raw,
            text_url_xml=version.text_url_xml,
            text_url_pdf=version.text_url_pdf,
            text_url_html=version.text_url_html,
            source_artifact_id=source_artifact_id,
        )
        existing_bill_texts[lookup_key] = row
        new_rows.append(row)
    return new_rows
def build_billstatus_text_version_index(
    congress_dirs: Sequence[Path],
) -> dict[tuple[int, str, int], dict[str, ParsedTextVersion]]:
    """Build ``{bill_key: {version_code: ParsedTextVersion}}`` from local bill status files.

    Files are parsed in chunks through parallelize_thread. Unparseable files
    are skipped, and the first version seen for a given code is kept.
    """
    index: dict[tuple[int, str, int], dict[str, ParsedTextVersion]] = {}
    for congress_dir in congress_dirs:
        status_files = sorted((congress_dir / "bills").rglob("fdsys_billstatus.xml"))
        for batch in _chunked(status_files, PARALLEL_FILE_CHUNK_SIZE):
            outcome = parallelize_thread(
                _parse_billstatus_path,
                [{"path": status_file} for status_file in batch],
                progress_tracker=PARALLEL_PROGRESS_TRACKER,
            )
            for parsed in outcome.results:
                if parsed is None:
                    continue
                versions_by_code = index.setdefault(parsed.bill_key, {})
                for version in parsed.text_versions:
                    code = version.version_code.lower()
                    if code not in versions_by_code:
                        versions_by_code[code] = version
    return index
def raw_bill_key_from_ref(
    raw_bill_ref: dict[str, Any] | None,
    *,
    default_congress: int,
) -> tuple[int, str, int] | None:
    """Convert a raw bill reference dict into ``(congress, lower-case type, number)``.

    ``default_congress`` is used when the reference has no "congress" key.
    Returns None when the reference is empty, lacks a type or number, or has a
    congress/number that is not an integer.
    """
    if not raw_bill_ref:
        return None
    bill_type = raw_bill_ref.get("type")
    bill_number = raw_bill_ref.get("number")
    if not (bill_type is not None and bill_number is not None):
        return None
    congress_value = raw_bill_ref.get("congress", default_congress)
    try:
        key = (int(congress_value), str(bill_type).lower(), int(bill_number))
    except (TypeError, ValueError):
        return None
    return key
def parse_vote_source_url(raw_vote: dict[str, Any]) -> str | None:
    """Return the first non-empty string found under "url", "source_url" or "sourceUrl"."""
    candidates = (raw_vote.get(key) for key in ("url", "source_url", "sourceUrl"))
    return next(
        (candidate for candidate in candidates if isinstance(candidate, str) and candidate),
        None,
    )
def coerce_raw_ref(raw_value: Any) -> dict[str, Any] | None:
    """Return None and dicts unchanged; wrap any other value as ``{"value": raw_value}``."""
    if raw_value is None or isinstance(raw_value, dict):
        return raw_value
    return {"value": raw_value}
def parsed_vote_datetime(raw_vote: dict[str, Any]) -> datetime | None:
    """Parse the vote's "date" field, passing "time" along as the fallback time when usable.

    The "time" value is forwarded only when it is present and "date" is a string.
    """
    raw_date = raw_vote.get("date")
    raw_time = raw_vote.get("time")
    has_separate_time = isinstance(raw_date, str) and raw_time is not None
    if not has_separate_time:
        return parse_datetime_like(raw_date)
    return parse_datetime_like(raw_date, fallback_time=str(raw_time))
def _persist_billstatus_file(
    session: Session,
    *,
    path: Path,
    parsed: ParsedBillStatus,
    bill_id: int,
    bill_map: dict[tuple[int, str, int], int],
    ingest_run_id: int | None,
    existing_bill_texts: dict[tuple[int, str], BillText],
) -> None:
    """Stage text versions, relations, actions and recorded votes for one parsed file."""
    artifact = register_source_artifact(
        session,
        path=path,
        source_kind="billstatus_xml",
        congress=parsed.bill_key[0],
        chamber=None,
        ingest_run_id=ingest_run_id,
    )
    session.add_all(
        merge_billstatus_text_versions_for_bill(
            bill_id=bill_id,
            parsed_text_versions=parsed.text_versions,
            source_artifact_id=artifact.id,
            existing_bill_texts=existing_bill_texts,
        )
    )
    for relation in parsed.relations:
        related_bill_id = bill_map.get(relation.related_key)
        # Relations to bills that are not in bill_map are dropped.
        if related_bill_id is None:
            continue
        session.add(
            BillRelation(
                bill_id=bill_id,
                related_bill_id=related_bill_id,
                relationship_type=relation.relationship_type,
                identified_by=relation.identified_by,
                latest_action_date=relation.latest_action_date,
                latest_action_text=relation.latest_action_text,
            )
        )
    for action in parsed.actions:
        bill_action = BillAction(
            bill_id=bill_id,
            sequence=action.sequence,
            action_date=action.action_date,
            action_time=action.action_time,
            action_text=action.action_text,
            action_type=action.action_type,
            action_code=action.action_code,
            source_system_code=action.source_system_code,
            source_system_name=action.source_system_name,
            source_artifact_id=artifact.id,
        )
        session.add(bill_action)
        # Flush so bill_action.id is available for the recorded-vote rows below.
        session.flush()
        for recorded_vote in action.recorded_votes:
            session.add(
                BillActionRecordedVote(
                    bill_action_id=bill_action.id,
                    congress=recorded_vote.congress,
                    chamber=recorded_vote.chamber,
                    session_number=recorded_vote.session_number,
                    roll_number=recorded_vote.roll_number,
                    vote_datetime=recorded_vote.vote_datetime,
                    vote_url=recorded_vote.vote_url,
                )
            )


def _ingest_amendment_files(
    session: Session,
    *,
    congress_dir: Path,
    bill_map: dict[tuple[int, str, int], int],
    ingest_run_id: int | None,
) -> None:
    """Load every ``data.json`` under the congress's ``amendments`` directory, if it exists."""
    amendments_dir = congress_dir / "amendments"
    if not amendments_dir.is_dir():
        return
    amendment_paths = sorted(amendments_dir.rglob("data.json"))
    logger.info(
        "Scanning %d amendment files from %s",
        len(amendment_paths),
        congress_dir.name,
    )
    for chunk in _chunked(amendment_paths, PARALLEL_FILE_CHUNK_SIZE):
        results = parallelize_thread(
            _read_json_path,
            [{"path": path} for path in chunk],
            progress_tracker=PARALLEL_PROGRESS_TRACKER,
        )
        # strict=True: a result count that differs from the chunk size is an error.
        for amendment_path, raw in zip(chunk, results.results, strict=True):
            if raw is None:
                continue
            amendment = _parse_amendment_json(
                session,
                raw=raw,
                bill_map=bill_map,
                ingest_run_id=ingest_run_id,
                path=amendment_path,
            )
            if amendment is not None:
                session.add(amendment)


def ingest_bill_status_context(
    session: Session,
    *,
    congress_dirs: Sequence[Path],
    bill_map: dict[tuple[int, str, int], int],
    ingest_run_id: int | None,
) -> None:
    """Rebuild bill actions, relations, amendments, and their recorded votes.

    Existing BillRelation, BillAction and Amendment rows for the given
    congresses are deleted (and that deletion committed) before anything is
    re-ingested, so a failure part-way through leaves those tables partially
    populated. BillText rows are kept and enriched instead of being rebuilt.

    Args:
        session: Open SQLAlchemy session used for all reads and writes.
        congress_dirs: Congress directories; each directory name must be the
            congress number.
        bill_map: Maps ``(congress, bill_type, number)`` to ``Bill.id``. Files
            for bills missing from this map are ignored.
        ingest_run_id: Ingest run recorded on registered source artifacts.

    Raises:
        RuntimeError: If any congress directory has no bill status file.
    """
    require_billstatus_artifacts(congress_dirs)
    congress_numbers = [int(path.name) for path in congress_dirs]
    bill_ids_subquery = select(Bill.id).where(Bill.congress.in_(congress_numbers))
    existing_bill_texts = {
        (bill_text.bill_id, bill_text.version_code.lower()): bill_text
        for bill_text in session.scalars(
            select(BillText)
            .join(Bill, Bill.id == BillText.bill_id)
            .where(Bill.congress.in_(congress_numbers))
        ).all()
    }
    # NOTE(review): dependent rows (recorded votes, amendment actions) are not
    # deleted explicitly; presumably database/ORM cascades remove them - confirm.
    session.execute(
        delete(BillRelation).where(BillRelation.bill_id.in_(bill_ids_subquery))
    )
    session.execute(delete(BillAction).where(BillAction.bill_id.in_(bill_ids_subquery)))
    session.execute(delete(Amendment).where(Amendment.congress.in_(congress_numbers)))
    session.commit()
    for congress_dir in congress_dirs:
        bills_dir = congress_dir / "bills"
        if not bills_dir.is_dir():
            logger.warning(
                "Missing bills directory for congress %s: %s",
                congress_dir.name,
                bills_dir,
            )
            continue
        billstatus_paths = sorted(bills_dir.rglob("fdsys_billstatus.xml"))
        logger.info(
            "Scanning %d bill status files from %s",
            len(billstatus_paths),
            congress_dir.name,
        )
        for chunk in _chunked(billstatus_paths, PARALLEL_FILE_CHUNK_SIZE):
            results = parallelize_thread(
                _parse_billstatus_path,
                [{"path": path} for path in chunk],
                progress_tracker=PARALLEL_PROGRESS_TRACKER,
            )
            # strict=True: a result count that differs from the chunk size is an error.
            for path, parsed in zip(chunk, results.results, strict=True):
                if parsed is None:
                    continue
                bill_id = bill_map.get(parsed.bill_key)
                if bill_id is None:
                    continue
                _persist_billstatus_file(
                    session,
                    path=path,
                    parsed=parsed,
                    bill_id=bill_id,
                    bill_map=bill_map,
                    ingest_run_id=ingest_run_id,
                    existing_bill_texts=existing_bill_texts,
                )
        _ingest_amendment_files(
            session,
            congress_dir=congress_dir,
            bill_map=bill_map,
            ingest_run_id=ingest_run_id,
        )
    session.commit()
def _parse_amendment_json(
    session: Session,
    *,
    raw: dict[str, Any],
    bill_map: dict[tuple[int, str, int], int],
    ingest_run_id: int | None,
    path: Path,
) -> Amendment | None:
    """Create an Amendment, its actions and their recorded votes from one amendment JSON payload.

    Returns None (writing nothing) when the payload lacks a congress, type or
    number. Otherwise the amendment and each action are added to ``session``
    and flushed so their ids can be referenced by child rows.
    """
    congress = raw.get("congress")
    amendment_type = raw.get("amendment_type") or raw.get("type")
    number = raw.get("number")
    if congress is None or amendment_type is None or number is None:
        return None
    congress_number = int(congress)
    amendment_chamber = normalize_chamber(raw.get("chamber"))
    artifact = register_source_artifact(
        session,
        path=path,
        source_kind="amendment_json",
        congress=congress_number,
        chamber=amendment_chamber,
        ingest_run_id=ingest_run_id,
    )
    # The amended bill may appear under any of three keys; first truthy one wins.
    bill_ref = raw.get("amends_bill") or raw.get("bill") or raw.get("amended_bill")
    amended_bill_id = None
    if isinstance(bill_ref, dict):
        bill_key = raw_bill_key_from_ref(bill_ref, default_congress=congress_number)
        if bill_key is not None:
            amended_bill_id = bill_map.get(bill_key)
    amendment = Amendment(
        congress=congress_number,
        amendment_type=str(amendment_type).lower(),
        number=int(number),
        chamber=amendment_chamber,
        description=raw.get("description"),
        purpose=raw.get("purpose"),
        amended_bill_id=amended_bill_id,
        source_path=str(path),
        source_artifact_id=artifact.id,
    )
    session.add(amendment)
    session.flush()
    raw_actions = raw.get("actions")
    if not isinstance(raw_actions, list):
        return amendment
    # ``sequence`` is the position in the source list, so skipped entries leave gaps.
    for position, entry in enumerate(raw_actions, start=1):
        if not isinstance(entry, dict):
            continue
        acted_on = parse_date_like(entry.get("acted_at") or entry.get("action_date"))
        text = entry.get("text")
        if acted_on is None or not isinstance(text, str) or not text:
            continue
        amendment_action = AmendmentAction(
            amendment_id=amendment.id,
            sequence=position,
            action_date=acted_on,
            action_time=_extract_time_component(entry.get("acted_at")),
            action_text=text,
            action_type=entry.get("type"),
            action_code=entry.get("state") or entry.get("vote_type"),
            source_system_code=None,
            source_system_name="unitedstates/congress amendment JSON",
            source_artifact_id=artifact.id,
        )
        session.add(amendment_action)
        session.flush()
        roll = entry.get("roll")
        vote_chamber = normalize_chamber(entry.get("where"))
        session_number = entry.get("session")
        # A recorded vote is stored only when roll, chamber and session are all present.
        if not (roll and vote_chamber and session_number):
            continue
        session.add(
            AmendmentActionRecordedVote(
                amendment_action_id=amendment_action.id,
                congress=congress_number,
                chamber=vote_chamber,
                session_number=int(session_number),
                roll_number=int(roll),
                vote_datetime=parse_datetime_like(entry.get("acted_at")),
                vote_url=entry.get("url"),
            )
        )
    return amendment
def _extract_time_component(raw_value: Any) -> str | None:
if raw_value is None:
return None
text = str(raw_value)
if "T" not in text:
return None
return text.split("T", 1)[1].replace("Z", "")
def _read_json(path: Path) -> dict[str, Any] | None:
    """Load a JSON object from ``path``.

    Returns:
        The decoded object, or None when the file cannot be read, is not valid
        JSON, or its top-level value is not an object. Every failure is logged
        rather than raised so one bad file does not stop a batch.
    """
    import orjson

    try:
        payload = orjson.loads(path.read_bytes())
    except (OSError, ValueError):
        # orjson.JSONDecodeError derives from ValueError; OSError covers read failures.
        logger.exception("Failed to parse %s", path)
        return None
    # Callers use dict methods on the result, so anything else is unusable.
    if not isinstance(payload, dict):
        logger.warning(
            "Ignoring %s: expected a JSON object at the top level, got %s",
            path,
            type(payload).__name__,
        )
        return None
    return payload
def build_vote_action_matches(
    session: Session,
    *,
    congress_numbers: Sequence[int],
) -> None:
    """Match raw votes to official bill/amendment actions and persist all candidates.

    Existing ``VoteActionMatch`` and ``VoteContextAudit`` rows for votes in the
    given congresses are deleted, and that deletion is committed, before
    matching starts. Every candidate returned by ``rank_action_candidates`` is
    stored as a ``VoteActionMatch``; the first (highest-ranked) one is flagged
    ``is_selected``. An audit row is written when a vote has no candidates
    (severity ``warning``) or more than one (severity ``info``).

    Args:
        session: Open SQLAlchemy session; this function commits on it twice.
        congress_numbers: Congresses whose votes should be (re)matched.
    """
    in_scope = Vote.congress.in_(list(congress_numbers))
    # Cheap existence probe: skip the deletes and index builds when no vote is in scope.
    if session.scalar(select(Vote.id).where(in_scope).limit(1)) is None:
        return
    vote_ids_subquery = select(Vote.id).where(in_scope)
    session.execute(
        delete(VoteActionMatch).where(VoteActionMatch.vote_id.in_(vote_ids_subquery))
    )
    # NOTE: there is no step filter here, so audit rows from every step are
    # cleared for these votes, not only the ones this function writes.
    session.execute(
        delete(VoteContextAudit).where(VoteContextAudit.vote_id.in_(vote_ids_subquery))
    )
    session.commit()
    votes = session.scalars(select(Vote).where(in_scope).order_by(Vote.id)).all()
    bill_vote_index = _build_bill_action_vote_index(session, congress_numbers)
    amendment_vote_index = _build_amendment_action_vote_index(session, congress_numbers)
    for vote in votes:
        candidates = rank_action_candidates(
            vote=vote,
            bill_vote_index=bill_vote_index,
            amendment_vote_index=amendment_vote_index,
        )
        if not candidates:
            session.add(
                VoteContextAudit(
                    vote_id=vote.id,
                    step="vote_action_match",
                    message="no official action matched vote tuple; classification will fall back to vote XML",
                    severity="warning",
                )
            )
            continue
        if len(candidates) > 1:
            session.add(
                VoteContextAudit(
                    vote_id=vote.id,
                    step="vote_action_match",
                    message="multiple official actions matched vote tuple; selected highest-ranked candidate",
                    severity="info",
                )
            )
        # Candidates arrive best-first, so position 0 is the selected match.
        for index, candidate in enumerate(candidates):
            session.add(
                VoteActionMatch(
                    vote_id=vote.id,
                    action_scope=candidate.scope,
                    bill_action_id=candidate.bill_action.id if candidate.bill_action else None,
                    amendment_action_id=(
                        candidate.amendment_action.id if candidate.amendment_action else None
                    ),
                    is_selected=index == 0,
                    match_method=candidate.match_method,
                    match_reason=candidate.match_reason,
                    match_confidence=candidate.match_confidence,
                )
            )
    session.commit()
def _build_bill_action_vote_index(
    session: Session,
    congress_numbers: Sequence[int],
) -> dict[tuple[int, str, int, int], list[BillActionRecordedVote]]:
    """Group bill-action recorded votes by their roll-call tuple.

    Rows are restricted to bills in ``congress_numbers`` and are loaded with
    their bill action and that action's bill. The key of the returned mapping
    is ``(congress, chamber, session_number, roll_number)``.
    """
    stmt = (
        select(BillActionRecordedVote)
        .join(BillAction, BillAction.id == BillActionRecordedVote.bill_action_id)
        .join(Bill, Bill.id == BillAction.bill_id)
        .where(Bill.congress.in_(list(congress_numbers)))
        .options(joinedload(BillActionRecordedVote.bill_action).joinedload(BillAction.bill))
    )
    grouped: dict[tuple[int, str, int, int], list[BillActionRecordedVote]] = {}
    for recorded_vote in session.scalars(stmt).all():
        roll_key = (
            recorded_vote.congress,
            recorded_vote.chamber,
            recorded_vote.session_number,
            recorded_vote.roll_number,
        )
        bucket = grouped.get(roll_key)
        if bucket is None:
            grouped[roll_key] = [recorded_vote]
        else:
            bucket.append(recorded_vote)
    return grouped
def _build_amendment_action_vote_index(
    session: Session,
    congress_numbers: Sequence[int],
) -> dict[tuple[int, str, int, int], list[AmendmentActionRecordedVote]]:
    """Group amendment-action recorded votes by their roll-call tuple.

    Rows are restricted to amendments in ``congress_numbers`` and are loaded
    with their amendment action and that action's amendment. The key of the
    returned mapping is ``(congress, chamber, session_number, roll_number)``.
    """
    eager_amendment = joinedload(AmendmentActionRecordedVote.amendment_action).joinedload(
        AmendmentAction.amendment
    )
    stmt = (
        select(AmendmentActionRecordedVote)
        .join(AmendmentAction, AmendmentAction.id == AmendmentActionRecordedVote.amendment_action_id)
        .join(Amendment, Amendment.id == AmendmentAction.amendment_id)
        .where(Amendment.congress.in_(list(congress_numbers)))
        .options(eager_amendment)
    )
    grouped: dict[tuple[int, str, int, int], list[AmendmentActionRecordedVote]] = {}
    for recorded_vote in session.scalars(stmt).all():
        roll_key = (
            recorded_vote.congress,
            recorded_vote.chamber,
            recorded_vote.session_number,
            recorded_vote.roll_number,
        )
        bucket = grouped.get(roll_key)
        if bucket is None:
            grouped[roll_key] = [recorded_vote]
        else:
            bucket.append(recorded_vote)
    return grouped
def rank_action_candidates(
    *,
    vote: Vote,
    bill_vote_index: dict[tuple[int, str, int, int], list[BillActionRecordedVote]],
    amendment_vote_index: dict[
        tuple[int, str, int, int], list[AmendmentActionRecordedVote]
    ],
) -> list[ActionCandidate]:
    """Return every official action recorded for ``vote``'s roll call, best first.

    Candidates are looked up in both indexes by
    ``(congress, chamber, session_number, roll_number)``. Amendment actions
    start at 100 points and bill actions at 50; each gains bonuses from the
    vote's amendment signal, text alignment, and the presence of an action
    time. Bill actions also gain points when their source system name does not
    contain "library of congress". The result is ordered by descending score,
    then by ascending action ``sequence``.
    """
    roll_key = (vote.congress, vote.chamber, vote.session_number, vote.roll_number)
    recorded_bill_votes = bill_vote_index.get(roll_key, [])
    recorded_amendment_votes = amendment_vote_index.get(roll_key, [])
    prefer_amendment = has_amendment_signal(
        vote.question,
        vote.result_text,
        raw_amendment_ref=vote.raw_amendment_ref,
    )
    question_text = normalized_text(vote.question, vote.result_text, vote.vote_type)

    ranked: list[ActionCandidate] = []

    for recorded in recorded_amendment_votes:
        amendment_action = recorded.amendment_action
        combined_text = normalized_text(amendment_action.action_text, vote.question)
        amendment_score = (
            100
            + (50 if prefer_amendment else 0)
            + (15 if is_direct_amendment_text_question(combined_text) else 0)
            + (1 if amendment_action.action_time else 0)
        )
        ranked.append(
            ActionCandidate(
                scope=VoteActionScope.AMENDMENT,
                bill_action=None,
                amendment_action=amendment_action,
                score=amendment_score,
                match_method="canonical_vote_tuple",
                match_reason="matched amendment action recorded vote tuple",
                match_confidence=ConfidenceLevel.HIGH,
            )
        )

    for recorded in recorded_bill_votes:
        bill_action = recorded.bill_action
        source_name = bill_action.source_system_name or ""
        bill_score = (
            50
            + (0 if prefer_amendment else 20)
            + (10 if "library of congress" not in source_name.casefold() else 0)
            + (10 if _semantic_alignment_score(question_text, bill_action.action_text) > 0 else 0)
            + (1 if bill_action.action_time else 0)
        )
        ranked.append(
            ActionCandidate(
                scope=VoteActionScope.BILL,
                bill_action=bill_action,
                amendment_action=None,
                score=bill_score,
                match_method="canonical_vote_tuple",
                match_reason="matched bill action recorded vote tuple",
                match_confidence=ConfidenceLevel.HIGH,
            )
        )

    def ranking_key(candidate: ActionCandidate) -> tuple[int, int]:
        # Exactly one of the two action slots is populated on each candidate.
        action = candidate.bill_action or candidate.amendment_action
        return (-candidate.score, action.sequence)

    return sorted(ranked, key=ranking_key)
def _semantic_alignment_score(question_text: str, action_text: str) -> int:
    """Score how well a vote question lines up with an official action's text.

    Returns 5 when the non-empty question text occurs inside the normalized
    action text, 3 when both texts are direct measure-text questions or both
    are procedural questions, and 0 otherwise. ``question_text`` is used as
    given; only ``action_text`` is normalized here.
    """
    action = normalized_text(action_text)
    if question_text and action and question_text in action:
        return 5
    same_kind = (
        is_direct_measure_text_question(question_text) and is_direct_measure_text_question(action)
    ) or (is_procedural_question(question_text) and is_procedural_question(action))
    return 3 if same_kind else 0
def classify_votes(
    session: Session,
    *,
    congress_numbers: Sequence[int],
    bill_map: dict[tuple[int, str, int], int],
) -> None:
    """Populate vote classifications and measure links from selected matches.

    For every vote in ``congress_numbers`` the previous ``VoteMeasureLink`` and
    ``VoteClassification`` rows are deleted, together with the
    ``VoteContextAudit`` rows tagged ``"vote_context_classify"``. That deletion
    is committed before each vote is classified with ``classify_single_vote``
    and the new rows are committed.

    Args:
        session: Open SQLAlchemy session; this function commits on it twice.
        congress_numbers: Congresses whose votes should be (re)classified.
        bill_map: Lookup passed through to ``classify_single_vote``, which
            resolves bill keys to ``Bill`` primary keys with it.
    """
    in_scope = Vote.congress.in_(list(congress_numbers))
    if session.scalar(select(Vote.id).where(in_scope).limit(1)) is None:
        return
    vote_ids_subquery = select(Vote.id).where(in_scope)
    session.execute(
        delete(VoteMeasureLink).where(VoteMeasureLink.vote_id.in_(vote_ids_subquery))
    )
    session.execute(
        delete(VoteClassification).where(VoteClassification.vote_id.in_(vote_ids_subquery))
    )
    # classify_single_vote emits audit rows with step "vote_context_classify".
    # Clear the previous run's rows so re-running this step on its own does not
    # pile up duplicates; audit rows from other steps are left untouched.
    session.execute(
        delete(VoteContextAudit)
        .where(VoteContextAudit.vote_id.in_(vote_ids_subquery))
        .where(VoteContextAudit.step == "vote_context_classify")
    )
    session.commit()
    vote_stmt = (
        select(Vote)
        .where(in_scope)
        .options(
            selectinload(Vote.action_matches).joinedload(VoteActionMatch.bill_action).joinedload(BillAction.bill),
            selectinload(Vote.action_matches)
            .joinedload(VoteActionMatch.amendment_action)
            .joinedload(AmendmentAction.amendment),
        )
        .order_by(Vote.id)
    )
    votes = session.scalars(vote_stmt).all()
    for vote in votes:
        selected_match = next((match for match in vote.action_matches if match.is_selected), None)
        classification, measure_links, audit_rows = classify_single_vote(
            vote=vote,
            selected_match=selected_match,
            bill_map=bill_map,
            session=session,
        )
        session.add(classification)
        session.add_all(measure_links)
        session.add_all(audit_rows)
    session.commit()
def classify_single_vote(
    *,
    vote: Vote,
    selected_match: VoteActionMatch | None,
    bill_map: dict[tuple[int, str, int], int],
    session: Session,
) -> tuple[VoteClassification, list[VoteMeasureLink], list[VoteContextAudit]]:
    """Classify a single vote and produce measure links.

    The vote is assigned a subject type by the first matching rule, in this
    order: nomination, treaty, non-legislative question, amendment, measure.
    A vote matching none of them keeps ``SubjectType.UNKNOWN`` and gets a
    warning audit row.

    Args:
        vote: The vote to classify.
        selected_match: The vote's selected official-action match, or ``None``
            when classification must rely on the vote's own metadata.
        bill_map: Maps a bill key to a ``Bill`` primary key (the value is
            passed to ``session.get(Bill, ...)``). The key is presumably
            ``(congress, bill_type, number)`` -- confirm against the builder.
        session: Session used only to load a ``Bill`` resolved through
            ``vote.raw_bill_ref``.

    Returns:
        ``(classification, measure_links, audit_rows)``. None of the returned
        objects are added to the session here.
    """
    audit_rows: list[VoteContextAudit] = []
    question_text = vote.question or ""
    result_text = vote.result_text or ""
    selected_action_text = ""
    bill: Bill | None = None
    amendment: Amendment | None = None
    # Defaults describe a vote classified from its own XML metadata only.
    method = ClassificationMethod.VOTE_XML_ONLY
    confidence = ConfidenceLevel.MEDIUM
    if selected_match is not None:
        if selected_match.bill_action is not None:
            bill = selected_match.bill_action.bill
        if selected_match.amendment_action is not None:
            amendment = selected_match.amendment_action.amendment
        # The bill action's text wins when both action kinds are present.
        selected_action_text = (
            selected_match.bill_action.action_text
            if selected_match.bill_action is not None
            else selected_match.amendment_action.action_text
            if selected_match.amendment_action is not None
            else ""
        )
        # More than one stored match means several actions shared the roll-call tuple.
        method = (
            ClassificationMethod.RECORDED_VOTE_ACTION_EXACT
            if len(vote.action_matches) <= 1
            else ClassificationMethod.RECORDED_VOTE_ACTION_DUPLICATE_SOURCE_DEDUPED
        )
        confidence = ConfidenceLevel.HIGH
    # Fallback: resolve the bill from the vote's raw bill reference when no
    # matched bill action supplied one.
    if bill is None and vote.raw_bill_ref:
        raw_key = raw_bill_key_from_ref(vote.raw_bill_ref, default_congress=vote.congress)
        if raw_key is not None:
            raw_bill_id = bill_map.get(raw_key)
            if raw_bill_id is not None:
                bill = session.get(Bill, raw_bill_id)
    subject_type = SubjectType.UNKNOWN
    vote_relationship = VoteRelationship.UNKNOWN
    measure_subtype: MeasureSubtype | None = None
    measure_function: MeasureFunction | None = None
    measure_type: str | None = None
    is_legislation_related = False
    is_direct_text = False
    is_substantive = False
    is_lawmaking_vehicle = False
    is_special_rule = False
    measure_links: list[VoteMeasureLink] = []
    # Branch order matters: non-legislative subjects are ruled out before any
    # amendment or measure evidence is considered.
    if vote.raw_nomination_ref or "nomination" in normalized_text(question_text, result_text):
        subject_type = SubjectType.NOMINATION
        vote_relationship = VoteRelationship.NON_LEGISLATIVE
    elif vote.raw_treaty_ref or "treaty" in normalized_text(question_text, result_text):
        subject_type = SubjectType.TREATY
        vote_relationship = VoteRelationship.NON_LEGISLATIVE
    elif is_non_legislative_question(question_text, result_text):
        subject_type = SubjectType.CHAMBER_ADMIN
        vote_relationship = VoteRelationship.NON_LEGISLATIVE
    elif amendment is not None or has_amendment_signal(question_text, selected_action_text, raw_amendment_ref=vote.raw_amendment_ref):
        subject_type = SubjectType.AMENDMENT
        is_legislation_related = True
        if is_direct_amendment_text_question(question_text, selected_action_text):
            vote_relationship = VoteRelationship.AMENDMENT_TEXT_VOTE
            is_direct_text = True
            is_substantive = True
        else:
            vote_relationship = VoteRelationship.PROCEDURAL_RELATED_TO_AMENDMENT
        # A link is only possible when the matched amendment knows which bill it amends.
        if amendment is not None and amendment.amended_bill_id is not None:
            role = (
                VoteMeasureRole.AMENDS
                if vote_relationship is VoteRelationship.AMENDMENT_TEXT_VOTE
                else VoteMeasureRole.PROCEDURAL_TARGET
            )
            measure_links.append(
                VoteMeasureLink(
                    vote_id=vote.id,
                    measure_id=amendment.amended_bill_id,
                    role=role,
                    source=method.value,
                    confidence=confidence,
                    notes=amendment.purpose,
                )
            )
    elif bill is not None or vote.raw_bill_ref:
        # Reached with bill None when a raw bill reference exists but could not be resolved.
        subject_type = SubjectType.MEASURE
        is_legislation_related = True
        if bill is not None:
            measure_type = measure_type_value(bill.bill_type)
            measure_subtype = measure_subtype_for_bill_type(bill.bill_type)
            measure_function = measure_function_for_vote(
                bill=bill,
                question=question_text,
                action_text=selected_action_text,
            )
            is_special_rule = measure_function is MeasureFunction.SPECIAL_RULE
            is_lawmaking_vehicle = measure_subtype in {
                MeasureSubtype.BILL,
                MeasureSubtype.JOINT_RESOLUTION,
                MeasureSubtype.CONCURRENT_RESOLUTION,
            }
        if is_direct_measure_text_question(question_text, selected_action_text):
            vote_relationship = VoteRelationship.DIRECT_TEXT_VOTE
            is_direct_text = True
            # Special rules, chamber-internal and commemorative measures are
            # direct text votes but are not counted as substantive policy votes.
            is_substantive = not is_special_rule and measure_function not in {
                MeasureFunction.CHAMBER_INTERNAL,
                MeasureFunction.COMMEMORATIVE_OR_SENSE_OF,
            }
        elif is_procedural_question(question_text, selected_action_text):
            vote_relationship = VoteRelationship.PROCEDURAL_RELATED_TO_MEASURE
        else:
            vote_relationship = VoteRelationship.UNKNOWN
        if bill is not None:
            role = (
                VoteMeasureRole.VOTED_ON
                if vote_relationship is VoteRelationship.DIRECT_TEXT_VOTE
                else VoteMeasureRole.PROCEDURAL_TARGET
            )
            measure_links.append(
                VoteMeasureLink(
                    vote_id=vote.id,
                    measure_id=bill.id,
                    role=role,
                    source=method.value,
                    confidence=confidence,
                    notes=None,
                )
            )
            if is_special_rule:
                # Look for the measures the rule governs in the rule's titles,
                # the matched action text and the vote question.
                underlying_refs = parse_measure_references(
                    " ".join(
                        filter(
                            None,
                            [bill.title, bill.title_short, bill.official_title, selected_action_text, question_text],
                        )
                    ),
                    congress=vote.congress,
                )
                # Seeded with the rule itself so it is never linked as its own target.
                seen_measure_ids: set[int] = {bill.id}
                for key in underlying_refs:
                    linked_bill_id = bill_map.get(key)
                    if linked_bill_id is None or linked_bill_id in seen_measure_ids:
                        continue
                    seen_measure_ids.add(linked_bill_id)
                    measure_links.append(
                        VoteMeasureLink(
                            vote_id=vote.id,
                            measure_id=linked_bill_id,
                            role=VoteMeasureRole.RULE_FOR,
                            source="measure_text_parse",
                            confidence=ConfidenceLevel.MEDIUM,
                            notes="parsed from rule title/question/action text",
                        )
                    )
                # Only the rule's own id is present: nothing else was resolved.
                if len(seen_measure_ids) <= 1:
                    audit_rows.append(
                        VoteContextAudit(
                            vote_id=vote.id,
                            step="vote_context_classify",
                            message="special rule detected but no underlying measure could be resolved from available text",
                            severity="warning",
                        )
                    )
    else:
        # No rule applied; the classification below keeps its UNKNOWN defaults.
        audit_rows.append(
            VoteContextAudit(
                vote_id=vote.id,
                step="vote_context_classify",
                message="vote remains unclassified after action matching and raw-source parsing",
                severity="warning",
            )
        )
    classification = VoteClassification(
        vote_id=vote.id,
        subject_type=subject_type,
        measure_type=measure_type,
        measure_subtype=measure_subtype,
        measure_function=measure_function,
        vote_relationship=vote_relationship,
        is_legislation_related=is_legislation_related,
        is_direct_vote_on_legislative_text=is_direct_text,
        is_substantive_policy_vote=is_substantive,
        is_lawmaking_vehicle=is_lawmaking_vehicle,
        is_special_rule=is_special_rule,
        classification_method=method,
        classification_confidence_reason=(
            "matched recorded vote tuple to official action"
            if selected_match is not None
            else "classified from raw vote metadata only"
        ),
        confidence=confidence,
        classified_at=datetime.now(UTC),
        classification_version=CLASSIFICATION_VERSION,
    )
    return classification, measure_links, audit_rows
def resolve_vote_text_targets(
    session: Session,
    *,
    congress_numbers: Sequence[int],
) -> None:
    """Rebuild the ``VoteTextTarget`` row of every classified vote in scope.

    Existing text targets for votes in ``congress_numbers`` are deleted and the
    deletion committed. Each vote that has a classification then gets one new
    target from ``resolve_text_target_for_vote``; votes without a
    classification are skipped. Nothing happens when no vote is in scope.
    """
    probe = select(Vote.id).where(Vote.congress.in_(list(congress_numbers))).limit(1)
    if session.scalar(probe) is None:
        return

    vote_ids_subquery = select(Vote.id).where(Vote.congress.in_(list(congress_numbers)))
    stale_targets = delete(VoteTextTarget).where(VoteTextTarget.vote_id.in_(vote_ids_subquery))
    session.execute(stale_targets)
    session.commit()

    loader_options = (
        joinedload(Vote.classification),
        selectinload(Vote.vote_measure_links).joinedload(VoteMeasureLink.measure).selectinload(Bill.bill_texts),
        selectinload(Vote.action_matches).joinedload(VoteActionMatch.bill_action).joinedload(BillAction.bill),
        selectinload(Vote.action_matches)
        .joinedload(VoteActionMatch.amendment_action)
        .joinedload(AmendmentAction.amendment),
    )
    vote_stmt = (
        select(Vote)
        .where(Vote.congress.in_(list(congress_numbers)))
        .options(*loader_options)
        .order_by(Vote.id)
    )
    for vote in session.scalars(vote_stmt).all():
        if vote.classification is None:
            continue
        chosen_match = None
        for match in vote.action_matches:
            if match.is_selected:
                chosen_match = match
                break
        session.add(resolve_text_target_for_vote(vote=vote, selected_match=chosen_match))
    session.commit()
def resolve_text_target_for_vote(
    *,
    vote: Vote,
    selected_match: VoteActionMatch | None,
) -> VoteTextTarget:
    """Build the ``VoteTextTarget`` describing which official text a vote was on.

    Outcomes, checked in order:

    * Amendment subjects get an ``AMENDMENT_TEXT`` target with no text version
      ids, carrying the matched amendment's id when one exists.
    * Votes that are not direct votes on a measure's text get a ``NONE`` target.
    * Direct measure votes with no ``VOTED_ON`` link get an ``UNKNOWN`` target
      at low confidence.
    * Otherwise the voted and resulting text versions are chosen from the
      linked measure's bill texts.

    ``vote.classification`` must be set; the function asserts it.
    """
    classification = vote.classification
    assert classification is not None
    subject = classification.subject_type

    if subject is SubjectType.AMENDMENT:
        amendment_id = None
        if selected_match and selected_match.amendment_action is not None:
            amendment_id = selected_match.amendment_action.amendment_id
        return VoteTextTarget(
            vote_id=vote.id,
            text_target_type=TextTargetType.AMENDMENT_TEXT,
            voted_text_version_id=None,
            resulting_text_version_id=None,
            related_amendment_id=amendment_id,
            text_target_basis=TextTargetBasis.AMENDMENT_TEXT,
            text_resolution_method=TextResolutionMethod.AMENDMENT_TEXT_UNMODELED_PHASE1,
            text_resolution_confidence_reason="phase 1 does not store amendment text artifacts",
            confidence=classification.confidence,
            notes=None,
        )

    is_direct_measure_vote = (
        subject is SubjectType.MEASURE and classification.is_direct_vote_on_legislative_text
    )
    if not is_direct_measure_vote:
        return VoteTextTarget(
            vote_id=vote.id,
            text_target_type=TextTargetType.NONE,
            voted_text_version_id=None,
            resulting_text_version_id=None,
            related_amendment_id=None,
            text_target_basis=TextTargetBasis.NO_TEXT_TARGET,
            text_resolution_method=TextResolutionMethod.NO_TEXT_TARGET,
            text_resolution_confidence_reason="vote was not a direct vote on legislative text",
            confidence=classification.confidence,
            notes=None,
        )

    # First VOTED_ON link wins.
    voted_on_measure = None
    for link in vote.vote_measure_links:
        if link.role is VoteMeasureRole.VOTED_ON:
            voted_on_measure = link.measure
            break
    if voted_on_measure is None:
        return VoteTextTarget(
            vote_id=vote.id,
            text_target_type=TextTargetType.UNKNOWN,
            voted_text_version_id=None,
            resulting_text_version_id=None,
            related_amendment_id=None,
            text_target_basis=TextTargetBasis.UNKNOWN,
            text_resolution_method=TextResolutionMethod.UNKNOWN,
            text_resolution_confidence_reason="no voted_on measure link exists for direct text vote",
            confidence=ConfidenceLevel.LOW,
            notes=None,
        )

    # Take date and text from the matched action, bill action first.
    matched_action = None
    if selected_match is not None:
        if selected_match.bill_action is not None:
            matched_action = selected_match.bill_action
        elif selected_match.amendment_action is not None:
            matched_action = selected_match.amendment_action
    action_date = None
    action_text = ""
    if matched_action is not None:
        action_date = matched_action.action_date
        action_text = matched_action.action_text
    if action_date is None:
        # No usable action date: derive one from the vote's own timestamp/date.
        action_date = legislative_date_for_comparison(
            vote_datetime=vote.vote_datetime,
            fallback_date=vote.vote_date,
        )

    candidate_texts = list(voted_on_measure.bill_texts)
    preferred_codes = preferred_bill_text_codes(
        vote=vote,
        bill=voted_on_measure,
        classification=classification,
        action_text=action_text,
    )
    voted_text, method, basis = choose_best_bill_text(
        candidate_texts=candidate_texts,
        action_date=action_date,
        preferred_codes=preferred_codes,
        classification=classification,
    )
    resulting_text = determine_resulting_text_version(
        candidate_texts=candidate_texts,
        action_date=action_date,
        action_text=action_text,
        voted_text=voted_text,
    )

    if classification.measure_subtype is MeasureSubtype.BILL:
        text_target_type = TextTargetType.BILL_TEXT
    else:
        text_target_type = TextTargetType.RESOLUTION_TEXT
    if classification.is_special_rule:
        # Special rules always report the rule-resolution basis.
        basis = TextTargetBasis.RULE_RESOLUTION_TEXT

    if voted_text is None:
        confidence = ConfidenceLevel.LOW
        reason = "no eligible bill text version found"
        voted_text_id = None
    else:
        confidence = classification.confidence
        reason = "resolved from official bill text versions"
        voted_text_id = voted_text.id
    resulting_text_id = None if resulting_text is None else resulting_text.id

    return VoteTextTarget(
        vote_id=vote.id,
        text_target_type=text_target_type,
        voted_text_version_id=voted_text_id,
        resulting_text_version_id=resulting_text_id,
        related_amendment_id=None,
        text_target_basis=basis,
        text_resolution_method=method,
        text_resolution_confidence_reason=reason,
        confidence=confidence,
        notes=None,
    )
def preferred_bill_text_codes(
    *,
    vote: Vote,
    bill: Bill,
    classification: VoteClassification,
    action_text: str | None,
) -> tuple[str, ...]:
    """Return text-version codes to prefer for this vote, most preferred first.

    The choice depends, in order, on whether the vote is on a special rule,
    whether the combined question/action/result text mentions a conference
    report or a concurrence, whether the voting chamber is the measure's
    chamber of origin, and whether the text says "without amendment". A
    chamber-specific default is returned when none of those apply.
    """
    action = normalized_text(vote.question, action_text, vote.result_text)
    house_origin = is_house_origin_measure(bill)
    senate_origin = is_senate_origin_measure(bill)
    in_house = vote.chamber == "House"
    in_senate = vote.chamber == "Senate"

    if classification.is_special_rule:
        return ("ath", "ats", "eh", "es", "cph", "cps")
    if "conference report" in action:
        return ("enr", "eah", "eas", "eh", "es")
    concurrence_markers = ("concur", "with an amendment", "agreed to senate amendments")
    if any(marker in action for marker in concurrence_markers):
        return ("eah", "eas", "enr")

    # Vote taken in the chamber where the measure originated.
    if in_house and house_origin:
        return ("eh", "cph")
    if in_senate and senate_origin:
        return ("es", "cps")

    # Second chamber acting on the other chamber's measure unchanged.
    if "without amendment" in action:
        if in_senate and house_origin:
            return ("rfs", "rds", "eh", "cph")
        if in_house and senate_origin:
            return ("rfh", "rdh", "es", "cps")

    return ("eah", "eh", "cph") if in_house else ("eas", "es", "cps")
def choose_best_bill_text(
    *,
    candidate_texts: Sequence[BillText],
    action_date: date | None,
    preferred_codes: Sequence[str],
    classification: VoteClassification,
) -> tuple[BillText | None, TextResolutionMethod, TextTargetBasis]:
    """Pick the official text version that best fits a direct measure vote.

    Texts dated after ``action_date`` are excluded unless that would leave no
    candidates; undated texts are always eligible. Among eligible texts the
    winner is the maximum of: dated exactly on the action date, earlier
    position in ``preferred_codes``, later date, higher id.

    Returns:
        ``(text, method, basis)``. ``text`` is ``None`` only when
        ``candidate_texts`` is empty. ``classification`` is accepted for
        interface compatibility and is not read here.
    """
    if not candidate_texts:
        return None, TextResolutionMethod.UNKNOWN, TextTargetBasis.UNKNOWN

    ranked_codes = tuple(code.lower() for code in preferred_codes)

    def not_after_action(bill_text: BillText) -> bool:
        return action_date is None or bill_text.date is None or bill_text.date <= action_date

    # Fall back to every candidate when all of them post-date the action.
    pool = [bill_text for bill_text in candidate_texts if not_after_action(bill_text)] or list(
        candidate_texts
    )

    def preference(bill_text: BillText) -> tuple[int, int, int, int]:
        version = bill_text.version_code.lower()
        on_action_date = int(action_date is not None and bill_text.date == action_date)
        # Earlier entries in the preference tuple rank higher; unlisted codes rank last.
        code_rank = -ranked_codes.index(version) if version in ranked_codes else -999
        dated = bill_text.date or date.min
        return (on_action_date, code_rank, dated.toordinal(), bill_text.id)

    best = max(pool, key=preference)
    best_code = best.version_code.lower()
    code_is_preferred = best_code in ranked_codes

    if action_date is not None and best.date == action_date:
        exact_method = (
            TextResolutionMethod.TEXT_EXACT_ACTION_DATE_AND_CODE
            if code_is_preferred
            else TextResolutionMethod.TEXT_EXACT_ACTION_DATE_WRONG_CODE
        )
        return best, exact_method, TextTargetBasis.EXACT_ACTION_TEXT_VERSION

    if code_is_preferred and best_code in {"rfh", "rdh", "rfs", "rds"}:
        return (
            best,
            TextResolutionMethod.TEXT_RECEIVED_PRIOR_CHAMBER_VERSION,
            TextTargetBasis.RECEIVED_PRIOR_CHAMBER_VERSION,
        )

    # Both a preferred non-"received" code and a non-preferred code end here.
    return (
        best,
        TextResolutionMethod.TEXT_PRIOR_VERSION_CODE_MATCH,
        TextTargetBasis.RESULTING_ENGROSSED_VERSION,
    )
def determine_resulting_text_version(
    *,
    candidate_texts: Sequence[BillText],
    action_date: date | None,
    action_text: str | None,
    voted_text: BillText | None,
) -> BillText | None:
    """Return the text version that resulted from the vote.

    ``None`` is returned when there is no voted text. The voted text itself is
    returned unless the normalized action text contains one of the final-action
    phrases checked below and an ``enr`` version dated on or after
    ``action_date`` (or undated) exists. In that case the earliest such
    version is returned, with ties broken by id.
    """
    if voted_text is None:
        return None
    action = normalized_text(action_text)
    if not action:
        return voted_text

    final_action_phrases = (
        "without amendment",
        "conference report",
        "agreed to senate amendment",
        "agreed to house amendment",
    )
    if not any(phrase in action for phrase in final_action_phrases):
        return voted_text

    enrolled_versions = [
        bill_text
        for bill_text in candidate_texts
        if bill_text.version_code.lower() == "enr"
        and (action_date is None or bill_text.date is None or bill_text.date >= action_date)
    ]
    # Undated versions sort last via date.max.
    enrolled_versions = sorted(
        enrolled_versions,
        key=lambda bill_text: (bill_text.date or date.max, bill_text.id),
    )
    if not enrolled_versions:
        return voted_text
    return enrolled_versions[0]
def resolve_vote_position_meanings(
    session: Session,
    *,
    congress_numbers: Sequence[int],
) -> None:
    """Rebuild the ``VotePositionMeaning`` row of every classified vote in scope.

    Existing rows for votes in ``congress_numbers`` are deleted and the deletion
    committed. One new row per classified vote is then added via
    ``resolve_vote_position_meaning_for_vote``; votes without a classification
    are skipped. Nothing happens when no vote is in scope.
    """
    probe = select(Vote.id).where(Vote.congress.in_(list(congress_numbers))).limit(1)
    if session.scalar(probe) is None:
        return

    vote_ids_subquery = select(Vote.id).where(Vote.congress.in_(list(congress_numbers)))
    session.execute(
        delete(VotePositionMeaning).where(VotePositionMeaning.vote_id.in_(vote_ids_subquery))
    )
    session.commit()

    vote_stmt = (
        select(Vote)
        .where(Vote.congress.in_(list(congress_numbers)))
        .options(joinedload(Vote.classification))
        .order_by(Vote.id)
    )
    for vote in session.scalars(vote_stmt).all():
        if vote.classification is not None:
            session.add(resolve_vote_position_meaning_for_vote(vote=vote))
    session.commit()
def resolve_vote_position_meaning_for_vote(*, vote: Vote) -> VotePositionMeaning:
    """Translate Yea/Nay for one classified vote into semantic effects.

    Direct text and amendment text votes map Yea/Nay to supports/opposes text.
    Procedural votes map to advances/blocks procedure when the question names a
    motion to table (Yea blocks) or cloture, a motion to proceed, or the
    previous question (Yea advances). Every other case leaves the effects
    unknown and lowers the confidence. The Present effect is always unknown.

    ``vote.classification`` must be set; the function asserts it.
    """
    classification = vote.classification
    assert classification is not None
    relationship = classification.vote_relationship
    normalized_question = normalized_text(vote.question, vote.result_text)

    yea_effect = nay_effect = present_effect = VoteEffect.UNKNOWN
    confidence = classification.confidence
    method = "classification_relationship"

    text_vote_relationships = {
        VoteRelationship.DIRECT_TEXT_VOTE,
        VoteRelationship.AMENDMENT_TEXT_VOTE,
    }
    procedural_relationships = {
        VoteRelationship.PROCEDURAL_RELATED_TO_MEASURE,
        VoteRelationship.PROCEDURAL_RELATED_TO_AMENDMENT,
    }
    advancing_tokens = ("cloture", "motion to proceed", "previous question", "ordering the previous question")

    if relationship in text_vote_relationships:
        yea_effect, nay_effect = VoteEffect.SUPPORTS_TEXT, VoteEffect.OPPOSES_TEXT
    elif relationship in procedural_relationships:
        if "motion to table" in normalized_question:
            # Tabling disposes of the matter, so Yea is the blocking side.
            yea_effect, nay_effect = VoteEffect.BLOCKS_PROCEDURE, VoteEffect.ADVANCES_PROCEDURE
        elif any(token in normalized_question for token in advancing_tokens):
            yea_effect, nay_effect = VoteEffect.ADVANCES_PROCEDURE, VoteEffect.BLOCKS_PROCEDURE
        else:
            confidence = ConfidenceLevel.LOW
            method = "classification_relationship_unknown_procedural_polarity"
    else:
        confidence = ConfidenceLevel.LOW
        method = "non_legislative_or_unknown"

    return VotePositionMeaning(
        vote_id=vote.id,
        yea_effect=yea_effect,
        nay_effect=nay_effect,
        present_effect=present_effect,
        polarity_confidence=confidence,
        polarity_method=method,
        notes=None,
    )
def create_score_run(session: Session) -> ScoreRun:
    """Start a new score run and flush it so the caller gets a persisted row.

    The run is linked to the ``IngestRun`` with the highest id (``None`` when
    no ingest run exists), stamped with the current classifier and scoring
    versions, and starts with zero included/excluded votes and no completion
    time. The session is flushed but not committed.
    """
    newest_ingest_stmt = select(IngestRun.id).order_by(IngestRun.id.desc()).limit(1)
    latest_ingest_run_id = session.scalar(newest_ingest_stmt)
    new_run = ScoreRun(
        ingest_run_id=latest_ingest_run_id,
        classifier_version=CLASSIFICATION_VERSION,
        scoring_version=SCORING_VERSION,
        included_vote_count=0,
        excluded_vote_count=0,
        started_at=datetime.now(UTC),
        completed_at=None,
    )
    session.add(new_run)
    session.flush()
    return new_run
def finalize_score_run(
session: Session,
*,
score_run: ScoreRun,
included_vote_count: int,
excluded_vote_count: int,
) -> None:
"""Mark a score run complete."""
score_run.included_vote_count = included_vote_count
score_run.excluded_vote_count = excluded_vote_count
score_run.completed_at = datetime.now(UTC)
session.flush()