adding math to bob

added config.toml to git ignore
updated BenchmarkConfig to have from_toml
2026-04-12 10:08:23 -04:00 · 2026-04-12 10:08:23 -04:00 · 2026-04-12 10:08:23 -04:00 · 2026-04-12 10:08:23 -04:00 · 2026-04-12 10:08:23 -04:00 · 2026-04-12 10:08:23 -04:00
161 changed files with 11228 additions and 7050 deletions
@@ -23,6 +23,6 @@ jobs:
    steps:
      - uses: actions/checkout@v4
      - name: Build default package
-        run: "nixos-rebuild build --accept-flake-config --flake ./#${{ matrix.system }}"
+        run: "nixos-rebuild build --flake ./#${{ matrix.system }}"
      - name: copy to nix-cache
        run: nix copy --accept-flake-config --to unix:///host-nix/var/nix/daemon-socket/socket .#nixosConfigurations.${{ matrix.system }}.config.system.build.toplevel
@@ -0,0 +1,30 @@
 # Follow-up workflow for build_systems: once a build run finishes, run the
 # python.eval_warnings.main module against that run.
 name: fix_eval_warnings
 on:
  workflow_run:
    workflows: ["build_systems"]
    types: [completed]
 jobs:
  check-warnings:
    # Skip cancelled builds, and only react to runs on main that were started
    # by a push or by the schedule.
    if: >-
      github.event.workflow_run.conclusion != 'cancelled' &&
      github.event.workflow_run.head_branch == 'main' &&
      (github.event.workflow_run.event == 'push' || github.event.workflow_run.event == 'schedule')
    runs-on: self-hosted
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
      - name: Fix eval warnings
        # Expression values (including the secret URL) are handed to the shell
        # as environment variables instead of being expanded into the script
        # text, so their contents can never be interpreted as shell syntax.
        env:
          GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
          OLLAMA_ENDPOINT: ${{ secrets.OLLAMA_URL }}
          TARGET_REPO: ${{ github.repository }}
          WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
          WORKFLOW_RUN_URL: ${{ github.event.workflow_run.html_url }}
        # Folded scalar: the lines below are joined into a single command.
        run: >-
          nix develop .#devShells.x86_64-linux.default -c
          python -m python.eval_warnings.main
          --run-id "$WORKFLOW_RUN_ID"
          --repo "$TARGET_REPO"
          --ollama-url "$OLLAMA_ENDPOINT"
          --run-url "$WORKFLOW_RUN_URL"
@@ -6,18 +6,24 @@ on:
 jobs:
  merge:
-    runs-on: self-hosted
+    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: merge_flake_lock_update
-        run: >-
+        run: |
-          nix develop .#devShells.x86_64-linux.default -c
+          pr_number=$(gh pr list --state open --author RichieCahill --label flake_lock_update --json number --jq '.[0].number')
-          python -m python.gitea_flake_lock merge
+          echo "pr_number=$pr_number" >> $GITHUB_ENV
-          --repo "${{ github.repository }}"
+          if [ -n "$pr_number" ]; then
            gh pr merge "$pr_number" --rebase
          else
            echo "No open PR found with label flake_lock_update"
          fi
        env:
-          GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
          GITEA_URL: https://gitea.tmmworkshop.com
@@ -1,13 +1,13 @@
 name: pytest
 on:
  workflow_dispatch:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  merge_group:
 jobs:
  pytest:
@@ -6,21 +6,18 @@ on:
 jobs:
  lockfile:
-    runs-on: self-hosted
+    runs-on: ubuntu-latest
    permissions:
      actions: write
      contents: write
      pull-requests: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
-        run: nix flake update
+        uses: DeterminateSystems/update-flake-lock@main
-      - name: Create or update flake.lock PR
+        with:
-        env:
+          token: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
-          GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
+          pr-title: "Update flake.lock"
-          GITEA_URL: https://gitea.tmmworkshop.com
+          pr-labels: |
-        run: >-
+            dependencies
-          nix develop .#devShells.x86_64-linux.default -c
+            automated
-          python -m python.gitea_flake_lock update
+            flake_lock_update
          --repo "${{ github.repository }}"
@@ -170,5 +170,6 @@ test.*
 frontend/dist/
 frontend/node_modules/
-# data from testing llms
+# data dir for training, validation, and testing
-data/*
+data/
 config.toml
@@ -40,6 +40,7 @@
    "cgroupdriver",
    "charliermarsh",
    "Checkpointing",
    "cloudflared",
    "codellama",
    "codezombiech",
    "compactmode",
@@ -203,7 +204,6 @@
    "peerconnection",
    "PESKYFOX",
    "PGID",
    "pgvector",
    "pipewire",
    "pkgs",
    "plugdev",
@@ -0,0 +1,12 @@
 ## Dev environment tips
 - use treefmt to format all files
 - make python code ruff compliant
 - use pytest to test python code
 - always use the minimum amount of complexity
 - If a judgment call is easy to reverse, make it; if not, ask me first.
 - Match existing code style.
 - Prefer the builtin helper getenv() over os.environ.get.
 - Prefer single-purpose functions over “do everything” helpers.
 - Avoid compatibility branches like PG_USER and POSTGRESQL_URL unless requested.
 - Keep helpers only if they are reused or simplify the code; otherwise, inline them.
@@ -23,10 +23,7 @@
  boot = {
    tmp.useTmpfs = true;
    kernelPackages = lib.mkDefault pkgs.linuxPackages_6_12;
-    zfs = {
+    zfs.package = lib.mkDefault pkgs.zfs_2_4;
      package = lib.mkDefault pkgs.zfs_2_4;
      forceImportRoot = lib.mkDefault false;
    };
  };
  hardware.enableRedistributableFirmware = true;
@@ -40,17 +37,10 @@
  nixpkgs = {
    overlays = builtins.attrValues outputs.overlays;
-    config = {
+    config.allowUnfree = true;
      allowUnfree = true;
      permittedInsecurePackages = [
        "openssl-1.1.1w" # This is for discord-canary
      ];
    };
  };
  services = {
    dbus.implementation = "dbus";
    # firmware update
    fwupd.enable = true;
@@ -34,7 +34,6 @@ in
      warn-dirty = false;
      flake-registry = ""; # disable global flake registries
      connect-timeout = 10;
      download-buffer-size = 536870912;
      fallback = true;
    };
@@ -1,256 +0,0 @@
 {
  config,
  lib,
  pkgs,
  ...
 }:
 let
  monitoringInterface = "ztwfunumly";
  nodeTextfileDir = "/var/lib/prometheus-node-exporter-textfile";
  mkProcessNameTemplate =
    perPid: template: if perPid then "${template}:{{.PID}}:{{.StartTime}}" else template;
  mkProcessMatchers = perPid: [
    {
      name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Module}}";
      cmdline = [ "^/nix/store[^ ]*/bin/python[^ ]* -m (?P<Module>[^ ]+)" ];
    }
    {
      name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
      cmdline = [
        "^/nix/store[^ ]*/bin/python[^ ]* /nix/store[^ ]*/bin/\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)"
      ];
    }
    {
      name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
      cmdline = [
        "^/nix/store[^ ]*/bin/node /nix/store[^ ]*-(?P<Wrapped>[A-Za-z0-9._+-]+)-[0-9][^ /]*/"
      ];
    }
    {
      name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
      cmdline = [ "^/nix/store[^ ]*/(?:bin/|lib/[^ ]*/)?\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)" ];
    }
    {
      name = mkProcessNameTemplate perPid "{{.Username}}:{{.ExeBase}}";
      cmdline = [ ".+" ];
    }
  ];
  perPidConfig = pkgs.writeText "process-exporter-per-pid.yaml" (
    builtins.toJSON {
      process_names = mkProcessMatchers true;
    }
  );
  zpoolLatencyScript = pkgs.writeShellScript "zpool-latency-exporter" ''
        set -euo pipefail
        out_dir=${lib.escapeShellArg nodeTextfileDir}
        host=${lib.escapeShellArg config.networking.hostName}
        tmp_file="$(mktemp "$out_dir/zpool.prom.XXXXXX")"
        trap 'rm -f "$tmp_file"' EXIT
        pools="$(zpool list -H -o name | paste -sd, -)"
        cat >"$tmp_file" <<'EOF'
    # HELP zpool_iostat_total_wait_read_ns Average total read wait time reported by zpool iostat.
    # TYPE zpool_iostat_total_wait_read_ns gauge
    # HELP zpool_iostat_total_wait_write_ns Average total write wait time reported by zpool iostat.
    # TYPE zpool_iostat_total_wait_write_ns gauge
    # HELP zpool_iostat_disk_wait_read_ns Average disk read wait time reported by zpool iostat.
    # TYPE zpool_iostat_disk_wait_read_ns gauge
    # HELP zpool_iostat_disk_wait_write_ns Average disk write wait time reported by zpool iostat.
    # TYPE zpool_iostat_disk_wait_write_ns gauge
    # HELP zpool_iostat_syncq_wait_read_ns Average synchronous queue read wait time reported by zpool iostat.
    # TYPE zpool_iostat_syncq_wait_read_ns gauge
    # HELP zpool_iostat_syncq_wait_write_ns Average synchronous queue write wait time reported by zpool iostat.
    # TYPE zpool_iostat_syncq_wait_write_ns gauge
    # HELP zpool_iostat_asyncq_wait_read_ns Average asynchronous queue read wait time reported by zpool iostat.
    # TYPE zpool_iostat_asyncq_wait_read_ns gauge
    # HELP zpool_iostat_asyncq_wait_write_ns Average asynchronous queue write wait time reported by zpool iostat.
    # TYPE zpool_iostat_asyncq_wait_write_ns gauge
    EOF
        zpool iostat -Hplvy -y 1 1 | awk -F '\t' -v host="$host" -v pools="$pools" '
          function esc(str, out) {
            out = str
            gsub(/\\/, "\\\\", out)
            gsub(/"/, "\\\"", out)
            return out
          }
          function emit(metric, pool, vdev, value) {
            if (value == "" || value == "-") {
              return
            }
            printf "%s{host=\"%s\",pool=\"%s\",vdev=\"%s\"} %s\n",
              metric,
              esc(host),
              esc(pool),
              esc(vdev),
              value
          }
          BEGIN {
            split(pools, pool_names, ",")
            for (idx in pool_names) {
              if (pool_names[idx] != "") {
                known_pools[pool_names[idx]] = 1
              }
            }
          }
          NF == 0 {
            next
          }
          {
            row_name = $1
            if (row_name in known_pools) {
              current_pool = row_name
              current_vdev = "_pool"
            } else if (current_pool == "") {
              next
            } else {
              current_vdev = row_name
            }
            emit("zpool_iostat_total_wait_read_ns", current_pool, current_vdev, $8)
            emit("zpool_iostat_total_wait_write_ns", current_pool, current_vdev, $9)
            emit("zpool_iostat_disk_wait_read_ns", current_pool, current_vdev, $10)
            emit("zpool_iostat_disk_wait_write_ns", current_pool, current_vdev, $11)
            emit("zpool_iostat_syncq_wait_read_ns", current_pool, current_vdev, $12)
            emit("zpool_iostat_syncq_wait_write_ns", current_pool, current_vdev, $13)
            emit("zpool_iostat_asyncq_wait_read_ns", current_pool, current_vdev, $14)
            emit("zpool_iostat_asyncq_wait_write_ns", current_pool, current_vdev, $15)
          }
        ' >>"$tmp_file"
        mv "$tmp_file" "$out_dir/zpool.prom"
        trap - EXIT
  '';
 in
 {
  networking.firewall.interfaces.${monitoringInterface}.allowedTCPPorts = [
    9100
    9134
    9256
    9257
    9633
  ];
  services.prometheus.exporters = {
    node = {
      enable = true;
      enabledCollectors = [
        "pressure"
        "processes"
        "systemd"
      ];
      extraFlags = [ "--collector.textfile.directory=${nodeTextfileDir}" ];
    };
    process = {
      enable = true;
      user = "root";
      group = "root";
      settings.process_names = mkProcessMatchers false;
      extraFlags = [
        "-gather-smaps=false"
        "-remove-empty-groups=true"
        "-threads=false"
      ];
    };
    smartctl.enable = true;
    zfs.enable = true;
  };
  programs.atop = {
    enable = true;
    atopService.enable = true;
    atopRotateTimer.enable = true;
    atopacctService.enable = true;
    settings.interval = 30;
  };
  systemd = {
    services = {
      prometheus-process-pid-exporter = {
        description = "Prometheus process exporter with per-PID naming";
        wantedBy = [ "multi-user.target" ];
        after = [ "network.target" ];
        serviceConfig = {
          ExecStart = ''
            ${pkgs.prometheus-process-exporter}/bin/process-exporter \
              --web.listen-address 0.0.0.0:9257 \
              --config.path ${perPidConfig} \
              -children=false \
              -gather-smaps=false \
              -remove-empty-groups=true \
              -threads=false
          '';
          User = "root";
          Group = "root";
          Restart = "always";
          WorkingDirectory = "/tmp";
          CapabilityBoundingSet = [ "" ];
          DeviceAllow = [ "" ];
          LockPersonality = true;
          MemoryDenyWriteExecute = true;
          NoNewPrivileges = true;
          PrivateDevices = true;
          PrivateTmp = true;
          ProtectClock = true;
          ProtectControlGroups = true;
          ProtectHome = true;
          ProtectHostname = true;
          ProtectKernelLogs = true;
          ProtectKernelModules = true;
          ProtectKernelTunables = true;
          ProtectSystem = "strict";
          RemoveIPC = true;
          RestrictAddressFamilies = [
            "AF_INET"
            "AF_INET6"
          ];
          RestrictNamespaces = true;
          RestrictRealtime = true;
          RestrictSUIDSGID = true;
          SystemCallArchitectures = "native";
          UMask = "0077";
        };
      };
      zpool-latency-exporter = {
        description = "Exports ZFS latency metrics for node_exporter textfile collection";
        after = [ "zfs-import.target" ];
        requires = [ "zfs-import.target" ];
        path = [
          config.boot.zfs.package
          pkgs.coreutils
          pkgs.gawk
        ];
        serviceConfig = {
          Type = "oneshot";
          ExecStart = zpoolLatencyScript;
        };
      };
    };
    timers.zpool-latency-exporter = {
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnBootSec = "2m";
        OnUnitActiveSec = "60s";
        Unit = "zpool-latency-exporter.service";
      };
    };
    tmpfiles.rules = [ "d ${nodeTextfileDir} 0755 root root - -" ];
  };
 }
@@ -12,7 +12,7 @@
      brain.id = "SSCGIPI-IV3VYKB-TRNIJE3-COV4T2H-CDBER7F-I2CGHYA-NWOEUDU-3T5QAAN"; # cspell:disable-line
      ipad.id = "KI76T3X-SFUGV2L-VSNYTKR-TSIUV5L-SHWD3HE-GQRGRCN-GY4UFMD-CW6Z6AX"; # cspell:disable-line
      jeeves.id = "ICRHXZW-ECYJCUZ-I4CZ64R-3XRK7CG-LL2HAAK-FGOHD22-BQA4AI6-5OAL6AG"; # cspell:disable-line
-      phone.id = "JPVQKQW-CFXOJXT-Q5G5F3H-QIDHDRE-GKHPTQB-GXZUQSP-U7FR7F7-INP3AAH"; # cspell:disable-line
+      phone.id = "TBRULKD-7DZPGGZ-F6LLB7J-MSO54AY-7KLPBIN-QOFK6PX-W2HBEWI-PHM2CQI"; # cspell:disable-line
      rhapsody-in-green.id = "ASL3KC4-3XEN6PA-7BQBRKE-A7JXLI6-DJT43BY-Q4WPOER-7UALUAZ-VTPQ6Q4"; # cspell:disable-line
    };
  };
@@ -4,7 +4,7 @@
    flags = [ "--accept-flake-config" ];
    randomizedDelaySec = "1h";
    persistent = true;
-    flake = "git+https://gitea.tmmworkshop.com/richie/dotfiles?ref=main";
+    flake = "github:RichieCahill/dotfiles";
    allowReboot = true;
    dates = "Sat *-*-* 06:00:00";
  };
@@ -1,76 +0,0 @@
 # ZFS failed root import recovery
 ## Fast path
 If the machine fails to boot because ZFS refuses to import `root_pool`:
 ### GRUB
 1. At the bootloader menu, select the normal NixOS entry.
 2. Press `e`.
 3. Find the line that starts with `linux`.
 4. Append this to the end of that line:
 ```text
 zfs_force=1
 ```
 5. Boot once with `Ctrl+x` or `F10`.
 ### systemd-boot
 1. At the bootloader menu, highlight the normal NixOS entry.
 2. Press `e`.
 3. Append this to the end of the options line:
 ```text
 zfs_force=1
 ```
 4. Press `Enter` to boot once.
 ## After boot
 Run:
 ```bash
 sudo zpool status
 sudo zpool import
 journalctl -b | rg "ZFS|zfs|import|root_pool"
 ```
 ## Expected result
 `sudo zpool status` should show `root_pool` as `ONLINE`.
 ## Reboot test
 Run:
 ```bash
 sudo reboot
 ```
 Do not add `zfs_force=1` the second time.
 ## If it still fails
 Boot once more with:
 ```text
 zfs_force=1
 ```
 Then run:
 ```bash
 sudo zpool status -v
 sudo zpool history | tail -n 50
 journalctl -b | rg "ZFS|zfs|import|root_pool"
 ```
 ## Notes
 - Root pool name is `root_pool`.
 - This is a one-time recovery path after disk moves, controller changes, dirty exports, or interrupted imports.
 - Some hosts also need the LUKS unlock USB key inserted before boot.
@@ -8,11 +8,11 @@
      },
      "locked": {
        "dir": "pkgs/firefox-addons",
-        "lastModified": 1781150628,
+        "lastModified": 1773979456,
-        "narHash": "sha256-b4mp8l3qWuSCyYYo9HSngDtcB3PpecYiOXjULrjwwlw=",
+        "narHash": "sha256-9kBMJ5IvxqNlkkj/swmE8uK1Sc7TL/LIRUI958m7uBM=",
        "owner": "rycee",
        "repo": "nur-expressions",
-        "rev": "753319310f4673a2dabbfab87482187b40bf9bac",
+        "rev": "81e28f47ac18d9e89513929c77e711e657b64851",
        "type": "gitlab"
      },
      "original": {
@@ -29,11 +29,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1781189114,
+        "lastModified": 1774007980,
-        "narHash": "sha256-5inaamLgUMWy+MOBE9ChF9QAF1o/74LFuHkI0W/9rqc=",
+        "narHash": "sha256-FOnZjElEI8pqqCvB6K/1JRHTE8o4rer8driivTpq2uo=",
        "owner": "nix-community",
        "repo": "home-manager",
-        "rev": "486595d2cf49cfcd649b58a284fa11ac0e34da22",
+        "rev": "9670de2921812bc4e0452f6e3efd8c859696c183",
        "type": "github"
      },
      "original": {
@@ -43,15 +43,12 @@
      }
    },
    "nixos-hardware": {
      "inputs": {
        "nixpkgs": "nixpkgs"
      },
      "locked": {
-        "lastModified": 1781168557,
+        "lastModified": 1774018263,
-        "narHash": "sha256-LOnLQ2tpYF9gqIDDr3+j3DbpJJr/QCH6zPRT2GzEUOE=",
+        "narHash": "sha256-HHYEwK1A22aSaxv2ibhMMkKvrDGKGlA/qObG4smrSqc=",
        "owner": "nixos",
        "repo": "nixos-hardware",
-        "rev": "6358ff76821101c178e3ab4919a62799bfe3652e",
+        "rev": "2d4b4717b2534fad5c715968c1cece04a172b365",
        "type": "github"
      },
      "original": {
@@ -63,24 +60,27 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1767892417,
+        "lastModified": 1773821835,
-        "narHash": "sha256-8bW3q88CEg2u4hSP66Vf4lpbLonHz7hqDNBMcCY7E9U=",
+        "narHash": "sha256-TJ3lSQtW0E2JrznGVm8hOQGVpXjJyXY2guAxku2O9A4=",
-        "rev": "3497aa5c9457a9d88d71fa93a4a8368816fbeeba",
+        "owner": "nixos",
-        "type": "tarball",
+        "repo": "nixpkgs",
-        "url": "https://releases.nixos.org/nixos/unstable/nixos-26.05pre924538.3497aa5c9457/nixexprs.tar.xz"
+        "rev": "b40629efe5d6ec48dd1efba650c797ddbd39ace0",
        "type": "github"
      },
      "original": {
-        "type": "tarball",
+        "owner": "nixos",
-        "url": "https://channels.nixos.org/nixos-unstable/nixexprs.tar.xz"
+        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs-master": {
      "locked": {
-        "lastModified": 1781229721,
+        "lastModified": 1774051532,
-        "narHash": "sha256-ORvqDbb/LYxiJljGIejapjkc/kJbVote2N1WSb9W45I=",
+        "narHash": "sha256-d3CGMweyYIcPuTj5BKq+1Lx4zwlgL31nVtN647tOZKo=",
        "owner": "nixos",
        "repo": "nixpkgs",
-        "rev": "173d0ad7a974f8543a9ab01d2271b2e290341b33",
+        "rev": "8620c0b5cc8fbe76502442181be1d0514bc3a1b7",
        "type": "github"
      },
      "original": {
@@ -106,28 +106,12 @@
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1781074563,
        "narHash": "sha256-md8WlXOlfnIeHeOScMTTHFyf2d6iaTwPl2apR5EQ3P4=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "9ae611a455b90cf061d8f332b977e387bda8e1ca",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "firefox-addons": "firefox-addons",
        "home-manager": "home-manager",
        "nixos-hardware": "nixos-hardware",
-        "nixpkgs": "nixpkgs_2",
+        "nixpkgs": "nixpkgs",
        "nixpkgs-master": "nixpkgs-master",
        "nixpkgs-stable": "nixpkgs-stable",
        "sops-nix": "sops-nix",
@@ -141,11 +125,11 @@
        ]
      },
      "locked": {
-        "lastModified": 1780547341,
+        "lastModified": 1773889674,
-        "narHash": "sha256-Gq8KNx5A7hBB3uGJaj6eQfLDIz5YdLu92gqBcvHvoUo=",
+        "narHash": "sha256-+ycaiVAk3MEshJTg35cBTUa0MizGiS+bgpYw/f8ohkg=",
        "owner": "Mic92",
        "repo": "sops-nix",
-        "rev": "9ed65852b6257fbeae4355bc24ecfea307ca759a",
+        "rev": "29b6519f3e0780452bca0ac0be4584f04ac16cc5",
        "type": "github"
      },
      "original": {
@@ -0,0 +1,24 @@
 # Logs
 logs
 *.log
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
 pnpm-debug.log*
 lerna-debug.log*
 node_modules
 dist
 dist-ssr
 *.local
 # Editor directories and files
 .vscode/*
 !.vscode/extensions.json
 .idea
 .DS_Store
 *.suo
 *.ntvs*
 *.njsproj
 *.sln
 *.sw?
@@ -24,6 +24,7 @@
        fastapi
        fastapi-cli
        httpx
        huggingface-hub
        mypy
        orjson
        polars
@@ -12,6 +12,7 @@ dependencies = [
    "alembic",
    "apprise",
    "apscheduler",
    "huggingface-hub",
    "httpx",
    "python-multipart",
    "polars",
@@ -26,7 +27,11 @@ dependencies = [
 [project.scripts]
 database = "python.database_cli:app"
 van-inventory = "python.van_inventory.main:serve"
-whisper-transcribe = "python.tools.whisper.transcribe:main"
+prompt-bench = "python.prompt_bench.main:cli"
 prompt-bench-download = "python.prompt_bench.downloader:cli"
 finetune = "python.prompt_bench.finetune:cli"
 finetune-container = "python.prompt_bench.finetune_container:cli"
 build-finetune-dataset = "python.prompt_bench.build_finetune_dataset:cli"
 [dependency-groups]
 dev = [
@@ -51,7 +56,6 @@ lint.ignore = [
    "COM812", # (TEMP) conflicts when used with the formatter
    "ISC001", # (TEMP) conflicts when used with the formatter
    "S603",   # (PERM) This is known to cause a false positive
    "S607",   # (PERM) This is becoming a consistent annoyance
 ]
 [tool.ruff.lint.per-file-ignores]
@@ -80,10 +84,20 @@ lint.ignore = [
 "python/congress_tracker/**" = [
    "TC003", # (perm) this creates issues because sqlalchemy uses these at runtime
 ]
-
+"python/eval_warnings/**" = [
    "S607", # (perm) gh and git are expected on PATH in the runner environment
 ]
 "python/prompt_bench/**" = [
    "FBT002",  # (perm) typer requires boolean defaults for --flag/--no-flag options
    "PLR0913", # (perm) typer CLIs naturally have many parameters
    "S607",    # (perm) docker and nvidia-smi are expected on PATH
 ]
 "python/alembic/**" = [
    "INP001", # (perm) this creates LSP issues for alembic
 ]
 "python/signal_bot/**" = [
    "D107", # (perm) class docstrings cover __init__
 ]
 [tool.ruff.lint.pydocstyle]
 convention = "google"
@@ -0,0 +1,50 @@
 """adding FailedIngestion.
 Revision ID: 2f43120e3ffc
 Revises: f99be864fe69
 Create Date: 2026-03-24 23:46:17.277897
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 import sqlalchemy as sa
 from alembic import op
 from python.orm import DataScienceDevBase
 if TYPE_CHECKING:
    from collections.abc import Sequence
 # revision identifiers, used by Alembic.
 revision: str = "2f43120e3ffc"
 down_revision: str | None = "f99be864fe69"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 schema = DataScienceDevBase.schema_name
 def upgrade() -> None:
    """Create the ``failed_ingestion`` table inside ``schema``.

    Each row stores the raw input line and the error text recorded for it,
    together with an integer primary key and ``created``/``updated``
    timestamps that default to ``now()`` on the server.
    """
    table_name = "failed_ingestion"
    columns = (
        sa.Column("raw_line", sa.Text(), nullable=False),
        sa.Column("error", sa.Text(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
    )
    primary_key = sa.PrimaryKeyConstraint("id", name=op.f("pk_failed_ingestion"))
    op.create_table(table_name, *columns, primary_key, schema=schema)
 def downgrade() -> None:
    """Drop the ``failed_ingestion`` table from ``schema``, reversing ``upgrade``."""
    table_name = "failed_ingestion"
    op.drop_table(table_name, schema=schema)
@@ -0,0 +1,72 @@
 """Attach all partition tables to the posts parent table.
 Alembic autogenerate creates partition tables as standalone tables but does not
 emit the ALTER TABLE ... ATTACH PARTITION statements needed for PostgreSQL to
 route inserts to the correct partition.
 Revision ID: a1b2c3d4e5f6
 Revises: 605b1794838f
 Create Date: 2026-03-25 10:00:00.000000
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 from alembic import op
 from sqlalchemy import text
 from python.orm import DataScienceDevBase
 from python.orm.data_science_dev.posts.partitions import (
    PARTITION_END_YEAR,
    PARTITION_START_YEAR,
    iso_weeks_in_year,
    week_bounds,
 )
 if TYPE_CHECKING:
    from collections.abc import Sequence
 # revision identifiers, used by Alembic.
 revision: str = "a1b2c3d4e5f6"
 down_revision: str | None = "605b1794838f"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 schema = DataScienceDevBase.schema_name
 # Lists the child tables (as regclass text) currently attached to the parent
 # table named by the ``parent`` bind parameter.
 # The standard ``CAST(... AS regclass)`` form is used on the parameter because
 # ``text()`` does not reliably recognise a bind parameter that is immediately
 # followed by the PostgreSQL ``::`` cast shorthand.
 ALREADY_ATTACHED_QUERY = text("""
    SELECT inhrelid::regclass::text
    FROM pg_inherits
    WHERE inhparent = CAST(:parent AS regclass)
 """)
 def upgrade() -> None:
    """Attach every weekly ``posts_<year>_<week>`` table to the posts parent table.

    Partitions that ``ALREADY_ATTACHED_QUERY`` reports as children of
    ``{schema}.posts`` are skipped, so only the missing attachments are issued.
    """
    # NOTE(review): the skip check compares against regclass text output, which
    # presumably is schema-qualified only when the schema is not on the
    # search_path -- confirm for the migration connection.
    bind = op.get_bind()
    rows = bind.execute(ALREADY_ATTACHED_QUERY, {"parent": f"{schema}.posts"})
    already_attached = {row[0] for row in rows}
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    year_weeks = (
        (year, week)
        for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1)
        for week in range(1, iso_weeks_in_year(year) + 1)
    )
    for year, week in year_weeks:
        partition = f"{schema}.posts_{year}_{week:02d}"
        if partition in already_attached:
            continue
        lower, upper = week_bounds(year, week)
        lower_str = lower.strftime(timestamp_format)
        upper_str = upper.strftime(timestamp_format)
        op.execute(
            f"ALTER TABLE {schema}.posts "
            f"ATTACH PARTITION {partition} "
            f"FOR VALUES FROM ('{lower_str}') TO ('{upper_str}')"
        )
 def downgrade() -> None:
    """Detach every weekly ``posts_<year>_<week>`` table from the posts parent table.

    A DETACH statement is issued for each year/week in the configured range;
    no check is made for whether the partition is currently attached.
    """
    parent = f"{schema}.posts"
    for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1):
        last_week = iso_weeks_in_year(year)
        for week in range(1, last_week + 1):
            partition = f"{schema}.posts_{year}_{week:02d}"
            op.execute(f"ALTER TABLE {parent} DETACH PARTITION {partition}")
@@ -0,0 +1,153 @@
 """adding congress data.
 Revision ID: 83bfc8af92d8
 Revises: a1b2c3d4e5f6
 Create Date: 2026-03-27 10:43:02.324510
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 import sqlalchemy as sa
 from alembic import op
 from python.orm import DataScienceDevBase
 if TYPE_CHECKING:
    from collections.abc import Sequence
 # revision identifiers, used by Alembic.
 revision: str = "83bfc8af92d8"
 down_revision: str | None = "a1b2c3d4e5f6"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 schema = DataScienceDevBase.schema_name
 def upgrade() -> None:
    """Create the congress tables: bill, legislator, bill_text, vote and vote_record.

    Tables are created in dependency order so that every foreign key targets a
    table that already exists: ``bill`` and ``legislator`` first, then
    ``bill_text`` and ``vote`` (both reference ``bill``), and finally
    ``vote_record`` (references ``vote`` and ``legislator``).
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # bill: one row per (congress, bill_type, number), enforced by the unique constraint.
    op.create_table(
        "bill",
        sa.Column("congress", sa.Integer(), nullable=False),
        sa.Column("bill_type", sa.String(), nullable=False),
        sa.Column("number", sa.Integer(), nullable=False),
        sa.Column("title", sa.String(), nullable=True),
        sa.Column("title_short", sa.String(), nullable=True),
        sa.Column("official_title", sa.String(), nullable=True),
        sa.Column("status", sa.String(), nullable=True),
        sa.Column("status_at", sa.Date(), nullable=True),
        sa.Column("sponsor_bioguide_id", sa.String(), nullable=True),
        sa.Column("subjects_top_term", sa.String(), nullable=True),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_bill")),
        sa.UniqueConstraint("congress", "bill_type", "number", name="uq_bill_congress_type_number"),
        schema=schema,
    )
    op.create_index("ix_bill_congress", "bill", ["congress"], unique=False, schema=schema)
    # legislator: bioguide_id is required and made unique by the index created below.
    op.create_table(
        "legislator",
        sa.Column("bioguide_id", sa.Text(), nullable=False),
        sa.Column("thomas_id", sa.String(), nullable=True),
        sa.Column("lis_id", sa.String(), nullable=True),
        sa.Column("govtrack_id", sa.Integer(), nullable=True),
        sa.Column("opensecrets_id", sa.String(), nullable=True),
        sa.Column("fec_ids", sa.String(), nullable=True),
        sa.Column("first_name", sa.String(), nullable=False),
        sa.Column("last_name", sa.String(), nullable=False),
        sa.Column("official_full_name", sa.String(), nullable=True),
        sa.Column("nickname", sa.String(), nullable=True),
        sa.Column("birthday", sa.Date(), nullable=True),
        sa.Column("gender", sa.String(), nullable=True),
        sa.Column("current_party", sa.String(), nullable=True),
        sa.Column("current_state", sa.String(), nullable=True),
        sa.Column("current_district", sa.Integer(), nullable=True),
        sa.Column("current_chamber", sa.String(), nullable=True),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator")),
        schema=schema,
    )
    op.create_index(op.f("ix_legislator_bioguide_id"), "legislator", ["bioguide_id"], unique=True, schema=schema)
    # bill_text: unique per (bill_id, version_code); rows are deleted with their
    # bill (ON DELETE CASCADE).
    op.create_table(
        "bill_text",
        sa.Column("bill_id", sa.Integer(), nullable=False),
        sa.Column("version_code", sa.String(), nullable=False),
        sa.Column("version_name", sa.String(), nullable=True),
        sa.Column("text_content", sa.String(), nullable=True),
        sa.Column("date", sa.Date(), nullable=True),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.ForeignKeyConstraint(
            ["bill_id"], [f"{schema}.bill.id"], name=op.f("fk_bill_text_bill_id_bill"), ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_bill_text")),
        sa.UniqueConstraint("bill_id", "version_code", name="uq_bill_text_bill_id_version_code"),
        schema=schema,
    )
    # vote: unique per (congress, chamber, session, number); bill_id is nullable
    # and its foreign key declares no ON DELETE action.
    op.create_table(
        "vote",
        sa.Column("congress", sa.Integer(), nullable=False),
        sa.Column("chamber", sa.String(), nullable=False),
        sa.Column("session", sa.Integer(), nullable=False),
        sa.Column("number", sa.Integer(), nullable=False),
        sa.Column("vote_type", sa.String(), nullable=True),
        sa.Column("question", sa.String(), nullable=True),
        sa.Column("result", sa.String(), nullable=True),
        sa.Column("result_text", sa.String(), nullable=True),
        sa.Column("vote_date", sa.Date(), nullable=False),
        sa.Column("yea_count", sa.Integer(), nullable=True),
        sa.Column("nay_count", sa.Integer(), nullable=True),
        sa.Column("not_voting_count", sa.Integer(), nullable=True),
        sa.Column("present_count", sa.Integer(), nullable=True),
        sa.Column("bill_id", sa.Integer(), nullable=True),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.ForeignKeyConstraint(["bill_id"], [f"{schema}.bill.id"], name=op.f("fk_vote_bill_id_bill")),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_vote")),
        sa.UniqueConstraint("congress", "chamber", "session", "number", name="uq_vote_congress_chamber_session_number"),
        schema=schema,
    )
    op.create_index("ix_vote_congress_chamber", "vote", ["congress", "chamber"], unique=False, schema=schema)
    op.create_index("ix_vote_date", "vote", ["vote_date"], unique=False, schema=schema)
    # vote_record: composite primary key (vote_id, legislator_id); rows are
    # deleted with either referenced row (ON DELETE CASCADE on both keys).
    op.create_table(
        "vote_record",
        sa.Column("vote_id", sa.Integer(), nullable=False),
        sa.Column("legislator_id", sa.Integer(), nullable=False),
        sa.Column("position", sa.String(), nullable=False),
        sa.ForeignKeyConstraint(
            ["legislator_id"],
            [f"{schema}.legislator.id"],
            name=op.f("fk_vote_record_legislator_id_legislator"),
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["vote_id"], [f"{schema}.vote.id"], name=op.f("fk_vote_record_vote_id_vote"), ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("vote_id", "legislator_id", name=op.f("pk_vote_record")),
        schema=schema,
    )
    # ### end Alembic commands ###
 def downgrade() -> None:
    """Drop the congress tables and their indexes.

    Objects are removed in the reverse of the order ``upgrade`` creates them,
    so tables holding foreign keys are dropped before the tables they reference.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # vote_record references vote and legislator, so it goes first.
    op.drop_table("vote_record", schema=schema)
    op.drop_index("ix_vote_date", table_name="vote", schema=schema)
    op.drop_index("ix_vote_congress_chamber", table_name="vote", schema=schema)
    op.drop_table("vote", schema=schema)
    op.drop_table("bill_text", schema=schema)
    op.drop_index(op.f("ix_legislator_bioguide_id"), table_name="legislator", schema=schema)
    op.drop_table("legislator", schema=schema)
    # bill is referenced by vote and bill_text, both already dropped above.
    op.drop_index("ix_bill_congress", table_name="bill", schema=schema)
    op.drop_table("bill", schema=schema)
    # ### end Alembic commands ###
@@ -0,0 +1,58 @@
 """adding LegislatorSocialMedia.
 Revision ID: 5cd7eee3549d
 Revises: 83bfc8af92d8
 Create Date: 2026-03-29 11:53:44.224799
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 import sqlalchemy as sa
 from alembic import op
 from python.orm import DataScienceDevBase
 if TYPE_CHECKING:
    from collections.abc import Sequence
 # revision identifiers, used by Alembic.
 revision: str = "5cd7eee3549d"
 down_revision: str | None = "83bfc8af92d8"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 schema = DataScienceDevBase.schema_name
 def upgrade() -> None:
    """Upgrade.

    Creates the ``legislator_social_media`` table in ``schema``: one row per
    (legislator, platform) account with an optional URL and a required source.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "legislator_social_media",
        sa.Column("legislator_id", sa.Integer(), nullable=False),
        sa.Column("platform", sa.String(), nullable=False),
        sa.Column("account_name", sa.String(), nullable=False),
        # url is the only nullable data column.
        sa.Column("url", sa.String(), nullable=True),
        sa.Column("source", sa.String(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        # No ondelete rule is declared on this foreign key.
        sa.ForeignKeyConstraint(
            ["legislator_id"],
            [f"{schema}.legislator.id"],
            name=op.f("fk_legislator_social_media_legislator_id_legislator"),
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator_social_media")),
        schema=schema,
    )
    # ### end Alembic commands ###
 def downgrade() -> None:
    """Downgrade.

    Drops the ``legislator_social_media`` table from ``schema``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table("legislator_social_media", schema=schema)
    # ### end Alembic commands ###
@@ -1,93 +0,0 @@
 """adding audiobook library metadata.
 Revision ID: d7864d1ffc17
 Revises: c8a794340928
 Create Date: 2026-06-03 20:24:09.200837
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 import sqlalchemy as sa
 from alembic import op
 from python.orm import RichieBase
 if TYPE_CHECKING:
    from collections.abc import Sequence
 # revision identifiers, used by Alembic.
 revision: str = "d7864d1ffc17"
 down_revision: str | None = "c8a794340928"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 schema = RichieBase.schema_name
 def upgrade() -> None:
    """Upgrade.

    Creates the ``audiobook_author``, ``audiobook_series`` and ``audiobook``
    tables in ``schema``, in that order so each foreign key target already exists.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # Authors: unique by name.
    op.create_table(
        "audiobook_author",
        sa.Column("name", sa.String(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook_author")),
        sa.UniqueConstraint("name", name=op.f("uq_audiobook_author_name")),
        schema=schema,
    )
    # Series: unique per (author_id, name); rows are removed when their author is deleted.
    op.create_table(
        "audiobook_series",
        sa.Column("name", sa.String(), nullable=False),
        sa.Column("author_id", sa.Integer(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.ForeignKeyConstraint(
            ["author_id"],
            [f"{schema}.audiobook_author.id"],
            name=op.f("fk_audiobook_series_author_id_audiobook_author"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook_series")),
        sa.UniqueConstraint("author_id", "name", name=op.f("uq_audiobook_series_author_id")),
        schema=schema,
    )
    # Books: series_id is optional and is set to NULL when the series is deleted,
    # while deleting the author cascades to the book.
    op.create_table(
        "audiobook",
        sa.Column("title", sa.String(), nullable=False),
        sa.Column("author_id", sa.Integer(), nullable=False),
        sa.Column("series_id", sa.Integer(), nullable=True),
        sa.Column("series_index", sa.Integer(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.ForeignKeyConstraint(
            ["author_id"],
            [f"{schema}.audiobook_author.id"],
            name=op.f("fk_audiobook_author_id_audiobook_author"),
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["series_id"],
            [f"{schema}.audiobook_series.id"],
            name=op.f("fk_audiobook_series_id_audiobook_series"),
            ondelete="SET NULL",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook")),
        schema=schema,
    )
    # ### end Alembic commands ###
 def downgrade() -> None:
    """Downgrade.

    Drops the three audiobook tables from ``schema`` in reverse creation order.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # audiobook references both other tables, audiobook_series references audiobook_author.
    op.drop_table("audiobook", schema=schema)
    op.drop_table("audiobook_series", schema=schema)
    op.drop_table("audiobook_author", schema=schema)
    # ### end Alembic commands ###
@@ -1,63 +0,0 @@
 """updated series_index to float and added UniqueConstraint to audiobook and audiobook_author.
 Revision ID: b3c60cc5beb5
 Revises: d7864d1ffc17
 Create Date: 2026-06-10 20:02:43.073725
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 import sqlalchemy as sa
 from alembic import op
 from python.orm import RichieBase
 if TYPE_CHECKING:
    from collections.abc import Sequence
 # revision identifiers, used by Alembic.
 revision: str = "b3c60cc5beb5"
 down_revision: str | None = "d7864d1ffc17"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 schema = RichieBase.schema_name
 def upgrade() -> None:
    """Upgrade.

    Changes ``audiobook.series_index`` from INTEGER to Float and adds a unique
    constraint over (author_id, series_id, title).
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.alter_column(
        "audiobook",
        "series_index",
        existing_type=sa.INTEGER(),
        type_=sa.Float(),
        existing_nullable=False,
        schema=schema,
    )
    # postgresql_nulls_not_distinct=True is passed because series_id is nullable.
    # NOTE(review): this presumably relies on PostgreSQL 15+ NULLS NOT DISTINCT - confirm server version.
    op.create_unique_constraint(
        op.f("uq_audiobook_author_id"),
        "audiobook",
        ["author_id", "series_id", "title"],
        schema=schema,
        postgresql_nulls_not_distinct=True,
    )
    # ### end Alembic commands ###
 def downgrade() -> None:
    """Downgrade.

    Drops the ``uq_audiobook_author_id`` unique constraint and changes
    ``audiobook.series_index`` back from Float to INTEGER.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_constraint(op.f("uq_audiobook_author_id"), "audiobook", schema=schema, type_="unique")
    # NOTE(review): converting Float back to INTEGER loses any fractional series_index values.
    op.alter_column(
        "audiobook",
        "series_index",
        existing_type=sa.Float(),
        type_=sa.INTEGER(),
        existing_nullable=False,
        schema=schema,
    )
    # ### end Alembic commands ###
@@ -0,0 +1,100 @@
 """separating signal_bot database.
 Revision ID: 6eaf696e07a5
 Revises:
 Create Date: 2026-03-17 21:35:37.612672
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 import sqlalchemy as sa
 from alembic import op
 from sqlalchemy.dialects import postgresql
 from python.orm import SignalBotBase
 if TYPE_CHECKING:
    from collections.abc import Sequence
 # revision identifiers, used by Alembic.
 revision: str = "6eaf696e07a5"
 down_revision: str | None = None
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 schema = SignalBotBase.schema_name
 def upgrade() -> None:
    """Upgrade.

    Creates the ``dead_letter_message``, ``role``, ``signal_device`` and
    ``device_role`` tables in ``schema``. ``device_role`` is created last because
    it references both ``signal_device`` and ``role``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "dead_letter_message",
        sa.Column("source", sa.String(), nullable=False),
        sa.Column("message", sa.Text(), nullable=False),
        sa.Column("received_at", sa.DateTime(timezone=True), nullable=False),
        # status uses a PostgreSQL enum type named message_status, scoped to ``schema``.
        sa.Column(
            "status", postgresql.ENUM("UNPROCESSED", "PROCESSED", name="message_status", schema=schema), nullable=False
        ),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_dead_letter_message")),
        schema=schema,
    )
    op.create_table(
        "role",
        sa.Column("name", sa.String(length=50), nullable=False),
        sa.Column("id", sa.SmallInteger(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_role")),
        sa.UniqueConstraint("name", name=op.f("uq_role_name")),
        schema=schema,
    )
    op.create_table(
        "signal_device",
        sa.Column("phone_number", sa.String(length=50), nullable=False),
        sa.Column("safety_number", sa.String(), nullable=True),
        # trust_level uses a PostgreSQL enum type named trust_level, scoped to ``schema``.
        sa.Column(
            "trust_level",
            postgresql.ENUM("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", schema=schema),
            nullable=False,
        ),
        sa.Column("last_seen", sa.DateTime(timezone=True), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_signal_device")),
        sa.UniqueConstraint("phone_number", name=op.f("uq_signal_device_phone_number")),
        schema=schema,
    )
    # Join table between signal_device and role; each (device_id, role_id) pair is unique.
    op.create_table(
        "device_role",
        sa.Column("device_id", sa.Integer(), nullable=False),
        sa.Column("role_id", sa.SmallInteger(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.ForeignKeyConstraint(
            ["device_id"], [f"{schema}.signal_device.id"], name=op.f("fk_device_role_device_id_signal_device")
        ),
        sa.ForeignKeyConstraint(["role_id"], [f"{schema}.role.id"], name=op.f("fk_device_role_role_id_role")),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_device_role")),
        sa.UniqueConstraint("device_id", "role_id", name="uq_device_role_device_role"),
        schema=schema,
    )
    # ### end Alembic commands ###
 def downgrade() -> None:
    """Downgrade.

    Drops the four signal_bot tables and then the ``trust_level`` and
    ``message_status`` enum types that the upgrade created alongside them.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # device_role references signal_device and role, so it is dropped first.
    op.drop_table("device_role", schema=schema)
    op.drop_table("signal_device", schema=schema)
    op.drop_table("role", schema=schema)
    op.drop_table("dead_letter_message", schema=schema)
    # ### end Alembic commands ###
    # drop_table does not remove named PostgreSQL enum types. Left behind, they make
    # a later re-run of upgrade() fail because the types already exist.
    bind = op.get_bind()
    for enum_name in ("trust_level", "message_status"):
        postgresql.ENUM(name=enum_name, schema=schema).drop(bind, checkfirst=True)
@@ -0,0 +1,72 @@
 """switching signal_bot enum columns to non-native enums.
 Revision ID: 66bdd532bcab
 Revises: 6eaf696e07a5
 Create Date: 2026-03-18 19:21:14.561568
 """
 from __future__ import annotations
 from typing import TYPE_CHECKING
 import sqlalchemy as sa
 from alembic import op
 from sqlalchemy.dialects import postgresql
 from python.orm import SignalBotBase
 if TYPE_CHECKING:
    from collections.abc import Sequence
 # revision identifiers, used by Alembic.
 revision: str = "66bdd532bcab"
 down_revision: str | None = "6eaf696e07a5"
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 schema = SignalBotBase.schema_name
 def upgrade() -> None:
    """Upgrade.

    Changes ``dead_letter_message.status`` and ``signal_device.trust_level`` from
    PostgreSQL native enum types to ``sa.Enum(..., native_enum=False)`` columns.
    This function does not drop the native enum types themselves.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.alter_column(
        "dead_letter_message",
        "status",
        existing_type=postgresql.ENUM("UNPROCESSED", "PROCESSED", name="message_status", schema=schema),
        type_=sa.Enum("UNPROCESSED", "PROCESSED", name="message_status", native_enum=False),
        existing_nullable=False,
        schema=schema,
    )
    op.alter_column(
        "signal_device",
        "trust_level",
        existing_type=postgresql.ENUM("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", schema=schema),
        type_=sa.Enum("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", native_enum=False),
        existing_nullable=False,
        schema=schema,
    )
    # ### end Alembic commands ###
 def downgrade() -> None:
    """Downgrade.

    Converts ``signal_device.trust_level`` and ``dead_letter_message.status``
    back from non-native enum columns to the PostgreSQL native enum types.

    PostgreSQL has no implicit or assignment cast from a character type to an
    enum, so each ALTER COLUMN carries an explicit USING expression; without it
    the statement is rejected. The native types are expected to still exist,
    because upgrade() does not drop them.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.alter_column(
        "signal_device",
        "trust_level",
        existing_type=sa.Enum("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", native_enum=False),
        type_=postgresql.ENUM("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", schema=schema),
        existing_nullable=False,
        schema=schema,
        # Explicit cast of the stored text values to the schema-qualified enum type.
        postgresql_using=f"trust_level::{schema}.trust_level",
    )
    op.alter_column(
        "dead_letter_message",
        "status",
        existing_type=sa.Enum("UNPROCESSED", "PROCESSED", name="message_status", native_enum=False),
        type_=postgresql.ENUM("UNPROCESSED", "PROCESSED", name="message_status", schema=schema),
        existing_nullable=False,
        schema=schema,
        # Explicit cast of the stored text values to the schema-qualified enum type.
        postgresql_using=f"status::{schema}.message_status",
    )
    # ### end Alembic commands ###
@@ -0,0 +1,3 @@
 """Data science CLI tools."""
 from __future__ import annotations
@@ -0,0 +1,613 @@
 """Ingestion pipeline for loading congress data from unitedstates/congress JSON files.
 Loads legislators, bills, votes, vote records, and bill text into the data_science_dev database.
 Expects the parent directory to contain congress-tracker/ and congress-legislators/ as siblings.
 Usage:
    ingest-congress /path/to/parent/
    ingest-congress /path/to/parent/ --congress 118
    ingest-congress /path/to/parent/ --congress 118 --only bills
 """
 from __future__ import annotations
 import logging
 from pathlib import Path  # noqa: TC003 needed at runtime for typer CLI argument
 from typing import TYPE_CHECKING, Annotated
 import orjson
 import typer
 import yaml
 from sqlalchemy import select
 from sqlalchemy.orm import Session
 from python.common import configure_logger
 from python.orm.common import get_postgres_engine
 from python.orm.data_science_dev.congress import Bill, BillText, Legislator, LegislatorSocialMedia, Vote, VoteRecord
 if TYPE_CHECKING:
    from collections.abc import Iterator
    from sqlalchemy.engine import Engine
 logger = logging.getLogger(__name__)
 BATCH_SIZE = 10_000
 app = typer.Typer(help="Ingest unitedstates/congress data into data_science_dev.")
@app.command()
 def main(
    parent_dir: Annotated[
        Path,
        typer.Argument(help="Parent directory containing congress-tracker/ and congress-legislators/"),
    ],
    congress: Annotated[int | None, typer.Option(help="Only ingest a specific congress number")] = None,
    only: Annotated[
        str | None,
        typer.Option(help="Only run a specific step: legislators, social-media, bills, votes, bill-text"),
    ] = None,
 ) -> None:
    """Ingest congress data from unitedstates/congress JSON files."""
    configure_logger(level="INFO")
    data_dir = parent_dir / "congress-tracker/congress/data/"
    legislators_dir = parent_dir / "congress-legislators"
    if not data_dir.is_dir():
        typer.echo(f"Expected congress-tracker/ directory: {data_dir}", err=True)
        raise typer.Exit(code=1)
    if not legislators_dir.is_dir():
        typer.echo(f"Expected congress-legislators/ directory: {legislators_dir}", err=True)
        raise typer.Exit(code=1)
    engine = get_postgres_engine(name="DATA_SCIENCE_DEV")
    congress_dirs = _resolve_congress_dirs(data_dir, congress)
    if not congress_dirs:
        typer.echo("No congress directories found.", err=True)
        raise typer.Exit(code=1)
    logger.info("Found %d congress directories to process", len(congress_dirs))
    steps: dict[str, tuple] = {
        "legislators": (ingest_legislators, (engine, legislators_dir)),
        "legislators-social-media": (ingest_social_media, (engine, legislators_dir)),
        "bills": (ingest_bills, (engine, congress_dirs)),
        "votes": (ingest_votes, (engine, congress_dirs)),
        "bill-text": (ingest_bill_text, (engine, congress_dirs)),
    }
    if only:
        if only not in steps:
            typer.echo(f"Unknown step: {only}. Choose from: {', '.join(steps)}", err=True)
            raise typer.Exit(code=1)
        steps = {only: steps[only]}
    for step_name, (step_func, step_args) in steps.items():
        logger.info("=== Starting step: %s ===", step_name)
        step_func(*step_args)
        logger.info("=== Finished step: %s ===", step_name)
    logger.info("ingest-congress done")
 def _resolve_congress_dirs(data_dir: Path, congress: int | None) -> list[Path]:
    """Find congress number directories under data_dir."""
    if congress is not None:
        target = data_dir / str(congress)
        return [target] if target.is_dir() else []
    return sorted(path for path in data_dir.iterdir() if path.is_dir() and path.name.isdigit())
 def _flush_batch(session: Session, batch: list[object], label: str) -> int:
    """Add a batch of ORM objects to the session and commit. Returns count added."""
    if not batch:
        return 0
    session.add_all(batch)
    session.commit()
    count = len(batch)
    logger.info("Committed %d %s", count, label)
    batch.clear()
    return count
 # ---------------------------------------------------------------------------
 # Legislators — loaded from congress-legislators YAML files
 # ---------------------------------------------------------------------------
 def ingest_legislators(engine: Engine, legislators_dir: Path) -> None:
    """Upsert legislators from the congress-legislators YAML files.

    Entries without a bioguide id are skipped. A known legislator has every
    parsed field that is not None and differs from the stored value overwritten.
    An unknown one is inserted. Everything is committed once at the end.
    """
    legislators_data = _load_legislators_yaml(legislators_dir)
    logger.info("Loaded %d legislators from YAML files", len(legislators_data))
    total_inserted = 0
    total_updated = 0
    with Session(engine) as session:
        by_bioguide = {}
        for row in session.scalars(select(Legislator)).all():
            by_bioguide[row.bioguide_id] = row
        logger.info("Found %d existing legislators in DB", len(by_bioguide))
        for entry in legislators_data:
            bioguide_id = entry.get("id", {}).get("bioguide")
            if not bioguide_id:
                continue
            fields = _parse_legislator(entry)
            current = by_bioguide.get(bioguide_id)
            if not current:
                session.add(Legislator(bioguide_id=bioguide_id, **fields))
                total_inserted += 1
                continue
            # Collect the differing fields first, then apply them.
            stale = [
                (column, value)
                for column, value in fields.items()
                if value is not None and getattr(current, column) != value
            ]
            for column, value in stale:
                setattr(current, column, value)
            if stale:
                total_updated += 1
        session.commit()
    logger.info("Inserted %d new legislators, updated %d existing", total_inserted, total_updated)
 def _load_legislators_yaml(legislators_dir: Path) -> list[dict]:
    """Load and combine legislators-current.yaml and legislators-historical.yaml.

    A missing file is logged as a warning and skipped. A file whose top-level
    YAML value is not a list contributes nothing. Returns the concatenated
    entries, current legislators first.
    """
    legislators: list[dict] = []
    for filename in ("legislators-current.yaml", "legislators-historical.yaml"):
        path = legislators_dir / filename
        if not path.exists():
            logger.warning("Legislators file not found: %s", path)
            continue
        # Decode as UTF-8 explicitly rather than with the platform's locale
        # encoding, so non-ASCII names read the same on every machine.
        with path.open(encoding="utf-8") as file:
            data = yaml.safe_load(file)
        if isinstance(data, list):
            legislators.extend(data)
    return legislators
 def _parse_legislator(entry: dict) -> dict:
    """Extract Legislator fields from a congress-legislators YAML entry."""
    ids = entry.get("id", {})
    name = entry.get("name", {})
    bio = entry.get("bio", {})
    terms = entry.get("terms", [])
    latest_term = terms[-1] if terms else {}
    fec_ids = ids.get("fec")
    fec_ids_joined = ",".join(fec_ids) if isinstance(fec_ids, list) else fec_ids
    chamber = latest_term.get("type")
    chamber_normalized = {"rep": "House", "sen": "Senate"}.get(chamber, chamber)
    return {
        "thomas_id": ids.get("thomas"),
        "lis_id": ids.get("lis"),
        "govtrack_id": ids.get("govtrack"),
        "opensecrets_id": ids.get("opensecrets"),
        "fec_ids": fec_ids_joined,
        "first_name": name.get("first"),
        "last_name": name.get("last"),
        "official_full_name": name.get("official_full"),
        "nickname": name.get("nickname"),
        "birthday": bio.get("birthday"),
        "gender": bio.get("gender"),
        "current_party": latest_term.get("party"),
        "current_state": latest_term.get("state"),
        "current_district": latest_term.get("district"),
        "current_chamber": chamber_normalized,
    }
 # ---------------------------------------------------------------------------
 # Social Media — loaded from legislators-social-media.yaml
 # ---------------------------------------------------------------------------
 # Platform keys read from each entry's "social" mapping, paired with the URL
 # template used to build a profile link from the account name. A template of
 # None means no URL is generated for that platform (the url column is stored as NULL).
 # NOTE(review): templates assume the account name can be appended directly to the
 # site root - confirm this still holds for each platform.
 SOCIAL_MEDIA_PLATFORMS = {
    "twitter": "https://twitter.com/{account}",
    "facebook": "https://facebook.com/{account}",
    "youtube": "https://youtube.com/{account}",
    "instagram": "https://instagram.com/{account}",
    "mastodon": None,
 }
 def ingest_social_media(engine: Engine, legislators_dir: Path) -> None:
    """Upsert social media accounts from legislators-social-media.yaml.

    For every entry whose bioguide id maps to a known legislator, each platform
    listed in SOCIAL_MEDIA_PLATFORMS is considered. A missing (legislator,
    platform) pair is inserted. An existing pair has its account name and URL
    overwritten when they differ from the file, and only those rows count as
    "updated". Everything is committed once at the end.

    Logs a warning and returns early when the file is missing or its top-level
    YAML value is not a list.
    """
    social_media_path = legislators_dir / "legislators-social-media.yaml"
    if not social_media_path.exists():
        logger.warning("Social media file not found: %s", social_media_path)
        return
    # Decode as UTF-8 explicitly rather than with the platform's locale encoding.
    with social_media_path.open(encoding="utf-8") as file:
        social_media_data = yaml.safe_load(file)
    if not isinstance(social_media_data, list):
        logger.warning("Unexpected format in %s", social_media_path)
        return
    logger.info("Loaded %d entries from legislators-social-media.yaml", len(social_media_data))
    with Session(engine) as session:
        legislator_map = _build_legislator_map(session)
        # Keep the ORM rows, not just their keys, so changed accounts can be updated in place.
        existing_accounts = {
            (account.legislator_id, account.platform): account
            for account in session.scalars(select(LegislatorSocialMedia)).all()
        }
        logger.info("Found %d existing social media accounts in DB", len(existing_accounts))
        total_inserted = 0
        total_updated = 0
        for entry in social_media_data:
            # "or {}" also covers keys that are present but null in the YAML.
            bioguide_id = (entry.get("id") or {}).get("bioguide")
            if not bioguide_id:
                continue
            legislator_id = legislator_map.get(bioguide_id)
            if legislator_id is None:
                continue
            social = entry.get("social") or {}
            for platform, url_template in SOCIAL_MEDIA_PLATFORMS.items():
                raw_account_name = social.get(platform)
                if not raw_account_name:
                    continue
                # YAML may parse an all-digit handle as an int; store it as text.
                account_name = str(raw_account_name)
                url = url_template.format(account=account_name) if url_template else None
                key = (legislator_id, platform)
                existing = existing_accounts.get(key)
                if existing is None:
                    account = LegislatorSocialMedia(
                        legislator_id=legislator_id,
                        platform=platform,
                        account_name=account_name,
                        url=url,
                        source="https://github.com/unitedstates/congress-legislators",
                    )
                    session.add(account)
                    # Register the new row so a repeated entry in the file is not inserted twice.
                    existing_accounts[key] = account
                    total_inserted += 1
                elif existing.account_name != account_name or existing.url != url:
                    existing.account_name = account_name
                    existing.url = url
                    total_updated += 1
        session.commit()
    logger.info("Inserted %d new social media accounts, updated %d existing", total_inserted, total_updated)
 def _iter_voters(position_group: object) -> Iterator[dict]:
    """Yield voter dicts from a vote position group (handles list, single dict, or string)."""
    if isinstance(position_group, dict):
        yield position_group
    elif isinstance(position_group, list):
        for voter in position_group:
            if isinstance(voter, dict):
                yield voter
 # ---------------------------------------------------------------------------
 # Bills
 # ---------------------------------------------------------------------------
 def ingest_bills(engine: Engine, congress_dirs: list[Path]) -> None:
    """Insert bills found in each congress directory's ``bills`` tree.

    Every ``data.json`` under ``<congress_dir>/bills`` is read and parsed. Bills
    whose (congress, bill_type, number) key is already in the database are
    skipped. New rows are committed in batches of BATCH_SIZE, with a final
    flush for the remainder.
    """
    total_inserted = 0
    pending: list[Bill] = []
    with Session(engine) as session:
        known_keys = set()
        for row in session.scalars(select(Bill)).all():
            known_keys.add((row.congress, row.bill_type, row.number))
        logger.info("Found %d existing bills in DB", len(known_keys))
        for congress_dir in congress_dirs:
            bills_dir = congress_dir / "bills"
            if not bills_dir.is_dir():
                continue
            logger.info("Scanning bills from %s", congress_dir.name)
            for bill_file in bills_dir.rglob("data.json"):
                payload = _read_json(bill_file)
                if payload is None:
                    continue
                parsed = _parse_bill(payload, known_keys)
                if parsed is None:
                    continue
                pending.append(parsed)
                if len(pending) >= BATCH_SIZE:
                    total_inserted += _flush_batch(session, pending, "bills")
        # Commit whatever is left after the last full batch.
        total_inserted += _flush_batch(session, pending, "bills")
    logger.info("Inserted %d new bills total", total_inserted)
 def _parse_bill(data: dict, existing_bills: set[tuple[int, str, int]]) -> Bill | None:
    """Parse a bill data.json dict into a Bill ORM object, skipping existing."""
    raw_congress = data.get("congress")
    bill_type = data.get("bill_type")
    raw_number = data.get("number")
    if raw_congress is None or bill_type is None or raw_number is None:
        return None
    congress = int(raw_congress)
    number = int(raw_number)
    if (congress, bill_type, number) in existing_bills:
        return None
    sponsor_bioguide = None
    sponsor = data.get("sponsor")
    if sponsor:
        sponsor_bioguide = sponsor.get("bioguide_id")
    return Bill(
        congress=congress,
        bill_type=bill_type,
        number=number,
        title=data.get("short_title") or data.get("official_title"),
        title_short=data.get("short_title"),
        official_title=data.get("official_title"),
        status=data.get("status"),
        status_at=data.get("status_at"),
        sponsor_bioguide_id=sponsor_bioguide,
        subjects_top_term=data.get("subjects_top_term"),
    )
 # ---------------------------------------------------------------------------
 # Votes (and vote records)
 # ---------------------------------------------------------------------------
 def ingest_votes(engine: Engine, congress_dirs: list[Path]) -> None:
    """Insert votes, with their per-legislator records, from each congress directory.

    Every ``data.json`` under ``<congress_dir>/votes`` is read and parsed
    against lookup maps of legislators and bills. Votes whose (congress,
    chamber, session, number) key already exists are skipped. New rows are
    committed in batches of BATCH_SIZE, with a final flush for the remainder.
    """
    total_inserted = 0
    pending: list[Vote] = []
    with Session(engine) as session:
        legislator_map = _build_legislator_map(session)
        logger.info("Loaded %d legislators into lookup map", len(legislator_map))
        bill_map = _build_bill_map(session)
        logger.info("Loaded %d bills into lookup map", len(bill_map))
        seen_votes = set()
        for row in session.scalars(select(Vote)).all():
            seen_votes.add((row.congress, row.chamber, row.session, row.number))
        logger.info("Found %d existing votes in DB", len(seen_votes))
        for congress_dir in congress_dirs:
            votes_dir = congress_dir / "votes"
            if not votes_dir.is_dir():
                continue
            logger.info("Scanning votes from %s", congress_dir.name)
            for vote_file in votes_dir.rglob("data.json"):
                payload = _read_json(vote_file)
                if payload is None:
                    continue
                parsed = _parse_vote(payload, legislator_map, bill_map, seen_votes)
                if parsed is None:
                    continue
                pending.append(parsed)
                if len(pending) >= BATCH_SIZE:
                    total_inserted += _flush_batch(session, pending, "votes")
        # Commit whatever is left after the last full batch.
        total_inserted += _flush_batch(session, pending, "votes")
    logger.info("Inserted %d new votes total", total_inserted)
 def _build_legislator_map(session: Session) -> dict[str, int]:
    """Return a lookup of bioguide_id -> legislator.id for every stored legislator."""
    lookup: dict[str, int] = {}
    for legislator in session.scalars(select(Legislator)).all():
        lookup[legislator.bioguide_id] = legislator.id
    return lookup
 def _build_bill_map(session: Session) -> dict[tuple[int, str, int], int]:
    """Return a lookup of (congress, bill_type, number) -> bill.id for every stored bill."""
    lookup: dict[tuple[int, str, int], int] = {}
    for bill in session.scalars(select(Bill)).all():
        lookup[(bill.congress, bill.bill_type, bill.number)] = bill.id
    return lookup
 def _parse_vote(
    data: dict,
    legislator_map: dict[str, int],
    bill_map: dict[tuple[int, str, int], int],
    existing_votes: set[tuple[int, str, int, int]],
 ) -> Vote | None:
    """Parse a vote data.json dict into a Vote ORM object with records."""
    raw_congress = data.get("congress")
    chamber = data.get("chamber")
    raw_number = data.get("number")
    vote_date = data.get("date")
    if raw_congress is None or chamber is None or raw_number is None or vote_date is None:
        return None
    raw_session = data.get("session")
    if raw_session is None:
        return None
    congress = int(raw_congress)
    number = int(raw_number)
    session_number = int(raw_session)
    # Normalize chamber from "h"/"s" to "House"/"Senate"
    chamber_normalized = {"h": "House", "s": "Senate"}.get(chamber, chamber)
    if (congress, chamber_normalized, session_number, number) in existing_votes:
        return None
    # Resolve linked bill
    bill_id = None
    bill_ref = data.get("bill")
    if bill_ref:
        bill_key = (
            int(bill_ref.get("congress", congress)),
            bill_ref.get("type"),
            int(bill_ref.get("number", 0)),
        )
        bill_id = bill_map.get(bill_key)
    raw_votes = data.get("votes", {})
    vote_counts = _count_votes(raw_votes)
    vote_records = _build_vote_records(raw_votes, legislator_map)
    return Vote(
        congress=congress,
        chamber=chamber_normalized,
        session=session_number,
        number=number,
        vote_type=data.get("type"),
        question=data.get("question"),
        result=data.get("result"),
        result_text=data.get("result_text"),
        vote_date=vote_date[:10] if isinstance(vote_date, str) else vote_date,
        bill_id=bill_id,
        vote_records=vote_records,
        **vote_counts,
    )
 def _count_votes(raw_votes: dict) -> dict[str, int]:
    """Count voters per position category, correctly handling dict and list formats."""
    yea_count = 0
    nay_count = 0
    not_voting_count = 0
    present_count = 0
    for position, position_group in raw_votes.items():
        voter_count = sum(1 for _ in _iter_voters(position_group))
        if position in ("Yea", "Aye"):
            yea_count += voter_count
        elif position in ("Nay", "No"):
            nay_count += voter_count
        elif position == "Not Voting":
            not_voting_count += voter_count
        elif position == "Present":
            present_count += voter_count
    return {
        "yea_count": yea_count,
        "nay_count": nay_count,
        "not_voting_count": not_voting_count,
        "present_count": present_count,
    }
 def _build_vote_records(raw_votes: dict, legislator_map: dict[str, int]) -> list[VoteRecord]:
    """Turn raw per-position voter groups into VoteRecord objects.

    Args:
        raw_votes: Mapping of position label to the group of voters under it.
        legislator_map: Lookup from a voter's "id" value to a legislator ID.

    Returns:
        One VoteRecord per voter that has a non-empty "id" present in
        ``legislator_map``; all other voters are skipped.
    """
    records: list[VoteRecord] = []
    for position, position_group in raw_votes.items():
        for voter in _iter_voters(position_group):
            bioguide_id = voter.get("id")
            # Voters without an id, or with an id we do not know, are dropped.
            legislator_id = legislator_map.get(bioguide_id) if bioguide_id else None
            if legislator_id is not None:
                records.append(VoteRecord(legislator_id=legislator_id, position=position))
    return records
 # ---------------------------------------------------------------------------
 # Bill Text
 # ---------------------------------------------------------------------------
 def ingest_bill_text(engine: Engine, congress_dirs: list[Path]) -> None:
    """Load bill text from text-versions directories.

    Args:
        engine: Database engine used to open the session.
        congress_dirs: Congress directories to scan for bill text versions.

    Text versions whose (bill_id, version_code) pair is already in the
    database are skipped; new ones are accumulated and flushed in batches.
    """
    with Session(engine) as session:
        bill_map = _build_bill_map(session)
        logger.info("Loaded %d bills into lookup map", len(bill_map))
        # Snapshot of what is already stored, so reruns only insert new versions.
        existing_bill_texts = {
            (bill_text.bill_id, bill_text.version_code) for bill_text in session.scalars(select(BillText)).all()
        }
        logger.info("Found %d existing bill text versions in DB", len(existing_bill_texts))
        total_inserted = 0
        batch: list[BillText] = []
        for congress_dir in congress_dirs:
            logger.info("Scanning bill texts from %s", congress_dir.name)
            for bill_text in _iter_bill_texts(congress_dir, bill_map, existing_bill_texts):
                batch.append(bill_text)
                if len(batch) >= BATCH_SIZE:
                    # NOTE(review): presumably _flush_batch empties `batch` in place; otherwise
                    # this would re-flush on every later append — confirm against its definition.
                    total_inserted += _flush_batch(session, batch, "bill texts")
        # Flush whatever is left over after the last congress directory.
        total_inserted += _flush_batch(session, batch, "bill texts")
    logger.info("Inserted %d new bill text versions total", total_inserted)
 def _iter_bill_texts(
    congress_dir: Path,
    bill_map: dict[tuple[int, str, int], int],
    existing_bill_texts: set[tuple[int, str]],
 ) -> Iterator[BillText]:
    """Yield new BillText objects found under one congress directory.

    Args:
        congress_dir: Directory for a single congress; its ``bills``
            subdirectory is searched recursively for ``text-versions`` folders.
        bill_map: Lookup from (congress, bill_type, number) to bill ID.
        existing_bill_texts: (bill_id, version_code) pairs that must be skipped.

    Yields:
        A BillText for every version directory that belongs to a known bill
        and is not already listed in ``existing_bill_texts``.
    """
    bills_dir = congress_dir / "bills"
    if not bills_dir.is_dir():
        return
    for text_versions_dir in bills_dir.rglob("text-versions"):
        if not text_versions_dir.is_dir():
            continue
        bill_key = _bill_key_from_dir(text_versions_dir.parent, congress_dir)
        # Unparseable directory names and bills missing from the map are both skipped.
        bill_id = bill_map.get(bill_key) if bill_key is not None else None
        if bill_id is None:
            continue
        for version_dir in sorted(text_versions_dir.iterdir()):
            version_code = version_dir.name
            if not version_dir.is_dir() or (bill_id, version_code) in existing_bill_texts:
                continue
            text_content = _read_bill_text(version_dir)
            # Missing or empty metadata simply leaves name/date unset.
            metadata = _read_json(version_dir / "data.json") or {}
            yield BillText(
                bill_id=bill_id,
                version_code=version_code,
                version_name=metadata.get("version_name"),
                date=metadata.get("issued_on"),
                text_content=text_content,
            )
 def _bill_key_from_dir(bill_dir: Path, congress_dir: Path) -> tuple[int, str, int] | None:
    """Extract (congress, bill_type, number) from directory structure."""
    congress = int(congress_dir.name)
    bill_type = bill_dir.parent.name
    name = bill_dir.name
    # Directory name is like "hr3590" — strip the type prefix to get the number
    number_str = name[len(bill_type) :]
    if not number_str.isdigit():
        return None
    return (congress, bill_type, int(number_str))
 def _read_bill_text(version_dir: Path) -> str | None:
    """Read bill text from a version directory, preferring .txt over .xml."""
    for extension in ("txt", "htm", "html", "xml"):
        candidates = list(version_dir.glob(f"document.{extension}"))
        if not candidates:
            candidates = list(version_dir.glob(f"*.{extension}"))
        if candidates:
            try:
                return candidates[0].read_text(encoding="utf-8")
            except Exception:
                logger.exception("Failed to read %s", candidates[0])
    return None
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _read_json(path: Path) -> dict | None:
    """Load a JSON file, returning None when it is missing or unreadable.

    A missing file is silent; any other failure is logged with its traceback.
    """
    parsed: dict | None = None
    try:
        parsed = orjson.loads(path.read_bytes())
    except FileNotFoundError:
        pass
    except Exception:
        logger.exception("Failed to parse %s", path)
    return parsed
 # Run the CLI application when this module is executed as a script.
 if __name__ == "__main__":
    app()
@@ -0,0 +1,247 @@
 """Ingestion pipeline for loading JSONL post files into the weekly-partitioned posts table.
 Usage:
    ingest-posts /path/to/files/
    ingest-posts /path/to/single_file.jsonl
    ingest-posts /data/dir/ --workers 4 --batch-size 5000
 """
 from __future__ import annotations
 import logging
 from datetime import UTC, datetime
 from pathlib import Path  # noqa: TC003 this is needed for typer
 from typing import TYPE_CHECKING, Annotated
 import orjson
 import psycopg
 import typer
 from python.common import configure_logger
 from python.orm.common import get_connection_info
 from python.parallelize import parallelize_process
 if TYPE_CHECKING:
    from collections.abc import Iterator
 logger = logging.getLogger(__name__)
 app = typer.Typer(help="Ingest JSONL post files into the partitioned posts table.")
@app.command()
 def main(
    path: Annotated[Path, typer.Argument(help="Directory containing JSONL files, or a single JSONL file")],
    batch_size: Annotated[int, typer.Option(help="Rows per INSERT batch")] = 10000,
    workers: Annotated[int, typer.Option(help="Parallel workers for multi-file ingestion")] = 4,
    pattern: Annotated[str, typer.Option(help="Glob pattern for JSONL files")] = "*.jsonl",
 ) -> None:
    """Ingest JSONL post files into the weekly-partitioned posts table."""
    configure_logger(level="INFO")
    logger.info("starting ingest-posts")
    logger.info("path=%s batch_size=%d workers=%d pattern=%s", path, batch_size, workers, pattern)
    # A single file is ingested in-process; a directory fans out to parallel workers.
    if path.is_file():
        ingest_file(path, batch_size=batch_size)
    elif path.is_dir():
        ingest_directory(path, batch_size=batch_size, max_workers=workers, pattern=pattern)
    else:
        # Neither a file nor a directory: report on stderr and exit with status 1.
        typer.echo(f"Path does not exist: {path}", err=True)
        raise typer.Exit(code=1)
    logger.info("ingest-posts done")
 def ingest_directory(
    directory: Path,
    *,
    batch_size: int,
    max_workers: int,
    pattern: str = "*.jsonl",
 ) -> None:
    """Ingest every file in ``directory`` matching ``pattern`` via parallel workers.

    Args:
        directory: Directory to search (non-recursively) for input files.
        batch_size: Batch size forwarded to ``ingest_file`` for each file.
        max_workers: Worker count handed to ``parallelize_process``.
        pattern: Glob pattern selecting the files to ingest.
    """
    jsonl_paths = sorted(directory.glob(pattern))
    if jsonl_paths:
        logger.info("Found %d JSONL files to ingest", len(jsonl_paths))
        # One keyword-argument dict per file, consumed by ingest_file.
        work_items = [{"path": jsonl_path, "batch_size": batch_size} for jsonl_path in jsonl_paths]
        parallelize_process(ingest_file, work_items, max_workers=max_workers)
    else:
        logger.warning("No JSONL files found in %s", directory)
 # Database schema that holds the posts and failed_ingestion tables.
 SCHEMA = "main"
 # Columns copied into the staging table and inserted into posts, in this order.
 # Each name is also the key looked up in every row dict during COPY.
 COLUMNS = (
    "post_id",
    "user_id",
    "instance",
    "date",
    "text",
    "langs",
    "like_count",
    "reply_count",
    "repost_count",
    "reply_to",
    "replied_author",
    "thread_root",
    "thread_root_author",
    "repost_from",
    "reposted_author",
    "quotes",
    "quoted_author",
    "labels",
    "sent_label",
    "sent_score",
 )
 # Moves staged rows into posts; rows whose (post_id, date) already exist are ignored.
 INSERT_FROM_STAGING = f"""
    INSERT INTO {SCHEMA}.posts ({", ".join(COLUMNS)})
    SELECT {", ".join(COLUMNS)} FROM pg_temp.staging
    ON CONFLICT (post_id, date) DO NOTHING
 """  # noqa: S608
 # Records an input line that could not be ingested, together with the error text.
 FAILED_INSERT = f"""
    INSERT INTO {SCHEMA}.failed_ingestion (raw_line, error)
    VALUES (%(raw_line)s, %(error)s)
 """  # noqa: S608
 def get_psycopg_connection() -> psycopg.Connection:
    """Open a raw psycopg3 connection using the DATA_SCIENCE_DEV connection info.

    Autocommit is disabled, so callers control transaction boundaries.
    """
    database, host, port, username, password = get_connection_info("DATA_SCIENCE_DEV")
    connect_kwargs = {
        "dbname": database,
        "host": host,
        "port": int(port),
        "user": username,
        "password": password,
        "autocommit": False,
    }
    return psycopg.connect(**connect_kwargs)
 def ingest_file(path: Path, *, batch_size: int) -> None:
    """Ingest a single JSONL file into the posts table.

    Args:
        path: JSONL file to read.
        batch_size: Target number of rows per batch handed to ``ingest_batch``.

    Raises:
        Exception: Any failure is logged with the file path and re-raised.
    """
    # Emit a progress line roughly every 100k rows, but at least once per batch.
    log_trigger = max(100_000 // batch_size, 1)
    failed_lines: list[dict] = []
    # Batches may be larger than batch_size (a line can expand into several rows)
    # or smaller (the final batch), so count rows instead of multiplying.
    rows_submitted = 0
    try:
        with get_psycopg_connection() as connection:
            for index, batch in enumerate(read_jsonl_batches(path, batch_size, failed_lines), 1):
                ingest_batch(connection, batch)
                rows_submitted += len(batch)
                if index % log_trigger == 0:
                    logger.info("Ingested %d batches (%d rows) from %s", index, rows_submitted, path)
            # Lines that failed to parse are collected while reading and recorded once at the end.
            if failed_lines:
                logger.warning("Recording %d malformed lines from %s", len(failed_lines), path.name)
                with connection.cursor() as cursor:
                    cursor.executemany(FAILED_INSERT, failed_lines)
                connection.commit()
    except Exception:
        logger.exception("Failed to ingest file: %s", path)
        raise
 def ingest_batch(connection: psycopg.Connection, batch: list[dict]) -> None:
    """COPY batch into a temp staging table, then INSERT ... ON CONFLICT into posts.

    On any failure the transaction is rolled back and the batch is split in
    half and retried recursively, until the offending row is isolated. A
    single failing row is logged and written to the failed-ingestion table
    instead of aborting the load.

    Args:
        connection: Open psycopg connection; this function commits and rolls back on it.
        batch: Row dicts keyed by the names in ``COLUMNS``.
    """
    if not batch:
        return
    try:
        with connection.cursor() as cursor:
            cursor.execute(f"""
                CREATE TEMP TABLE IF NOT EXISTS staging
                (LIKE {SCHEMA}.posts INCLUDING DEFAULTS)
                ON COMMIT DELETE ROWS
            """)
            # Start from an empty staging table for every batch.
            cursor.execute("TRUNCATE pg_temp.staging")
            with cursor.copy(f"COPY pg_temp.staging ({', '.join(COLUMNS)}) FROM STDIN") as copy:
                for row in batch:
                    # Keys missing from a row become None for that column.
                    copy.write_row(tuple(row.get(column) for column in COLUMNS))
            cursor.execute(INSERT_FROM_STAGING)
        connection.commit()
    except Exception as error:
        # Undo the partial work before retrying or recording the failure.
        connection.rollback()
        if len(batch) == 1:
            # The bad row has been isolated: record it and move on.
            logger.exception("Skipping bad row post_id=%s", batch[0].get("post_id"))
            with connection.cursor() as cursor:
                cursor.execute(
                    FAILED_INSERT,
                    {
                        "raw_line": orjson.dumps(batch[0], default=str).decode(),
                        "error": str(error),
                    },
                )
            connection.commit()
            return
        # Bisect: retry each half independently so good rows still get inserted.
        midpoint = len(batch) // 2
        ingest_batch(connection, batch[:midpoint])
        ingest_batch(connection, batch[midpoint:])
 def read_jsonl_batches(file_path: Path, batch_size: int, failed_lines: list[dict]) -> Iterator[list[dict]]:
    """Stream a JSONL file, yielding lists of transformed rows.

    Args:
        file_path: JSONL file to read as UTF-8 text.
        batch_size: A batch is yielded once it holds at least this many rows.
        failed_lines: Collector passed to ``parse_line`` for unparseable input.

    Yields:
        Lists of row dicts; the final list may be shorter than ``batch_size``.
    """
    pending: list[dict] = []
    with file_path.open("r", encoding="utf-8") as handle:
        for stripped in (raw_line.strip() for raw_line in handle):
            # Blank lines carry no data.
            if stripped:
                pending.extend(parse_line(stripped, file_path, failed_lines))
                if len(pending) >= batch_size:
                    yield pending
                    pending = []
    if pending:
        yield pending
 def parse_line(line: str, file_path: Path, failed_lines: list[dict]) -> Iterator[dict]:
    """Parse a JSONL line, handling concatenated JSON objects.

    Args:
        line: One stripped, non-empty line of the input file.
        file_path: Source file, used only in log messages.
        failed_lines: Collector that receives a ``{"raw_line", "error"}`` dict
            for every line or fragment that cannot be turned into a row.

    Yields:
        Transformed row dicts: one for a well-formed line, or one per
        recoverable fragment when several objects were written back-to-back.
    """
    try:
        yield transform_row(orjson.loads(line))
    except orjson.JSONDecodeError:
        if "}{" not in line:
            logger.warning("Skipping malformed line in %s: %s", file_path.name, line[:120])
            failed_lines.append({"raw_line": line, "error": "malformed JSON"})
            return
        # Objects written back-to-back ("}{") are split apart and parsed one by one.
        fragments = line.replace("}{", "}\n{").split("\n")
        for fragment in fragments:
            try:
                yield transform_row(orjson.loads(fragment))
            except Exception as error:  # noqa: BLE001
                # Catch everything here, like the whole-line path below: a
                # TypeError/AttributeError raised while transforming one
                # fragment must not abort ingestion of the rest of the file.
                logger.warning("Skipping malformed fragment in %s: %s", file_path.name, fragment[:120])
                failed_lines.append({"raw_line": fragment, "error": str(error)})
    except Exception as error:
        logger.exception("Skipping bad row in %s: %s", file_path.name, line[:120])
        failed_lines.append({"raw_line": line, "error": str(error)})
 def transform_row(raw: dict) -> dict:
    """Normalise a raw JSONL row in place so it matches the posts table columns.

    The compact ``date`` value is parsed, ``langs`` (when present) is
    serialised with orjson, and NUL characters are removed from ``text``.

    Returns:
        The same dict object that was passed in, after modification.
    """
    raw["date"] = parse_date(raw["date"])
    langs = raw.get("langs")
    if langs is not None:
        raw["langs"] = orjson.dumps(langs)
    text = raw.get("text")
    if text is not None:
        raw["text"] = text.replace("\x00", "")
    return raw
 def parse_date(raw_date: int) -> datetime:
    """Parse compact YYYYMMDDHHmm integer into a naive datetime (input is UTC by spec)."""
    return datetime(
        raw_date // 100000000,
        (raw_date // 1000000) % 100,
        (raw_date // 10000) % 100,
        (raw_date // 100) % 100,
        raw_date % 100,
        tzinfo=UTC,
    )
 # Run the ingest-posts CLI when this module is executed as a script.
 if __name__ == "__main__":
    app()
@@ -83,6 +83,20 @@ DATABASES: dict[str, DatabaseConfig] = {
        base_class_name="VanInventoryBase",
        models_module="python.orm.van_inventory.models",
    ),
    "signal_bot": DatabaseConfig(
        env_prefix="SIGNALBOT",
        version_location="python/alembic/signal_bot/versions",
        base_module="python.orm.signal_bot.base",
        base_class_name="SignalBotBase",
        models_module="python.orm.signal_bot.models",
    ),
    "data_science_dev": DatabaseConfig(
        env_prefix="DATA_SCIENCE_DEV",
        version_location="python/alembic/data_science_dev/versions",
        base_module="python.orm.data_science_dev.base",
        base_class_name="DataScienceDevBase",
        models_module="python.orm.data_science_dev.models",
    ),
 }
@@ -1,347 +0,0 @@
 """Small Gitea API client for repository automation."""
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Self
 from urllib.parse import quote
 import httpx
 # Page size requested from paginated list endpoints; a shorter page ends paging.
 DEFAULT_PAGE_SIZE = 100
 # HTTP status codes accepted as success by the client's request helper.
 EXPECTED_NO_CONTENT = 204
 EXPECTED_CREATED = 201
 EXPECTED_OK = 200
@dataclass(frozen=True)
 class CreatedIssue:
    """Issue data returned by Gitea.

    Immutable record built by ``GiteaClient.create_issue`` from the JSON
    response of the issue-creation request.
    """
    # Issue number from the response, or None when the payload has no "number".
    number: int | None
    # Web URL of the issue, or None when the payload has no "html_url".
    html_url: str | None
    # Title from the response; falls back to the requested title when absent.
    title: str
@dataclass(frozen=True)
 class PullRequest:
    """Pull request data returned by Gitea.

    Immutable record built by ``_pull_request_from_api``.
    """
    # Pull request number (taken from "number", falling back to "index").
    number: int
    title: str
    html_url: str | None
    # Names of the labels attached to the pull request; empty names are dropped.
    labels: tuple[str, ...]
    # Source branch ("head.ref", falling back to "head_branch").
    head_branch: str | None
    # Target branch ("base.ref", falling back to "base_branch").
    base_branch: str | None
@dataclass(frozen=True)
 class WorkflowJob:
    """Workflow job data returned by Gitea Actions.

    Immutable record built by ``_workflow_job_from_api``.
    """
    # Job ID; a payload without an "id" is rejected when the record is built.
    id: int
    name: str
    # ID of the workflow run this job belongs to, when the payload provides it.
    run_id: int | None
    # Raw "status" and "conclusion" strings from the payload, when present.
    status: str | None
    conclusion: str | None
 class GiteaError(RuntimeError):
    """Raised when Gitea rejects an API request.

    Also raised by the payload converters in this module when a response is
    missing a required field, and when requested labels do not exist.
    """
 def split_repo_name(repo: str) -> tuple[str, str]:
    """Break an ``owner/repo`` string into ``(owner, repo_name)``.

    The split happens at the first ``/``; everything after it is the
    repository name.

    Raises:
        ValueError: If there is no ``/`` or either side of it is empty.
    """
    owner, separator, repo_name = repo.partition("/")
    if separator and owner and repo_name:
        return owner, repo_name
    msg = f"Invalid repository name: {repo}"
    raise ValueError(msg)
 class GiteaClient:
    """HTTP client for the subset of Gitea APIs used in this repository.

    Wraps an ``httpx.Client`` configured with the server base URL and a token
    ``Authorization`` header. Usable as a context manager, which closes the
    underlying HTTP client on exit. Every request goes through ``_request``,
    which raises ``GiteaError`` on an unexpected status code.
    """
    def __init__(
        self,
        *,
        base_url: str,
        token: str,
        timeout: int = 30,
        transport: httpx.BaseTransport | None = None,
    ) -> None:
        """Initialize the Gitea client.

        Args:
            base_url: Root URL of the Gitea server; trailing slashes are removed.
            token: API token sent as ``Authorization: token <token>``.
            timeout: Timeout passed to the underlying ``httpx.Client``.
            transport: Optional httpx transport passed through to the client.
        """
        self._client = httpx.Client(
            base_url=base_url.rstrip("/"),
            timeout=timeout,
            headers={"Authorization": f"token {token}"},
            transport=transport,
        )
    def create_issue(
        self,
        *,
        owner: str,
        repo: str,
        title: str,
        body: str,
        labels: list[int] | None = None,
    ) -> CreatedIssue:
        """Create a Gitea issue.

        Args:
            owner: Repository owner.
            repo: Repository name.
            title: Issue title.
            body: Issue body text.
            labels: Label IDs to attach; an empty list is sent when omitted.

        Returns:
            The created issue as reported by the response.

        Raises:
            GiteaError: If the response status is not 201.
        """
        payload: dict[str, object] = {"title": title, "body": body, "labels": labels or []}
        response = self._request(
            "POST",
            f"/api/v1/repos/{owner}/{repo}/issues",
            expected_statuses={EXPECTED_CREATED},
            json=payload,
        )
        data = response.json()
        return CreatedIssue(
            number=_optional_int(data.get("number")),
            html_url=_optional_str(data.get("html_url")),
            title=str(data.get("title", title)),
        )
    def resolve_label_ids(self, *, owner: str, repo: str, labels: list[str]) -> list[int]:
        """Resolve label names to Gitea label IDs.

        Returns:
            Label IDs in the same order as ``labels``; an empty list when
            ``labels`` is empty (no request is made in that case).

        Raises:
            GiteaError: If any requested label does not exist in the repository.
        """
        if not labels:
            return []
        available_labels: dict[str, int] = {}
        page = 1
        # Page through all repository labels until an empty or short page is returned.
        while True:
            response = self._request(
                "GET",
                f"/api/v1/repos/{owner}/{repo}/labels",
                params={"page": page, "limit": DEFAULT_PAGE_SIZE},
            )
            batch = response.json()
            if not batch:
                break
            for label in batch:
                label_name = str(label.get("name", ""))
                label_id = _optional_int(label.get("id"))
                if label_name and label_id is not None:
                    available_labels[label_name] = label_id
            if len(batch) < DEFAULT_PAGE_SIZE:
                break
            page += 1
        missing = [label for label in labels if label not in available_labels]
        if missing:
            missing_names = ", ".join(sorted(missing))
            msg = f"Missing Gitea labels: {missing_names}"
            raise GiteaError(msg)
        return [available_labels[label] for label in labels]
    def list_open_pull_requests(
        self,
        *,
        owner: str,
        repo: str,
        labels: list[str] | None = None,
        head: str | None = None,
    ) -> list[PullRequest]:
        """List open pull requests for a repository.

        Filtering happens client-side after each page is fetched.

        Args:
            owner: Repository owner.
            repo: Repository name.
            labels: When given, only pull requests carrying all of these label names are kept.
            head: When given, only pull requests whose head branch equals this value are kept.
        """
        expected_labels = set(labels or [])
        pull_requests: list[PullRequest] = []
        page = 1
        while True:
            response = self._request(
                "GET",
                f"/api/v1/repos/{owner}/{repo}/pulls",
                params={"state": "open", "page": page, "limit": DEFAULT_PAGE_SIZE},
            )
            batch = response.json()
            if not batch:
                break
            for item in batch:
                pull_request = _pull_request_from_api(item)
                if head and pull_request.head_branch != head:
                    continue
                if expected_labels and not expected_labels.issubset(set(pull_request.labels)):
                    continue
                pull_requests.append(pull_request)
            # A short page means there is nothing further to fetch.
            if len(batch) < DEFAULT_PAGE_SIZE:
                break
            page += 1
        return pull_requests
    def create_pull_request(
        self,
        *,
        owner: str,
        repo: str,
        title: str,
        body: str,
        head: str,
        base: str,
        labels: list[str] | None = None,
    ) -> PullRequest:
        """Create a pull request.

        Label names are resolved to IDs first, so a missing label raises
        ``GiteaError`` before the pull request is created.

        Raises:
            GiteaError: If a label is missing or the response status is not 201.
        """
        payload: dict[str, object] = {
            "title": title,
            "body": body,
            "head": head,
            "base": base,
        }
        if labels:
            payload["labels"] = self.resolve_label_ids(owner=owner, repo=repo, labels=labels)
        response = self._request(
            "POST",
            f"/api/v1/repos/{owner}/{repo}/pulls",
            expected_statuses={EXPECTED_CREATED},
            json=payload,
        )
        return _pull_request_from_api(response.json())
    def merge_pull_request(
        self,
        *,
        owner: str,
        repo: str,
        number: int,
        merge_method: str = "rebase",
        head_commit_id: str | None = None,
        delete_branch_after_merge: bool = False,
    ) -> None:
        """Merge a pull request.

        Args:
            owner: Repository owner.
            repo: Repository name.
            number: Pull request number.
            merge_method: Value sent as the ``Do`` field of the merge request.
            head_commit_id: When given, sent along as ``head_commit_id``.
            delete_branch_after_merge: Sent as ``delete_branch_after_merge``.

        Raises:
            GiteaError: If the response status is not 200.
        """
        payload: dict[str, object] = {
            "Do": merge_method,
            "delete_branch_after_merge": delete_branch_after_merge,
        }
        if head_commit_id:
            payload["head_commit_id"] = head_commit_id
        self._request(
            "POST",
            f"/api/v1/repos/{owner}/{repo}/pulls/{number}/merge",
            json=payload,
        )
    def dispatch_workflow(self, *, owner: str, repo: str, workflow_id: str, ref: str) -> None:
        """Trigger a workflow_dispatch run.

        Raises:
            GiteaError: If the response status is neither 200 nor 204.
        """
        # Percent-encode the workflow identifier fully so it is safe as one path segment.
        workflow_path = quote(workflow_id, safe="")
        self._request(
            "POST",
            f"/api/v1/repos/{owner}/{repo}/actions/workflows/{workflow_path}/dispatches",
            expected_statuses={EXPECTED_OK, EXPECTED_NO_CONTENT},
            json={"ref": ref},
        )
    def list_run_jobs(self, *, owner: str, repo: str, run_id: str | int) -> list[WorkflowJob]:
        """List workflow jobs for a specific run.

        Pages through the repository's jobs endpoint and keeps only the jobs
        whose ``run_id`` matches (compared as strings).
        """
        jobs: list[WorkflowJob] = []
        page = 1
        while True:
            response = self._request(
                "GET",
                f"/api/v1/repos/{owner}/{repo}/actions/jobs",
                params={"page": page, "limit": DEFAULT_PAGE_SIZE},
            )
            payload = response.json()
            batch = payload.get("jobs", [])
            if not batch:
                break
            for item in batch:
                if str(item.get("run_id")) != str(run_id):
                    continue
                jobs.append(_workflow_job_from_api(item))
            if len(batch) < DEFAULT_PAGE_SIZE:
                break
            page += 1
        return jobs
    def download_job_logs(self, *, owner: str, repo: str, job_id: int) -> str:
        """Download logs for a workflow job.

        Returns:
            The response body as text.
        """
        response = self._request(
            "GET",
            f"/api/v1/repos/{owner}/{repo}/actions/jobs/{job_id}/logs",
        )
        return response.text
    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()
    def __enter__(self) -> Self:
        """Enter the context manager."""
        return self
    def __exit__(self, *args: object) -> None:
        """Close the HTTP client."""
        self.close()
    def _request(
        self,
        method: str,
        path: str,
        *,
        expected_statuses: set[int] | None = None,
        **kwargs: object,
    ) -> httpx.Response:
        """Send an HTTP request and validate the response status.

        Args:
            method: HTTP method name.
            path: Request path, relative to the client's base URL.
            expected_statuses: Status codes treated as success; defaults to 200 only.
            **kwargs: Passed through to ``httpx.Client.request``.

        Raises:
            GiteaError: If the status code is not one of the expected ones; the
                message includes the status code and the response body.
        """
        response = self._client.request(method, path, **kwargs)
        statuses = expected_statuses or {EXPECTED_OK}
        if response.status_code not in statuses:
            msg = f"Gitea request failed ({response.status_code}): {response.text}"
            raise GiteaError(msg)
        return response
 def _pull_request_from_api(data: dict[str, object]) -> PullRequest:
    """Convert Gitea API pull-request data into a dataclass.

    Args:
        data: One pull-request object decoded from a Gitea JSON response.

    Returns:
        A PullRequest; the number comes from "number" (falling back to
        "index") and the branches from "head.ref"/"base.ref" (falling back to
        "head_branch"/"base_branch").

    Raises:
        GiteaError: If the payload has neither a "number" nor an "index".
    """
    number = _optional_int(data.get("number")) or _optional_int(data.get("index"))
    if number is None:
        msg = "Gitea pull request payload is missing a number"
        raise GiteaError(msg)
    # `.get(key, default)` does not help when the key is present with a JSON
    # null, so normalise None to an empty container explicitly.
    raw_labels = data.get("labels") or []
    head = data.get("head") or {}
    base = data.get("base") or {}
    labels = tuple(str(label.get("name", "")) for label in raw_labels)
    return PullRequest(
        number=number,
        title=str(data.get("title", "")),
        html_url=_optional_str(data.get("html_url")),
        # Labels without a name are dropped.
        labels=tuple(label for label in labels if label),
        head_branch=_optional_str(head.get("ref")) or _optional_str(data.get("head_branch")),
        base_branch=_optional_str(base.get("ref")) or _optional_str(data.get("base_branch")),
    )
 def _workflow_job_from_api(data: dict[str, object]) -> WorkflowJob:
    """Build a WorkflowJob from one job object of a Gitea Actions response.

    Raises:
        GiteaError: If the payload has no "id".
    """
    job_id = _optional_int(data.get("id"))
    if job_id is not None:
        return WorkflowJob(
            id=job_id,
            name=str(data.get("name", "")),
            run_id=_optional_int(data.get("run_id")),
            status=_optional_str(data.get("status")),
            conclusion=_optional_str(data.get("conclusion")),
        )
    msg = "Gitea workflow job payload is missing an ID"
    raise GiteaError(msg)
 def _optional_int(value: object) -> int | None:
    """Convert an API value to an integer when present."""
    if value is None:
        return None
    return int(value)
 def _optional_str(value: object) -> str | None:
    """Convert an API value to a string when present."""
    if value is None:
        return None
    return str(value)
@@ -1,148 +0,0 @@
 """Automation helpers for flake.lock pull requests on Gitea."""
 from __future__ import annotations
 import subprocess
 from os import getenv
 from typing import Annotated
 import typer
 from python.gitea import GiteaClient, PullRequest, split_repo_name
 # Default target branch for the automated pull request.
 DEFAULT_BASE_BRANCH = "main"
 # Default branch the lock-file update is committed and pushed to.
 DEFAULT_BRANCH = "automation/update-flake-lock"
 # Gitea server used when the GITEA_URL environment variable is not set.
 DEFAULT_GITEA_URL = "https://gitea.tmmworkshop.com"
 # Labels applied to a newly created flake.lock pull request.
 PR_LABELS = ["dependencies", "automated", "flake_lock_update"]
 # Workflows dispatched explicitly for the automation branch after the PR exists.
 PR_CHECK_WORKFLOWS = ["build_systems.yml", "treefmt.yml", "pytest.yml"]
 PR_TITLE = "Update flake.lock"
 PR_BODY = "Automated flake.lock update."
 # Typer CLI application; shell-completion options are disabled.
 app = typer.Typer(add_completion=False)
 def run_cmd(cmd: list[str], *, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Execute ``cmd`` and return the completed process with captured text output.

    Args:
        cmd: Program and arguments to run.
        check: When true, a non-zero exit raises ``subprocess.CalledProcessError``.
    """
    completed = subprocess.run(cmd, capture_output=True, text=True, check=check)
    return completed
 def ensure_flake_lock_pull_request(
    client: GiteaClient,
    *,
    owner: str,
    repo: str,
    branch: str,
    base: str,
 ) -> PullRequest:
    """Fetch the open flake.lock PR for ``branch``, creating it if none exists.

    Args:
        client: Gitea client used for the lookup and, if needed, the creation.
        owner: Repository owner.
        repo: Repository name.
        branch: Head branch of the pull request.
        base: Base branch used when a new pull request has to be created.

    Returns:
        The first open pull request whose head is ``branch``, or the newly
        created one.
    """
    existing = client.list_open_pull_requests(owner=owner, repo=repo, head=branch)
    if not existing:
        return client.create_pull_request(
            owner=owner,
            repo=repo,
            title=PR_TITLE,
            body=PR_BODY,
            head=branch,
            base=base,
            labels=PR_LABELS,
        )
    return existing[0]
 def find_flake_lock_pull_request(client: GiteaClient, *, owner: str, repo: str) -> PullRequest | None:
    """Return the first open PR labelled ``flake_lock_update``, or None if there is none."""
    matches = client.list_open_pull_requests(owner=owner, repo=repo, labels=["flake_lock_update"])
    return matches[0] if matches else None
 def dispatch_pull_request_checks(client: GiteaClient, *, owner: str, repo: str, branch: str) -> None:
    """Trigger every workflow in ``PR_CHECK_WORKFLOWS`` against ``branch``."""
    for workflow_id in PR_CHECK_WORKFLOWS:
        client.dispatch_workflow(owner=owner, repo=repo, workflow_id=workflow_id, ref=branch)
 def has_worktree_changes() -> bool:
    """Return whether `flake.lock` has worktree changes.

    Returns:
        True when ``git diff --quiet`` reports differences for flake.lock
        (exit status 1), False when it reports none (exit status 0).

    Raises:
        subprocess.CalledProcessError: If git exits with any other status,
            i.e. the diff itself failed and the result is unknown.
    """
    result = run_cmd(["git", "diff", "--quiet", "--", "flake.lock"], check=False)
    # 0 = no differences, 1 = differences; anything else is a git error and
    # must not be mistaken for "there are changes".
    if result.returncode in (0, 1):
        return result.returncode == 1
    raise subprocess.CalledProcessError(
        result.returncode,
        result.args,
        output=result.stdout,
        stderr=result.stderr,
    )
 def commit_flake_lock_update(*, branch: str) -> None:
    """Commit the modified flake.lock onto ``branch`` as the automation bot.

    The branch is created, or reset to the current HEAD, before committing.
    """
    git_steps = (
        ["git", "config", "user.name", "gitea-actions[bot]"],
        ["git", "config", "user.email", "gitea-actions@tmmworkshop.com"],
        ["git", "checkout", "-B", branch],
        ["git", "add", "flake.lock"],
        ["git", "commit", "-m", "chore: update flake.lock"],
    )
    # Each step must succeed before the next one runs; run_cmd raises on failure.
    for step in git_steps:
        run_cmd(step)
 def push_branch(*, branch: str) -> None:
    """Force-push the current HEAD to ``branch`` on the ``origin`` remote."""
    refspec = f"HEAD:{branch}"
    run_cmd(["git", "push", "origin", refspec, "--force"])
 def _required_gitea_token() -> str:
    """Read the required Gitea token from the environment."""
    token = getenv("GITEA_TOKEN")
    if token:
        return token
    msg = "GITEA_TOKEN environment variable is required"
    raise RuntimeError(msg)
@app.command()
 def update(
    repo: Annotated[str, typer.Option("--repo", help="Gitea repository in owner/repo form")],
    base: Annotated[str, typer.Option("--base", help="Base branch")] = DEFAULT_BASE_BRANCH,
    branch: Annotated[str, typer.Option("--branch", help="Automation branch")] = DEFAULT_BRANCH,
 ) -> None:
    """Commit flake.lock changes and ensure a pull request exists."""
    # Nothing to commit or push when flake.lock is unchanged in the worktree.
    if not has_worktree_changes():
        typer.echo("No flake.lock changes detected")
        return
    # Commit and push first, then look up or create the pull request for the branch.
    commit_flake_lock_update(branch=branch)
    push_branch(branch=branch)
    owner, repo_name = split_repo_name(repo)
    with GiteaClient(
        base_url=getenv("GITEA_URL", DEFAULT_GITEA_URL),
        token=_required_gitea_token(),
    ) as client:
        pull_request = ensure_flake_lock_pull_request(
            client,
            owner=owner,
            repo=repo_name,
            branch=branch,
            base=base,
        )
        # We can remove this if Gitea fixes the following issue:
        # https://github.com/go-gitea/gitea/issues/33963
        dispatch_pull_request_checks(client, owner=owner, repo=repo_name, branch=branch)
    # Print the PR URL, or its number when the API response carried no URL.
    typer.echo(pull_request.html_url or f"Pull request #{pull_request.number}")
@app.command()
 def merge(
    repo: Annotated[str, typer.Option("--repo", help="Gitea repository in owner/repo form")],
 ) -> None:
    """Merge the first open flake.lock pull request."""
    owner, repo_name = split_repo_name(repo)
    with GiteaClient(
        base_url=getenv("GITEA_URL", DEFAULT_GITEA_URL),
        token=_required_gitea_token(),
    ) as client:
        pull_request = find_flake_lock_pull_request(client, owner=owner, repo=repo_name)
        # No matching pull request is not an error: report it and exit normally.
        if not pull_request:
            typer.echo("No open PR found with label flake_lock_update")
            return
        client.merge_pull_request(owner=owner, repo=repo_name, number=pull_request.number, merge_method="rebase")
    typer.echo(f"Merged PR #{pull_request.number}")
 # Run the flake.lock automation CLI when this module is executed as a script.
 if __name__ == "__main__":
    app()
@@ -1,9 +1,13 @@
 """ORM package exports."""
 from python.orm.data_science_dev.base import DataScienceDevBase
 from python.orm.richie.base import RichieBase
 from python.orm.signal_bot.base import SignalBotBase
 from python.orm.van_inventory.base import VanInventoryBase
 __all__ = [
    "DataScienceDevBase",
    "RichieBase",
    "SignalBotBase",
    "VanInventoryBase",
 ]
@@ -0,0 +1,11 @@
 """Data science dev database ORM exports."""
 from __future__ import annotations
 from python.orm.data_science_dev.base import DataScienceDevBase, DataScienceDevTableBase, DataScienceDevTableBaseBig
 __all__ = [
    "DataScienceDevBase",
    "DataScienceDevTableBase",
    "DataScienceDevTableBaseBig",
 ]
@@ -0,0 +1,52 @@
 """Data science dev database ORM base."""
 from __future__ import annotations
 from datetime import datetime
 from sqlalchemy import BigInteger, DateTime, MetaData, func
 from sqlalchemy.ext.declarative import AbstractConcreteBase
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
 from python.orm.common import NAMING_CONVENTION
 class DataScienceDevBase(DeclarativeBase):
    """Base class for data_science_dev database ORM models.

    All models deriving from this base share one ``MetaData`` object whose
    default schema is ``main`` and which uses the project naming convention.
    """
    # Schema name applied to the shared metadata below.
    schema_name = "main"
    metadata = MetaData(
        schema=schema_name,
        naming_convention=NAMING_CONVENTION,
    )
 class _TableMixin:
    """Shared timestamp columns for all table bases."""
    # Timezone-aware creation timestamp; the server default is now().
    created: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )
    # Timezone-aware modification timestamp; defaults to now() on the server
    # and is set to now() again when SQLAlchemy emits an UPDATE for the row.
    updated: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
    )
 class DataScienceDevTableBase(_TableMixin, AbstractConcreteBase, DataScienceDevBase):
    """Table with Integer primary key.

    Abstract base: adds an ``id`` primary key plus the ``created``/``updated``
    timestamp columns from ``_TableMixin``.
    """
    # Marked abstract so no table is mapped for this base class itself.
    __abstract__ = True
    id: Mapped[int] = mapped_column(primary_key=True)
 class DataScienceDevTableBaseBig(_TableMixin, AbstractConcreteBase, DataScienceDevBase):
    """Table with BigInteger primary key.

    Abstract base: same as the Integer variant, but ``id`` is a BigInteger.
    """
    # Marked abstract so no table is mapped for this base class itself.
    __abstract__ = True
    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
@@ -0,0 +1,14 @@
 """init."""
 from python.orm.data_science_dev.congress.bill import Bill, BillText
 from python.orm.data_science_dev.congress.legislator import Legislator, LegislatorSocialMedia
 from python.orm.data_science_dev.congress.vote import Vote, VoteRecord
 __all__ = [
    "Bill",
    "BillText",
    "Legislator",
    "LegislatorSocialMedia",
    "Vote",
    "VoteRecord",
 ]
@@ -0,0 +1,66 @@
 """Bill model - legislation introduced in Congress."""
 from __future__ import annotations
 from datetime import date
 from typing import TYPE_CHECKING
 from sqlalchemy import ForeignKey, Index, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from python.orm.data_science_dev.base import DataScienceDevTableBase
 if TYPE_CHECKING:
    from python.orm.data_science_dev.congress.vote import Vote
 class Bill(DataScienceDevTableBase):
    """Legislation with congress number, type, titles, status, and sponsor.

    A bill is uniquely identified by ``(congress, bill_type, number)``; see
    ``__table_args__``. ``id``, ``created`` and ``updated`` come from the base class.
    """
    __tablename__ = "bill"
    # Natural key components (covered by the unique constraint below).
    congress: Mapped[int]
    bill_type: Mapped[str]
    number: Mapped[int]
    # Optional descriptive metadata.
    title: Mapped[str | None]
    title_short: Mapped[str | None]
    official_title: Mapped[str | None]
    status: Mapped[str | None]
    status_at: Mapped[date | None]
    # Plain nullable string: no ForeignKey to the legislator table is declared here.
    sponsor_bioguide_id: Mapped[str | None]
    subjects_top_term: Mapped[str | None]
    # Votes referencing this bill; no cascade is configured on this side.
    votes: Mapped[list[Vote]] = relationship(
        "Vote",
        back_populates="bill",
    )
    # Text versions are owned by the bill: removed from the collection => deleted.
    bill_texts: Mapped[list[BillText]] = relationship(
        "BillText",
        back_populates="bill",
        cascade="all, delete-orphan",
    )
    __table_args__ = (
        UniqueConstraint("congress", "bill_type", "number", name="uq_bill_congress_type_number"),
        Index("ix_bill_congress", "congress"),
    )
 class BillText(DataScienceDevTableBase):
    """Stores different text versions of a bill (introduced, enrolled, etc.).

    Each row belongs to one ``Bill``; ``(bill_id, version_code)`` is unique.
    """
    __tablename__ = "bill_text"
    # Rows are removed by the database when the parent bill is deleted (ON DELETE CASCADE).
    bill_id: Mapped[int] = mapped_column(ForeignKey("main.bill.id", ondelete="CASCADE"))
    version_code: Mapped[str]
    version_name: Mapped[str | None]
    text_content: Mapped[str | None]
    # NOTE(review): this attribute shares its name with the imported ``datetime.date`` type
    # used in its own annotation; it relies on the annotation being resolved against the
    # module namespace rather than the class body - confirm with the SQLAlchemy version in use.
    date: Mapped[date | None]
    bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts")
    __table_args__ = (UniqueConstraint("bill_id", "version_code", name="uq_bill_text_bill_id_version_code"),)
@@ -0,0 +1,66 @@
 """Legislator model - members of Congress."""
 from __future__ import annotations
 from datetime import date
 from typing import TYPE_CHECKING
 from sqlalchemy import ForeignKey, Text
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from python.orm.data_science_dev.base import DataScienceDevTableBase
 if TYPE_CHECKING:
    from python.orm.data_science_dev.congress.vote import VoteRecord
 class Legislator(DataScienceDevTableBase):
    """Members of Congress with identification and current term info.

    ``bioguide_id`` is the unique, indexed external identifier; the remaining
    ``*_id`` columns are optional identifiers for the same person.
    """
    __tablename__ = "legislator"
    # External identifiers.
    bioguide_id: Mapped[str] = mapped_column(Text, unique=True, index=True)
    thomas_id: Mapped[str | None]
    lis_id: Mapped[str | None]
    govtrack_id: Mapped[int | None]
    opensecrets_id: Mapped[str | None]
    # Stored as a single nullable string (encoding of multiple ids is not visible here).
    fec_ids: Mapped[str | None]
    # Personal details.
    first_name: Mapped[str]
    last_name: Mapped[str]
    official_full_name: Mapped[str | None]
    nickname: Mapped[str | None]
    birthday: Mapped[date | None]
    gender: Mapped[str | None]
    # Current term information.
    current_party: Mapped[str | None]
    current_state: Mapped[str | None]
    current_district: Mapped[int | None]
    current_chamber: Mapped[str | None]
    # Owned collections: orphaned children are deleted by the ORM.
    social_media_accounts: Mapped[list[LegislatorSocialMedia]] = relationship(
        "LegislatorSocialMedia",
        back_populates="legislator",
        cascade="all, delete-orphan",
    )
    vote_records: Mapped[list[VoteRecord]] = relationship(
        "VoteRecord",
        back_populates="legislator",
        cascade="all, delete-orphan",
    )
 class LegislatorSocialMedia(DataScienceDevTableBase):
    """Social media account linked to a legislator."""
    __tablename__ = "legislator_social_media"
    # NOTE(review): unlike ``bill_text.bill_id`` and ``vote_record.legislator_id`` this
    # foreign key declares no ``ondelete``; deletes issued outside the ORM cascade on
    # ``Legislator.social_media_accounts`` would not remove these rows - confirm intent.
    legislator_id: Mapped[int] = mapped_column(ForeignKey("main.legislator.id"))
    platform: Mapped[str]
    account_name: Mapped[str]
    url: Mapped[str | None]
    source: Mapped[str]
    legislator: Mapped[Legislator] = relationship(back_populates="social_media_accounts")
@@ -0,0 +1,79 @@
 """Vote model - roll call votes in Congress."""
 from __future__ import annotations
 from datetime import date
 from typing import TYPE_CHECKING
 from sqlalchemy import ForeignKey, Index, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from python.orm.data_science_dev.base import DataScienceDevBase, DataScienceDevTableBase
 if TYPE_CHECKING:
    from python.orm.data_science_dev.congress.bill import Bill
    from python.orm.data_science_dev.congress.legislator import Legislator
    from python.orm.data_science_dev.congress.vote import Vote
 class VoteRecord(DataScienceDevBase):
    """Links a vote to a legislator with their position (Yea, Nay, etc.).

    Association row keyed by the composite primary key ``(vote_id, legislator_id)``.
    It derives from ``DataScienceDevBase`` directly and declares no surrogate ``id``.
    """
    __tablename__ = "vote_record"
    # Both key columns cascade at the database level when the referenced row is deleted.
    vote_id: Mapped[int] = mapped_column(
        ForeignKey("main.vote.id", ondelete="CASCADE"),
        primary_key=True,
    )
    legislator_id: Mapped[int] = mapped_column(
        ForeignKey("main.legislator.id", ondelete="CASCADE"),
        primary_key=True,
    )
    position: Mapped[str]
    vote: Mapped[Vote] = relationship("Vote", back_populates="vote_records")
    legislator: Mapped[Legislator] = relationship("Legislator", back_populates="vote_records")
 class Vote(DataScienceDevTableBase):
    """Roll call votes with counts and optional bill linkage.

    A vote is uniquely identified by ``(congress, chamber, session, number)``.
    """
    __tablename__ = "vote"
    # Natural key components (covered by the unique constraint below).
    congress: Mapped[int]
    chamber: Mapped[str]
    session: Mapped[int]
    number: Mapped[int]
    # Optional descriptive fields.
    vote_type: Mapped[str | None]
    question: Mapped[str | None]
    result: Mapped[str | None]
    result_text: Mapped[str | None]
    vote_date: Mapped[date]
    # Optional tallies.
    yea_count: Mapped[int | None]
    nay_count: Mapped[int | None]
    not_voting_count: Mapped[int | None]
    present_count: Mapped[int | None]
    # Nullable: a vote does not have to be tied to a bill.
    bill_id: Mapped[int | None] = mapped_column(ForeignKey("main.bill.id"))
    bill: Mapped[Bill | None] = relationship("Bill", back_populates="votes")
    # Per-legislator positions are owned by the vote (orphans are deleted by the ORM).
    vote_records: Mapped[list[VoteRecord]] = relationship(
        "VoteRecord",
        back_populates="vote",
        cascade="all, delete-orphan",
    )
    __table_args__ = (
        UniqueConstraint(
            "congress",
            "chamber",
            "session",
            "number",
            name="uq_vote_congress_chamber_session_number",
        ),
        Index("ix_vote_date", "vote_date"),
        Index("ix_vote_congress_chamber", "congress", "chamber"),
    )
@@ -0,0 +1,16 @@
 """Data science dev database ORM models."""
 from __future__ import annotations
 from python.orm.data_science_dev.congress import Bill, BillText, Legislator, Vote, VoteRecord
 from python.orm.data_science_dev.posts import partitions  # noqa: F401 — registers partition classes in metadata
 from python.orm.data_science_dev.posts.tables import Posts
 __all__ = [
    "Bill",
    "BillText",
    "Legislator",
    "Posts",
    "Vote",
    "VoteRecord",
 ]
@@ -0,0 +1,11 @@
 """Posts module — weekly-partitioned posts table and partition ORM models."""
 from __future__ import annotations
 from python.orm.data_science_dev.posts.failed_ingestion import FailedIngestion
 from python.orm.data_science_dev.posts.tables import Posts
 __all__ = [
    "FailedIngestion",
    "Posts",
 ]
@@ -0,0 +1,33 @@
 """Shared column definitions for the posts partitioned table family."""
 from __future__ import annotations
 from datetime import datetime
 from sqlalchemy import BigInteger, SmallInteger, Text
 from sqlalchemy.orm import Mapped, mapped_column
 class PostsColumns:
    """Mixin providing all posts columns. Used by both the parent table and partitions.

    The primary key is the composite ``(post_id, date)``.
    """
    post_id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    user_id: Mapped[int] = mapped_column(BigInteger)
    instance: Mapped[str]
    # Part of the primary key; the parent ``posts`` table is range-partitioned on this column.
    date: Mapped[datetime] = mapped_column(primary_key=True)
    text: Mapped[str] = mapped_column(Text)
    langs: Mapped[str | None]
    # Engagement counters.
    like_count: Mapped[int]
    reply_count: Mapped[int]
    repost_count: Mapped[int]
    # Optional BigInteger references to other posts/authors; no ForeignKey is declared on them.
    reply_to: Mapped[int | None] = mapped_column(BigInteger)
    replied_author: Mapped[int | None] = mapped_column(BigInteger)
    thread_root: Mapped[int | None] = mapped_column(BigInteger)
    thread_root_author: Mapped[int | None] = mapped_column(BigInteger)
    repost_from: Mapped[int | None] = mapped_column(BigInteger)
    reposted_author: Mapped[int | None] = mapped_column(BigInteger)
    quotes: Mapped[int | None] = mapped_column(BigInteger)
    quoted_author: Mapped[int | None] = mapped_column(BigInteger)
    labels: Mapped[str | None]
    # presumably sentiment label/score - TODO confirm against the ingestion code.
    sent_label: Mapped[int | None] = mapped_column(SmallInteger)
    sent_score: Mapped[float | None]
@@ -0,0 +1,17 @@
 """Table for storing JSONL lines that failed during post ingestion."""
 from __future__ import annotations
 from sqlalchemy import Text
 from sqlalchemy.orm import Mapped, mapped_column
 from python.orm.data_science_dev.base import DataScienceDevTableBase
 class FailedIngestion(DataScienceDevTableBase):
    """Stores raw JSONL lines and their error messages when ingestion fails.

    ``id``, ``created`` and ``updated`` are inherited from ``DataScienceDevTableBase``.
    """
    __tablename__ = "failed_ingestion"
    # The offending input line, stored verbatim as unbounded text.
    raw_line: Mapped[str] = mapped_column(Text)
    # The error message recorded for that line.
    error: Mapped[str] = mapped_column(Text)
@@ -0,0 +1,71 @@
 """Dynamically generated ORM classes for each weekly partition of the posts table.
 Each class maps to a PostgreSQL partition table (e.g. posts_2024_01).
 These are real ORM models tracked by Alembic autogenerate.
 Uses ISO week numbering (datetime.isocalendar().week). ISO years can have
 52 or 53 weeks, and week boundaries are always Monday to Monday.
 """
 from __future__ import annotations
 import sys
 from datetime import UTC, datetime
 from python.orm.data_science_dev.base import DataScienceDevBase
 from python.orm.data_science_dev.posts.columns import PostsColumns
 PARTITION_START_YEAR = 2023
 PARTITION_END_YEAR = 2026
 _current_module = sys.modules[__name__]
 def iso_weeks_in_year(year: int) -> int:
    """Return the number of ISO weeks in a given year (52 or 53)."""
    dec_28 = datetime(year, 12, 28, tzinfo=UTC)
    return dec_28.isocalendar().week
 def week_bounds(year: int, week: int) -> tuple[datetime, datetime]:
    """Compute the half-open UTC interval covered by an ISO week.

    Returns:
        ``(start, end)`` where ``start`` is Monday 00:00:00 UTC of the given
        ISO week and ``end`` is Monday 00:00:00 UTC of the week after it
        (week 1 of the next ISO year when ``week`` is the year's last week).
    """
    monday = datetime.fromisocalendar(year, week, 1).replace(tzinfo=UTC)
    # Roll over into the next ISO year once the final week has been reached.
    has_following_week = week < iso_weeks_in_year(year)
    next_year, next_week = (year, week + 1) if has_following_week else (year + 1, 1)
    following_monday = datetime.fromisocalendar(next_year, next_week, 1).replace(tzinfo=UTC)
    return monday, following_monday
 def _build_partition_classes() -> dict[str, type]:
    """Create one ORM class for every ISO week in the configured year range.

    Returns:
        Mapping of class name (``PostsWeek<year>W<week>``) to the generated class,
        in chronological order from ``PARTITION_START_YEAR`` through
        ``PARTITION_END_YEAR`` inclusive.
    """
    year_weeks = (
        (year, week)
        for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1)
        for week in range(1, iso_weeks_in_year(year) + 1)
    )
    partitions: dict[str, type] = {}
    for year, week in year_weeks:
        week_suffix = f"{week:02d}"
        model_name = f"PostsWeek{year}W{week_suffix}"
        namespace = {
            "__tablename__": f"posts_{year}_{week_suffix}",
            "__table_args__": ({"implicit_returning": False},),
        }
        partitions[model_name] = type(model_name, (PostsColumns, DataScienceDevBase), namespace)
    return partitions
 # Generate all partition classes once at import time and expose each one as a
 # module attribute so it can be imported by name (e.g. ``PostsWeek2024W01``).
 _partition_classes = _build_partition_classes()
 for _name, _cls in _partition_classes.items():
    setattr(_current_module, _name, _cls)
 # Only the generated class names are exported by ``from ... import *``.
 __all__ = list(_partition_classes.keys())
@@ -0,0 +1,13 @@
 """Posts parent table with PostgreSQL weekly range partitioning on date column."""
 from __future__ import annotations
 from python.orm.data_science_dev.base import DataScienceDevBase
 from python.orm.data_science_dev.posts.columns import PostsColumns
 class Posts(PostsColumns, DataScienceDevBase):
    """Parent partitioned table for posts, partitioned by week on `date`.

    All columns come from the ``PostsColumns`` mixin.
    """
    __tablename__ = "posts"
    # PostgreSQL dialect option: declares the table as range-partitioned on ``date``.
    __table_args__ = ({"postgresql_partition_by": "RANGE (date)"},)
@@ -2,7 +2,6 @@
 from __future__ import annotations
 from python.orm.richie.audiobook import Audiobook, AudiobookAuthor, AudiobookSeries
 from python.orm.richie.base import RichieBase, TableBase, TableBaseBig, TableBaseSmall
 from python.orm.richie.contact import (
    Contact,
@@ -13,9 +12,6 @@ from python.orm.richie.contact import (
 )
 __all__ = [
    "Audiobook",
    "AudiobookAuthor",
    "AudiobookSeries",
    "Contact",
    "ContactNeed",
    "ContactRelationship",
@@ -1,55 +0,0 @@
 """Audiobook catalog models."""
 from __future__ import annotations
 from sqlalchemy import ForeignKey, String, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from python.orm.richie.base import TableBase
 class AudiobookAuthor(TableBase):
    """Canonical audiobook author, unique by ``name``."""
    __tablename__ = "audiobook_author"
    # NOTE(review): uniqueness of ``name`` is declared both here and via ``unique=True``
    # on the column below - one of the two looks redundant; confirm.
    __table_args__ = (UniqueConstraint("name"),)
    name: Mapped[str] = mapped_column(String, unique=True)
    books: Mapped[list[Audiobook]] = relationship("Audiobook", back_populates="author")
    series: Mapped[list[AudiobookSeries]] = relationship("AudiobookSeries", back_populates="author")
 class AudiobookSeries(TableBase):
    """Canonical audiobook series, unique per ``(author_id, name)``."""
    __tablename__ = "audiobook_series"
    __table_args__ = (UniqueConstraint("author_id", "name"),)
    name: Mapped[str] = mapped_column(String)
    # Series rows are removed by the database when their author is deleted.
    author_id: Mapped[int] = mapped_column(ForeignKey("main.audiobook_author.id", ondelete="CASCADE"))
    author: Mapped[AudiobookAuthor] = relationship("AudiobookAuthor", back_populates="series")
    books: Mapped[list[Audiobook]] = relationship("Audiobook", back_populates="series")
 class Audiobook(TableBase):
    """Canonical audiobook title, unique per ``(author_id, series_id, title)``."""
    __tablename__ = "audiobook"
    __table_args__ = (
        # ``postgresql_nulls_not_distinct`` makes NULL ``series_id`` values compare equal,
        # so two series-less books with the same author and title also conflict.
        UniqueConstraint(
            "author_id",
            "series_id",
            "title",
            postgresql_nulls_not_distinct=True,
        ),
    )
    title: Mapped[str] = mapped_column(String)
    # Deleting the author deletes the book; deleting the series only clears ``series_id``.
    author_id: Mapped[int] = mapped_column(ForeignKey("main.audiobook_author.id", ondelete="CASCADE"))
    series_id: Mapped[int | None] = mapped_column(ForeignKey("main.audiobook_series.id", ondelete="SET NULL"))
    # Client-side default of 0.0 when no index is supplied.
    series_index: Mapped[float] = mapped_column(default=0.0)
    author: Mapped[AudiobookAuthor] = relationship("AudiobookAuthor", back_populates="books")
    series: Mapped[AudiobookSeries | None] = relationship("AudiobookSeries", back_populates="books")
@@ -0,0 +1,16 @@
 """Signal bot database ORM exports."""
 from __future__ import annotations
 from python.orm.signal_bot.base import SignalBotBase, SignalBotTableBase, SignalBotTableBaseSmall
 from python.orm.signal_bot.models import DeadLetterMessage, DeviceRole, RoleRecord, SignalDevice
 __all__ = [
    "DeadLetterMessage",
    "DeviceRole",
    "RoleRecord",
    "SignalBotBase",
    "SignalBotTableBase",
    "SignalBotTableBaseSmall",
    "SignalDevice",
 ]
@@ -0,0 +1,52 @@
 """Signal bot database ORM base."""
 from __future__ import annotations
 from datetime import datetime
 from sqlalchemy import DateTime, MetaData, SmallInteger, func
 from sqlalchemy.ext.declarative import AbstractConcreteBase
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
 from python.orm.common import NAMING_CONVENTION
 class SignalBotBase(DeclarativeBase):
    """Declarative root for every ORM model in the signal_bot database.

    Models derived from this class share a single ``MetaData`` object that
    targets the ``main`` schema and uses ``NAMING_CONVENTION`` for constraint
    and index names.
    """

    schema_name = "main"
    metadata = MetaData(naming_convention=NAMING_CONVENTION, schema=schema_name)
 class _TableMixin:
    """Mixin contributing timezone-aware ``created`` and ``updated`` columns.

    Both columns get a server-side ``now()`` default; ``updated`` additionally
    carries ``onupdate=func.now()`` so it is refreshed on UPDATE statements.
    """

    created: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
    updated: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        onupdate=func.now(),
        server_default=func.now(),
    )
 class SignalBotTableBaseSmall(_TableMixin, AbstractConcreteBase, SignalBotBase):
    """Abstract table base with a SmallInteger primary key.

    Subclasses also get ``created``/``updated`` from ``_TableMixin``.
    """
    # Marked abstract so no table is mapped for this base itself.
    __abstract__ = True
    id: Mapped[int] = mapped_column(SmallInteger, primary_key=True)
 class SignalBotTableBase(_TableMixin, AbstractConcreteBase, SignalBotBase):
    """Abstract table base with an Integer primary key.

    Subclasses also get ``created``/``updated`` from ``_TableMixin``.
    """
    # Marked abstract so no table is mapped for this base itself.
    __abstract__ = True
    # No explicit column type: the type is derived from the ``Mapped[int]`` annotation.
    id: Mapped[int] = mapped_column(primary_key=True)
@@ -0,0 +1,62 @@
 """Signal bot device, role, and dead letter ORM models."""
 from __future__ import annotations
 from datetime import datetime
 from sqlalchemy import DateTime, Enum, ForeignKey, SmallInteger, String, Text, UniqueConstraint
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from python.orm.signal_bot.base import SignalBotTableBase, SignalBotTableBaseSmall
 from python.signal_bot.models import MessageStatus, TrustLevel
 class RoleRecord(SignalBotTableBaseSmall):
    """Lookup table for RBAC roles, keyed by smallint."""
    __tablename__ = "role"
    # Role names are unique and limited to 50 characters.
    name: Mapped[str] = mapped_column(String(50), unique=True)
 class DeviceRole(SignalBotTableBase):
    """Association between a device and a role.

    Each ``(device_id, role_id)`` pair may appear only once. This table is also
    used as the ``secondary`` table of ``SignalDevice.roles``.
    """
    __tablename__ = "device_role"
    __table_args__ = (
        UniqueConstraint("device_id", "role_id", name="uq_device_role_device_role"),
        # Explicit schema; same value as ``SignalBotBase.schema_name``.
        {"schema": "main"},
    )
    device_id: Mapped[int] = mapped_column(ForeignKey("main.signal_device.id"))
    # SmallInteger to match the primary key type of the ``role`` lookup table.
    role_id: Mapped[int] = mapped_column(SmallInteger, ForeignKey("main.role.id"))
 class SignalDevice(SignalBotTableBase):
    """A Signal device tracked by phone number and safety number."""
    __tablename__ = "signal_device"
    phone_number: Mapped[str] = mapped_column(String(50), unique=True)
    safety_number: Mapped[str | None]
    # Non-native enum without a CHECK constraint; new rows default to ``TrustLevel.UNVERIFIED``.
    trust_level: Mapped[TrustLevel] = mapped_column(
        Enum(TrustLevel, name="trust_level", create_constraint=False, native_enum=False),
        default=TrustLevel.UNVERIFIED,
    )
    last_seen: Mapped[datetime] = mapped_column(DateTime(timezone=True))
    # Many-to-many to roles through the ``device_role`` association table.
    roles: Mapped[list[RoleRecord]] = relationship(secondary=DeviceRole.__table__)
 class DeadLetterMessage(SignalBotTableBase):
    """A Signal message that failed processing and was sent to the dead letter queue."""
    __tablename__ = "dead_letter_message"
    source: Mapped[str]
    # Message body stored as unbounded text.
    message: Mapped[str] = mapped_column(Text)
    received_at: Mapped[datetime] = mapped_column(DateTime(timezone=True))
    # Non-native enum without a CHECK constraint; new rows default to ``MessageStatus.UNPROCESSED``.
    status: Mapped[MessageStatus] = mapped_column(
        Enum(MessageStatus, name="message_status", create_constraint=False, native_enum=False),
        default=MessageStatus.UNPROCESSED,
    )
@@ -0,0 +1,25 @@
 # Unsloth fine-tuning container for Qwen 3.5 4B on RTX 3090.
 #
 # Build:
 #   docker build -f python/prompt_bench/Dockerfile.finetune -t bill-finetune .
 #
 # Run:
 #   docker run --rm --device=nvidia.com/gpu=all --ipc=host \
 #     -v $(pwd)/output:/workspace/output \
 #     -v $(pwd)/output/finetune_dataset.jsonl:/workspace/dataset.jsonl:ro \
 #     -v /zfs/models/hf:/models \
 #     bill-finetune \
 #     --dataset /workspace/dataset.jsonl \
 #     --output-dir /workspace/output/qwen-bill-summarizer
 # NOTE(review): the base image is tracked by the floating `latest` tag, so rebuilds
 # are not reproducible - consider pinning a specific tag or digest.
 FROM ghcr.io/unslothai/unsloth:latest
 # typer is the only extra dependency installed on top of the base image.
 RUN pip install --no-cache-dir typer
 WORKDIR /workspace
 # Copy only the modules needed to run `python -m python.prompt_bench.finetune`,
 # keeping the package layout (python/, python/prompt_bench/) intact.
 COPY python/prompt_bench/finetune.py python/prompt_bench/finetune.py
 COPY python/prompt_bench/summarization_prompts.py python/prompt_bench/summarization_prompts.py
 COPY python/prompt_bench/__init__.py python/prompt_bench/__init__.py
 COPY python/__init__.py python/__init__.py
 # Arguments given to `docker run` after the image name are passed to the fine-tune CLI.
 ENTRYPOINT ["python", "-m", "python.prompt_bench.finetune"]
@@ -0,0 +1 @@
 """Prompt benchmarking system for evaluating LLMs via vLLM."""
@@ -0,0 +1,233 @@
 """Submit an OpenAI Batch API bill-summarization job over compressed text.
 Reads the first N bills from a CSV with a `text_content` column, compresses
 each via `bill_token_compression.compress_bill_text`, builds a JSONL file of
 summarization requests, and submits it as an asynchronous Batch API job
 against `/v1/chat/completions`. Also writes a CSV of per-bill pre/post-
 compression token counts.
 """
 from __future__ import annotations
 import csv
 import json
 import logging
 import re
 import sys
 from os import getenv
 from pathlib import Path
 from typing import Annotated
 import httpx
 import typer
 from tiktoken import Encoding, get_encoding
 from python.prompt_bench.bill_token_compression import compress_bill_text
 from python.prompt_bench.summarization_prompts import SUMMARIZATION_SYSTEM_PROMPT, SUMMARIZATION_USER_TEMPLATE
 logger = logging.getLogger(__name__)
 OPENAI_API_BASE = "https://api.openai.com/v1"
 def load_bills(csv_path: Path, count: int = 0) -> list[tuple[str, str]]:
    """Read bills that carry text from a CSV file.

    Args:
        csv_path: CSV file with a ``text_content`` column and optional
            ``bill_id``/``id`` and ``version_code`` columns.
        count: Maximum number of bills to return; 0 or negative means no limit.

    Returns:
        ``(unique_id, text_content)`` tuples in file order. Rows whose text is
        empty after stripping are skipped. The id is ``bill_id`` (or ``id``, or
        a ``row-N`` fallback), suffixed with ``-<version_code>`` when present.
    """
    # Bill texts can exceed the csv module's default per-field size limit.
    csv.field_size_limit(sys.maxsize)
    limit = count if count > 0 else None
    selected: list[tuple[str, str]] = []
    with csv_path.open(newline="", encoding="utf-8") as csv_file:
        for record in csv.DictReader(csv_file):
            body = (record.get("text_content") or "").strip()
            if body:
                identifier = record.get("bill_id") or record.get("id") or f"row-{len(selected)}"
                version = record.get("version_code") or ""
                if version:
                    identifier = f"{identifier}-{version}"
                selected.append((identifier, body))
                if limit is not None and len(selected) >= limit:
                    break
    return selected
 def safe_filename(value: str) -> str:
    """Sanitize ``value`` for use as a filename or batch ``custom_id``.

    Every run of characters outside ``A-Z a-z 0-9 . _ -`` becomes one
    underscore, leading/trailing underscores are dropped, and ``"unnamed"`` is
    returned when nothing is left.
    """
    sanitized = re.sub(r"[^A-Za-z0-9._-]+", "_", value)
    trimmed = sanitized.strip("_")
    return trimmed if trimmed else "unnamed"
 def build_request(custom_id: str, model: str, bill_text: str) -> dict:
    """Assemble one request line for the OpenAI Batch API.

    Args:
        custom_id: Identifier used to match the batch output to this request.
        model: Model id placed in the request body.
        bill_text: Bill text substituted into the user prompt template.

    Returns:
        A dict describing a POST to ``/v1/chat/completions`` with a system
        message and a user message.
    """
    system_message = {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT}
    user_message = {"role": "user", "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text)}
    body = {"model": model, "messages": [system_message, user_message]}
    return {"custom_id": custom_id, "method": "POST", "url": "/v1/chat/completions", "body": body}
 def write_jsonl(path: Path, lines: list[dict]) -> None:
    """Serialize ``lines`` to ``path`` as UTF-8 JSONL, one JSON object per line.

    Non-ASCII characters are written as-is rather than ``\\u`` escaped.
    """
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(f"{json.dumps(record, ensure_ascii=False)}\n" for record in lines)
 def upload_file(client: httpx.Client, path: Path) -> str:
    """Send a JSONL file to the OpenAI Files API with purpose ``batch``.

    Args:
        client: HTTP client used for the request (the caller supplies auth headers).
        path: JSONL file to upload.

    Returns:
        The ``id`` field of the JSON response.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    endpoint = f"{OPENAI_API_BASE}/files"
    with path.open("rb") as payload:
        multipart = {"file": (path.name, payload, "application/jsonl")}
        response = client.post(endpoint, files=multipart, data={"purpose": "batch"})
    response.raise_for_status()
    uploaded = response.json()
    return uploaded["id"]
 def prepare_requests(
    bills: list[tuple[str, str]],
    *,
    model: str,
    encoder: Encoding,
 ) -> tuple[list[dict], list[dict]]:
    """Compress every bill and build its batch request plus token statistics.

    Args:
        bills: ``(bill_id, text_content)`` tuples.
        model: Model id placed in each request.
        encoder: Tokenizer used to count tokens before and after compression.

    Returns:
        ``(request_lines, token_rows)``. ``request_lines`` holds one batch
        request per bill built from the compressed text; ``token_rows`` holds
        one dict per bill with character counts, token counts and the
        compressed/raw token ratio (``None`` when the raw text has no tokens).
    """
    request_lines: list[dict] = []
    token_rows: list[dict] = []
    for bill_id, original_text in bills:
        tokens_before = len(encoder.encode(original_text))
        shrunk_text = compress_bill_text(original_text)
        tokens_after = len(encoder.encode(shrunk_text))
        ratio = (tokens_after / tokens_before) if tokens_before else None
        stats = {
            "bill_id": bill_id,
            "raw_chars": len(original_text),
            "compressed_chars": len(shrunk_text),
            "raw_tokens": tokens_before,
            "compressed_tokens": tokens_after,
            "token_ratio": ratio,
        }
        token_rows.append(stats)
        # The custom_id is the sanitized bill id; the CSV row keeps the original id.
        request_lines.append(build_request(safe_filename(bill_id), model, shrunk_text))
    return request_lines, token_rows
 def write_token_csv(path: Path, token_rows: list[dict]) -> tuple[int, int]:
    """Dump per-bill token statistics to a CSV file and total the token counts.

    Args:
        path: Destination CSV path (overwritten).
        token_rows: One dict per bill containing the six columns written below.

    Returns:
        ``(raw_total, compressed_total)``: the summed ``raw_tokens`` and
        ``compressed_tokens`` over all rows.
    """
    columns = ["bill_id", "raw_chars", "compressed_chars", "raw_tokens", "compressed_tokens", "token_ratio"]
    with path.open("w", newline="", encoding="utf-8") as sink:
        writer = csv.DictWriter(sink, fieldnames=columns)
        writer.writeheader()
        for row in token_rows:
            writer.writerow(row)
    raw_total = 0
    compressed_total = 0
    for row in token_rows:
        raw_total += row["raw_tokens"]
        compressed_total += row["compressed_tokens"]
    return raw_total, compressed_total
 def create_batch(client: httpx.Client, input_file_id: str, description: str) -> dict:
    """Start a Batch API job for a previously uploaded request file.

    Args:
        client: HTTP client used for the request (the caller supplies auth headers).
        input_file_id: File id returned by the Files API upload.
        description: Free-text description stored in the batch metadata.

    Returns:
        The decoded JSON response describing the created batch.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    payload = {
        "input_file_id": input_file_id,
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
        "metadata": {"description": description},
    }
    response = client.post(f"{OPENAI_API_BASE}/batches", json=payload)
    response.raise_for_status()
    return response.json()
 def main(
    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write JSONL + metadata")] = Path(
        "output/openai_batch",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = "gpt-5-mini",
    count: Annotated[int, typer.Option(help="Max bills to process, 0 = all")] = 0,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Submit an OpenAI Batch job of compressed bill summaries.

    Loads bills from ``csv_path``, writes ``token_counts.csv`` and
    ``requests.jsonl`` into ``output_dir``, uploads the JSONL, creates the
    batch and finally writes ``batch.json`` with the batch metadata.

    Args:
        csv_path: Bills CSV with a ``text_content`` column.
        output_dir: Directory for the generated files (created if missing).
        model: OpenAI model id used in every request.
        count: Maximum number of bills to process; 0 or negative means all.
        log_level: Logging level name (case-insensitive).

    Raises:
        typer.BadParameter: If no API key is set, the CSV is missing, or the
            CSV contains no bill with text.
        httpx.HTTPStatusError: If the upload or batch creation fails.
    """
    # Level names are upper-case in ``logging``; accept e.g. ``--log-level info`` too.
    logging.basicConfig(level=log_level.upper(), format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
        message = "Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set"
        raise typer.BadParameter(message)
    if not csv_path.is_file():
        message = f"CSV not found: {csv_path}"
        raise typer.BadParameter(message)
    output_dir.mkdir(parents=True, exist_ok=True)
    # ``count <= 0`` means "no limit", so don't claim we are loading 0 bills.
    if count > 0:
        logger.info("Loading %d bills from %s", count, csv_path)
    else:
        logger.info("Loading all bills from %s", csv_path)
    bills = load_bills(csv_path, count)
    if not bills:
        # Fail before any network call: an empty request file is not a valid batch input.
        message = f"No bills with non-empty text_content found in {csv_path}"
        raise typer.BadParameter(message)
    if count > 0 and len(bills) < count:
        logger.warning("Only %d bills available (requested %d)", len(bills), count)
    encoder = get_encoding("o200k_base")
    request_lines, token_rows = prepare_requests(bills, model=model, encoder=encoder)
    token_csv_path = output_dir / "token_counts.csv"
    raw_tokens_total, compressed_tokens_total = write_token_csv(token_csv_path, token_rows)
    logger.info(
        "Token counts: raw=%d compressed=%d ratio=%.3f -> %s",
        raw_tokens_total,
        compressed_tokens_total,
        (compressed_tokens_total / raw_tokens_total) if raw_tokens_total else 0.0,
        token_csv_path,
    )
    jsonl_path = output_dir / "requests.jsonl"
    write_jsonl(jsonl_path, request_lines)
    logger.info("Wrote %s (%d bills)", jsonl_path, len(request_lines))
    headers = {"Authorization": f"Bearer {api_key}"}
    with httpx.Client(headers=headers, timeout=httpx.Timeout(300.0)) as client:
        logger.info("Uploading JSONL")
        file_id = upload_file(client, jsonl_path)
        logger.info("Uploaded: %s", file_id)
        logger.info("Creating batch")
        batch = create_batch(client, file_id, f"compressed bill summaries x{len(request_lines)} ({model})")
        logger.info("Batch created: %s", batch["id"])
    metadata = {
        "model": model,
        "count": len(bills),
        "jsonl": str(jsonl_path),
        "input_file_id": file_id,
        "batch_id": batch["id"],
        "raw_tokens_total": raw_tokens_total,
        "compressed_tokens_total": compressed_tokens_total,
        "batch": batch,
    }
    metadata_path = output_dir / "batch.json"
    # Explicit encoding, consistent with the other files written by this script.
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    logger.info("Wrote metadata to %s", metadata_path)
 def cli() -> None:
    """Typer entry point: hands ``main`` to ``typer.run`` so its parameters become CLI options."""
    typer.run(main)
 if __name__ == "__main__":
    cli()
@@ -0,0 +1,162 @@
 """Lossless-ish text compression for Congressional bill text."""
 from __future__ import annotations
 import re
 STATES = (
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
    "Puerto Rico",
    "Guam",
    "American Samoa",
    "District of Columbia",
    "US Virgin Islands",
 )
 # Each pattern is anchored on word boundaries so a state name is only rewritten
 # when it appears as a whole word. Without the anchors "Kansas" also matched the
 # tail of "Arkansas" (yielding "ArKansas") and "Maine" matched inside "remained".
 STATE_PATTERNS = [(re.compile(rf"\b{re.escape(state)}\b", re.IGNORECASE), state) for state in STATES]
 def normalize_state_names(text: str) -> str:
    """Rewrite any casing of a whole-word state/territory name to its canonical form.

    Args:
        text: Bill text to normalize.

    Returns:
        ``text`` with every whole-word, case-insensitive occurrence of a name
        in ``STATES`` replaced by the spelling used in ``STATES``. Words that
        merely contain a state name are left untouched.
    """
    for pattern, replacement in STATE_PATTERNS:
        text = pattern.sub(replacement, text)
    return text
 def strip_number_commas(text: str) -> str:
    """Remove thousands-separator commas from well-formed grouped numbers.

    Only complete groupings such as ``1,234`` or ``12,345,678`` are rewritten.
    The lookarounds keep the pattern from matching a slice of a longer digit
    run, so malformed sequences like ``1234,567`` or ``1,2345`` are left as-is
    instead of being silently fused into a different number.

    Args:
        text: Bill text to process.

    Returns:
        ``text`` with the commas removed from every grouped number.
    """
    grouped_number = r"(?<!\d)(?<!\d,)\d{1,3}(?:,\d{3})+(?!\d)(?!,\d)"
    return re.sub(grouped_number, lambda match: match.group().replace(",", ""), text)
 def strip_horizontal_rules(text: str) -> str:
    """Blank out lines that consist only of three or more ``_``, ``-``, ``=`` or ``*`` characters.

    Surrounding whitespace on such a line is removed with it; the line break
    that ends the line is kept.
    """
    rule_line = re.compile(r"^\s*[_\-=\*]{3,}\s*$", re.MULTILINE)
    return rule_line.sub("", text)
 def collapse_double_dashes(text: str) -> str:
    """Turn every ``--`` (used as an em-dash stand-in) into a single space."""
    return " ".join(text.split("--"))
 def collapse_inline_whitespace(text: str) -> str:
    """Squeeze each run of whitespace other than ``\\n`` into one space; newlines are untouched."""
    non_newline_whitespace = re.compile(r"[^\S\n]+")
    return non_newline_whitespace.sub(" ", text)
 def collapse_blank_lines(text: str) -> str:
    """Reduce any run of three or more newlines to exactly two (one blank line)."""
    newline_run = re.compile(r"\n{3,}")
    return newline_run.sub("\n\n", text)
 def trim_line_edges(text: str) -> str:
    """Drop spaces that sit directly before or directly after a newline."""
    without_trailing = re.sub(r" +\n", "\n", text)
    without_leading = re.sub(r"\n +", "\n", without_trailing)
    return without_leading
 def shorten_section_markers(text: str) -> str:
    """Compact ``Sec. 12.``-style markers (any casing) to ``SEC 12``.

    The section number may carry one trailing letter (``Sec. 3a.``).
    """
    return re.sub(r"(?i)sec\.\s*(\d+[a-zA-Z]?)\.", lambda match: f"SEC {match.group(1)}", text)
 def unwrap_parens(text: str) -> str:
    """Remove the parentheses around purely alphanumeric labels such as ``(a)`` or ``(12)``.

    Any parenthesized run of ASCII letters/digits qualifies, regardless of
    length; groups containing spaces or punctuation are left alone.
    """
    return re.sub(r"\(([a-zA-Z0-9]+)\)", lambda match: match.group(1), text)
 def strip_typeset_quotes(text: str) -> str:
    """Delete the double-backtick and double-apostrophe quote markers found in GPO bill text."""
    for marker in ("``", "''"):
        text = text.replace(marker, "")
    return text
 def normalize_usc_acronym(text: str) -> str:
    """Shorten every ``U.S.C.`` citation marker to ``USC``."""
    return "USC".join(text.split("U.S.C."))
 def normalize_us_acronym(text: str) -> str:
    """Normalize ``U. S.`` / ``U.S. `` spellings (upper or lower case) to ``US ``.

    The spaced form (``U. S.``) is always rewritten; a single space that
    follows it is consumed so the result does not contain a double space.
    The compact form (``U.S.``) is only rewritten when followed by a space,
    so tokens such as ``U.S.-based`` are left alone.

    Args:
        text: Bill text to normalize.

    Returns:
        ``text`` with the matched acronyms replaced by ``US`` plus one space.
    """
    acronym = re.compile(r"(?:U\. S\.|u\. s\.) ?|(?:U\.S\.|u\.s\.) ")
    return acronym.sub("US ", text)
 def collapse_ellipses(text: str) -> str:
    """Replace every run of two or more periods with a single period."""
    period_run = re.compile(r"\.{2,}")
    return period_run.sub(".", text)
 COMPRESSION_STEPS = (
    strip_horizontal_rules,
    collapse_double_dashes,
    collapse_inline_whitespace,
    # Trim line edges BEFORE collapsing blank lines: a whitespace-only line
    # ("\n \n \n") hides the newline run from collapse_blank_lines until its
    # spaces are removed, which previously left 3+ consecutive newlines behind.
    trim_line_edges,
    collapse_blank_lines,
    shorten_section_markers,
    unwrap_parens,
    strip_typeset_quotes,
    normalize_usc_acronym,
    normalize_us_acronym,
    strip_number_commas,
    collapse_ellipses,
    normalize_state_names,
 )
 def compress_bill_text(text: str) -> str:
    """Apply lossless-ish whitespace and boilerplate compression to bill text.

    Runs every transform in :data:`COMPRESSION_STEPS` in order, then strips
    leading/trailing whitespace from the final result.

    Args:
        text: Raw bill text.

    Returns:
        The compressed text.
    """
    for step in COMPRESSION_STEPS:
        text = step(text)
    return text.strip()
@@ -0,0 +1,236 @@
 """Run two interactive OpenAI chat-completion sweeps over bill text.
 Reads the first N bills from a CSV with a `text_content` column and sends two
 sweeps through `/v1/chat/completions` concurrently — one with the raw bill
 text, one with the compressed bill text. Each request's prompt is saved to
 disk alongside the OpenAI response id so the prompts and responses can be
 correlated later.
 """
 from __future__ import annotations
 import csv
 import json
 import logging
 import re
 import sys
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from os import getenv
 from pathlib import Path
 from typing import Annotated
 import httpx
 import typer
 from python.prompt_bench.bill_token_compression import compress_bill_text
 from python.prompt_bench.summarization_prompts import SUMMARIZATION_SYSTEM_PROMPT, SUMMARIZATION_USER_TEMPLATE
 # Module-level logger used by every function in this script.
 logger = logging.getLogger(__name__)
 # Base URL of the OpenAI REST API; requests are posted to "<base>/chat/completions".
 OPENAI_API_BASE = "https://api.openai.com/v1"
 # Default value of the --model CLI option.
 DEFAULT_MODEL = "gpt-5.4-mini"
 # Default value of the --count CLI option (bills loaded per sweep).
 DEFAULT_COUNT = 100
 # Seed sent in every chat-completion payload and recorded in each output file.
 SEED = 42
 def load_bills(csv_path: Path, count: int) -> list[tuple[str, str]]:
    """Read bills from a CSV and return ``(unique_id, text_content)`` pairs.
    Rows whose ``text_content`` is missing or blank after stripping are skipped.
    The id comes from ``bill_id``, then ``id``, then a ``row-N`` fallback where N
    is the number of bills collected so far; a non-empty ``version_code`` is
    appended as ``-<version_code>``. Reading stops once ``count`` bills are held.
    """
    # Bill text can exceed the csv module's default per-field size limit.
    csv.field_size_limit(sys.maxsize)
    selected: list[tuple[str, str]] = []
    with csv_path.open(newline="", encoding="utf-8") as source:
        for record in csv.DictReader(source):
            body = (record.get("text_content") or "").strip()
            if body:
                base_id = record.get("bill_id") or record.get("id") or f"row-{len(selected)}"
                suffix = record.get("version_code") or ""
                selected.append((f"{base_id}-{suffix}" if suffix else base_id, body))
                if len(selected) >= count:
                    break
    return selected
 def build_messages(bill_text: str) -> list[dict]:
    """Build the two-message chat prompt (system, then user) for one bill.
    The user message is the summarization template with ``bill_text`` substituted
    for its ``text_content`` placeholder.
    """
    system_message = {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT}
    user_message = {
        "role": "user",
        "content": SUMMARIZATION_USER_TEMPLATE.format(text_content=bill_text),
    }
    return [system_message, user_message]
 def safe_filename(value: str) -> str:
    """Turn an arbitrary string into a filename-safe token.
    Every run of characters outside ``A-Z a-z 0-9 . _ -`` becomes one underscore,
    leading and trailing underscores are trimmed, and an empty result is
    replaced by ``"unnamed"``.
    """
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", value).strip("_")
    if cleaned:
        return cleaned
    return "unnamed"
 def run_one_request(
    client: httpx.Client,
    *,
    bill_id: str,
    label: str,
    bill_text: str,
    model: str,
    output_path: Path,
 ) -> tuple[bool, float, str | None]:
    """Send one chat-completion request and persist prompt + response.
    A JSON record holding the prompt messages and either the response or an
    ``error`` entry is always written to ``output_path``, on success and on
    failure. Failures are logged and reported through the return value instead
    of being raised.
    Args:
        client: HTTP client used to post the request.
        bill_id: Identifier of the bill, stored in the record and used in logs.
        label: Sweep label, stored in the record and used in logs.
        bill_text: Bill text substituted into the user prompt.
        model: Model id sent in the payload.
        output_path: File that receives the JSON record.
    Returns:
        ``(success, elapsed_seconds, response_id)``; ``response_id`` is ``None``
        on failure.
    """
    messages = build_messages(bill_text)
    payload = {
        "model": model,
        "messages": messages,
        "seed": SEED,
    }
    start = time.monotonic()
    record: dict = {
        "bill_id": bill_id,
        "label": label,
        "model": model,
        "seed": SEED,
        "input_chars": len(bill_text),
        "messages": messages,
    }

    def persist() -> None:
        # ensure_ascii=False leaves non-ASCII characters unescaped, so the file
        # encoding is pinned to UTF-8 instead of depending on the locale default.
        output_path.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")

    try:
        response = client.post(f"{OPENAI_API_BASE}/chat/completions", json=payload)
        response.raise_for_status()
        body = response.json()
    except httpx.HTTPStatusError as error:
        # Non-2xx status: keep the status code and raw body for later inspection.
        elapsed = time.monotonic() - start
        record["error"] = {
            "status_code": error.response.status_code,
            "body": error.response.text,
            "elapsed_seconds": elapsed,
        }
        persist()
        logger.exception("HTTP error for %s/%s after %.2fs", label, bill_id, elapsed)
        return False, elapsed, None
    except Exception as error:
        # Anything else raised while posting or decoding the response.
        elapsed = time.monotonic() - start
        record["error"] = {"message": str(error), "elapsed_seconds": elapsed}
        persist()
        logger.exception("Failed: %s/%s after %.2fs", label, bill_id, elapsed)
        return False, elapsed, None
    elapsed = time.monotonic() - start
    response_id = body.get("id")
    record["response_id"] = response_id
    record["elapsed_seconds"] = elapsed
    record["usage"] = body.get("usage")
    record["response"] = body
    persist()
    logger.info("Done: %s/%s id=%s in %.2fs", label, bill_id, response_id, elapsed)
    return True, elapsed, response_id
 def main(
    csv_path: Annotated[Path, typer.Option("--csv", help="Bills CSV path")] = Path("bills.csv"),
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to write per-request JSON")] = Path(
        "output/openai_runs",
    ),
    model: Annotated[str, typer.Option(help="OpenAI model id")] = DEFAULT_MODEL,
    count: Annotated[int, typer.Option(help="Number of bills per set")] = DEFAULT_COUNT,
    concurrency: Annotated[int, typer.Option(help="Concurrent in-flight requests")] = 16,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Run two interactive OpenAI sweeps (compressed + uncompressed) over bill text."""
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    # CLOSEDAI_TOKEN takes precedence; OPENAI_API_KEY is the fallback.
    api_key = getenv("CLOSEDAI_TOKEN") or getenv("OPENAI_API_KEY")
    if not api_key:
        raise typer.BadParameter("Neither CLOSEDAI_TOKEN nor OPENAI_API_KEY is set")
    if not csv_path.is_file():
        raise typer.BadParameter(f"CSV not found: {csv_path}")
    # One output sub-directory per sweep label.
    sweep_dirs = {sweep: output_dir / sweep for sweep in ("compressed", "uncompressed")}
    for sweep_dir in sweep_dirs.values():
        sweep_dir.mkdir(parents=True, exist_ok=True)
    logger.info("Loading %d bills from %s", count, csv_path)
    bills = load_bills(csv_path, count)
    if len(bills) < count:
        logger.warning("Only %d bills available (requested %d)", len(bills), count)
    # Every bill yields two tasks that share a filename: compressed first, then raw.
    tasks: list[tuple[str, str, str, Path]] = []
    for bill_id, text_content in bills:
        filename = f"{safe_filename(bill_id)}.json"
        sweep_inputs = {"compressed": compress_bill_text(text_content), "uncompressed": text_content}
        for sweep, sweep_text in sweep_inputs.items():
            tasks.append((bill_id, sweep, sweep_text, sweep_dirs[sweep] / filename))
    logger.info("Submitting %d requests at concurrency=%d", len(tasks), concurrency)
    auth_headers = {"Authorization": f"Bearer {api_key}"}
    index: list[dict] = []
    wall_start = time.monotonic()
    with (
        httpx.Client(headers=auth_headers, timeout=httpx.Timeout(300.0)) as client,
        ThreadPoolExecutor(max_workers=concurrency) as executor,
    ):
        pending = {}
        for bill_id, label, bill_text, output_path in tasks:
            submitted = executor.submit(
                run_one_request,
                client,
                bill_id=bill_id,
                label=label,
                bill_text=bill_text,
                model=model,
                output_path=output_path,
            )
            pending[submitted] = (bill_id, label, output_path)
        # Results are indexed in completion order, not submission order.
        for future in as_completed(pending):
            bill_id, label, output_path = pending[future]
            success, elapsed, response_id = future.result()
            index.append(
                {
                    "bill_id": bill_id,
                    "label": label,
                    "response_id": response_id,
                    "elapsed_seconds": elapsed,
                    "success": success,
                    "path": str(output_path),
                },
            )
    wall_elapsed = time.monotonic() - wall_start
    completed = sum(1 for entry in index if entry["success"])
    failed = len(index) - completed
    summary = {
        "model": model,
        "count": len(bills),
        "completed": completed,
        "failed": failed,
        "wall_seconds": wall_elapsed,
        "concurrency": concurrency,
        "results": index,
    }
    summary_path = output_dir / "summary.json"
    summary_path.write_text(json.dumps(summary, indent=2))
    logger.info(
        "Done: completed=%d failed=%d wall=%.1fs summary=%s",
        completed,
        failed,
        wall_elapsed,
        summary_path,
    )
 def cli() -> None:
    """Typer entry point."""
    # typer.run builds the command-line interface from main's annotated parameters.
    typer.run(main)
 # Allow running this module directly as a script.
 if __name__ == "__main__":
    cli()
@@ -0,0 +1 @@
 """Prompt benchmarking system for evaluating LLMs via vLLM."""
@@ -0,0 +1,165 @@
 """Docker container lifecycle management for Unsloth fine-tuning."""
 from __future__ import annotations
 import logging
 import subprocess
 from pathlib import Path
 from typing import Annotated
 import typer
 from python.prompt_bench.containers.lib import check_gpu_free
 logger = logging.getLogger(__name__)
 # Name given to the container by `docker run --name`; also the target of stop/rm/logs.
 CONTAINER_NAME = "bill-finetune"
 # Image tag produced by build_image() and executed by start_finetune().
 FINETUNE_IMAGE = "bill-finetune:latest"
 # Absolute host path of the Dockerfile passed to `docker build -f`.
 DOCKERFILE_PATH = "/home/richie/dotfiles/python/prompt_bench/Dockerfile.finetune"
 # Default host directory bind-mounted at /root/.cache/huggingface in the container.
 DEFAULT_HF_CACHE = Path("/zfs/models/hf")
 def build_image() -> None:
    """Build the fine-tuning image with ``docker build``.
    The Dockerfile at ``DOCKERFILE_PATH`` is built with the current working
    directory as build context and tagged ``FINETUNE_IMAGE``.
    Raises:
        RuntimeError: If ``docker build`` exits with a non-zero status.
    """
    logger.info("Building fine-tuning image: %s", FINETUNE_IMAGE)
    build_command = ["docker", "build", "-f", DOCKERFILE_PATH, "-t", FINETUNE_IMAGE, "."]
    # Output is not captured, so docker's build log streams to the caller's terminal.
    outcome = subprocess.run(build_command, text=True, check=False)
    if outcome.returncode != 0:
        raise RuntimeError("Failed to build fine-tuning image")
    logger.info("Image built: %s", FINETUNE_IMAGE)
 def start_finetune(
    *,
    dataset_path: Path,
    output_dir: Path,
    hf_cache: Path = DEFAULT_HF_CACHE,
 ) -> None:
    """Run the fine-tuning container and wait for it to exit.
    Any existing container named ``CONTAINER_NAME`` is stopped and removed first.
    Args:
        dataset_path: Host path to the fine-tuning JSONL dataset.
        output_dir: Host path where the trained model will be saved.
        hf_cache: Host path to HuggingFace model cache (bind-mounted to avoid re-downloading).
    Raises:
        FileNotFoundError: If ``dataset_path`` is not an existing file.
        RuntimeError: If the container exits with a non-zero status.
    """
    # Resolve to absolute paths before they are used as bind-mount sources below.
    dataset_path = dataset_path.resolve()
    output_dir = output_dir.resolve()
    if not dataset_path.is_file():
        message = f"Dataset not found: {dataset_path}"
        raise FileNotFoundError(message)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Clear any earlier container that still holds CONTAINER_NAME.
    stop_finetune()
    hf_cache = hf_cache.resolve()
    hf_cache.mkdir(parents=True, exist_ok=True)
    # Everything after FINETUNE_IMAGE is passed to the container's entrypoint.
    command = [
        "docker",
        "run",
        "--name",
        CONTAINER_NAME,
        "--device=nvidia.com/gpu=all",
        "--ipc=host",
        "-v",
        f"{hf_cache}:/root/.cache/huggingface",
        "-v",
        f"{output_dir}:/workspace/output/qwen-bill-summarizer",
        "-v",
        f"{dataset_path}:/workspace/dataset.jsonl:ro",
        FINETUNE_IMAGE,
        "--dataset",
        "/workspace/dataset.jsonl",
        "--output-dir",
        "/workspace/output/qwen-bill-summarizer",
    ]
    logger.info("Starting fine-tuning container")
    logger.info("  Dataset:    %s", dataset_path)
    logger.info("  Output:     %s", output_dir)
    # No "-d" flag and no output capture: this call blocks until the container exits.
    result = subprocess.run(command, text=True, check=False)
    if result.returncode != 0:
        message = f"Fine-tuning container exited with code {result.returncode}"
        raise RuntimeError(message)
    logger.info("Fine-tuning complete. Model saved to %s", output_dir)
 def stop_finetune() -> None:
    """Stop the fine-tuning container, then force-remove it.
    Both docker commands are best-effort: their output is captured and their
    exit status is ignored.
    """
    logger.info("Stopping fine-tuning container")
    teardown_commands = (
        ["docker", "stop", CONTAINER_NAME],
        ["docker", "rm", "-f", CONTAINER_NAME],
    )
    for docker_args in teardown_commands:
        subprocess.run(docker_args, capture_output=True, check=False)
 def logs_finetune() -> str | None:
    """Fetch the last 50 log lines of the fine-tuning container.
    Returns:
        The container's stdout followed by its stderr, or ``None`` when
        ``docker logs`` exits with a non-zero status.
    """
    logs_command = ["docker", "logs", "--tail", "50", CONTAINER_NAME]
    outcome = subprocess.run(logs_command, capture_output=True, text=True, check=False)
    return outcome.stdout + outcome.stderr if outcome.returncode == 0 else None
 # Typer application; the functions decorated with @app.command() below become its sub-commands.
 app = typer.Typer(help="Fine-tuning container management.")
@app.command()
 def build() -> None:
    """Build the fine-tuning Docker image."""
    # Thin CLI wrapper; build_image() raises RuntimeError when the build fails.
    build_image()
@app.command()
 def run(
    dataset: Annotated[Path, typer.Option(help="Fine-tuning JSONL")] = Path(
        "/home/richie/dotfiles/data/finetune_dataset.jsonl"
    ),
    output_dir: Annotated[Path, typer.Option(help="Where to save the trained model")] = Path(
        "/home/richie/dotfiles/data/output/qwen-bill-summarizer",
    ),
    hf_cache: Annotated[Path, typer.Option(help="Host path to HuggingFace model cache")] = DEFAULT_HF_CACHE,
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Run fine-tuning inside a Docker container."""
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    # Advisory check from containers.lib; the run proceeds regardless of its outcome.
    check_gpu_free()
    # Blocks until the container exits; errors from start_finetune propagate to the CLI.
    start_finetune(
        dataset_path=dataset,
        output_dir=output_dir,
        hf_cache=hf_cache,
    )
@app.command()
 def stop() -> None:
    """Stop and remove the fine-tuning container."""
    # Thin CLI wrapper; stop_finetune() ignores docker's exit status.
    stop_finetune()
@app.command()
 def logs() -> None:
    """Show recent logs from the fine-tuning container."""
    output = logs_finetune()
    if output is None:
        typer.echo("No running fine-tuning container found.")
        raise typer.Exit(code=1)
    typer.echo(output)
 def cli() -> None:
    """Typer entry point."""
    # Dispatches to the build/run/stop/logs sub-commands registered on `app`.
    app()
 # Allow running this module directly as a script.
 if __name__ == "__main__":
    cli()
@@ -0,0 +1,23 @@
 from __future__ import annotations
 import logging
 import subprocess
 logger = logging.getLogger(__name__)
 def check_gpu_free() -> None:
    """Warn if GPU-heavy processes (e.g. Ollama) are running.
    The check is advisory only: it logs warnings and never raises because of the
    GPU state. If ``nvidia-smi`` cannot be launched or exits non-zero, a warning
    is logged and the function returns.
    """
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-compute-apps=pid,process_name", "--format=csv,noheader"],
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError as error:
        # subprocess.run raises FileNotFoundError (an OSError) when nvidia-smi is not
        # installed or not on PATH; treat that like any other failed query.
        logger.warning("Could not query GPU processes: %s", error)
        return
    if result.returncode != 0:
        logger.warning("Could not query GPU processes: %s", result.stderr.strip())
        return
    # With --format=csv,noheader, any remaining output is one line per compute process.
    processes = result.stdout.strip()
    if processes:
        logger.warning("GPU processes detected:\n%s", processes)
        logger.warning("Consider stopping Ollama (sudo systemctl stop ollama) before benchmarking")
@@ -0,0 +1,70 @@
 """Docker container lifecycle management for vLLM."""
 from __future__ import annotations
 import logging
 import subprocess
 logger = logging.getLogger(__name__)
 # Name given to the container by `docker run --name`; also the target of stop/rm.
 CONTAINER_NAME = "vllm-bench"
 # Pinned vLLM server image that start_vllm() runs.
 VLLM_IMAGE = "vllm/vllm-openai:v0.19.0"
 def start_vllm(
    *,
    model: str,
    port: int,
    model_dir: str,
    gpu_memory_utilization: float,
 ) -> None:
    """Launch a detached vLLM container that serves ``model``.
    Any existing container named ``CONTAINER_NAME`` is stopped and removed
    before the new one is started.
    Args:
        model: HuggingFace model directory name (relative to model_dir).
        port: Host port published to the container's port 8000.
        model_dir: Host path containing HuggingFace model directories, mounted at /models.
        gpu_memory_utilization: Fraction of GPU memory to use (0-1).
    Raises:
        RuntimeError: If ``docker run`` exits with a non-zero status.
    """
    docker_options = [
        "-d",
        "--name",
        CONTAINER_NAME,
        "--device=nvidia.com/gpu=all",
        "--ipc=host",
        "-v",
        f"{model_dir}:/models",
        "-p",
        f"{port}:8000",
    ]
    # Arguments after the image name are passed to the vLLM server itself.
    server_arguments = [
        "--model",
        f"/models/{model}",
        "--served-model-name",
        model,
        "--gpu-memory-utilization",
        str(gpu_memory_utilization),
        "--max-model-len",
        "4096",
    ]
    command = ["docker", "run", *docker_options, VLLM_IMAGE, *server_arguments]
    logger.info("Starting vLLM container with model: %s", model)
    stop_vllm()
    launch = subprocess.run(command, capture_output=True, text=True, check=False)
    if launch.returncode != 0:
        raise RuntimeError(f"Failed to start vLLM container: {launch.stderr.strip()}")
    # In detached mode docker prints the container id; log its first 12 characters.
    logger.info("vLLM container started: %s", launch.stdout.strip()[:12])
 def stop_vllm() -> None:
    """Tear down the vLLM benchmark container.
    Runs ``docker stop``, ``docker rm -f`` and a forced disconnect from the
    ``bridge`` network, in that order. Each command is best-effort: output is
    captured and the exit status is ignored.
    """
    logger.info("Stopping vLLM container")
    teardown_commands = (
        ["docker", "stop", CONTAINER_NAME],
        ["docker", "rm", "-f", CONTAINER_NAME],
        ["docker", "network", "disconnect", "-f", "bridge", CONTAINER_NAME],
    )
    for docker_args in teardown_commands:
        subprocess.run(docker_args, capture_output=True, check=False)
    logger.info("vLLM container stopped and removed")
@@ -0,0 +1,75 @@
 """HuggingFace model downloader."""
 from __future__ import annotations
 import logging
 from pathlib import Path
 from typing import Annotated
 import typer
 from huggingface_hub import snapshot_download
 from python.prompt_bench.models import BenchmarkConfig
 logger = logging.getLogger(__name__)
 def local_model_path(repo: str, model_dir: str) -> Path:
    """Map a HuggingFace repo id to its directory underneath ``model_dir``."""
    root = Path(model_dir)
    return root.joinpath(repo)
 def is_model_present(repo: str, model_dir: str) -> bool:
    """Check if a model has already been downloaded.
    Returns:
        ``True`` only when the model's local path is a directory holding at
        least one entry. A missing path, a regular file at that path, or an
        empty directory all yield ``False``.
    """
    path = local_model_path(repo, model_dir)
    # is_dir() rather than exists(): iterdir() raises NotADirectoryError on a regular file.
    return path.is_dir() and any(path.iterdir())
 def download_model(repo: str, model_dir: str) -> Path:
    """Ensure a HuggingFace model is available locally and return its directory.
    The snapshot is downloaded only when the local directory is missing or
    empty; an existing non-empty directory is reused without contacting the hub.
    """
    target = local_model_path(repo, model_dir)
    if not is_model_present(repo, model_dir):
        logger.info("Downloading model: %s -> %s", repo, target)
        snapshot_download(repo_id=repo, local_dir=str(target))
        logger.info("Download complete: %s", repo)
    else:
        logger.info("Model already exists: %s", target)
    return target
 def download_all(config: BenchmarkConfig) -> None:
    """Download every model listed in the config, top to bottom."""
    # Sequential on purpose or not, models are fetched one at a time in list order;
    # download_model() skips any that are already present.
    for repo in config.models:
        download_model(repo, config.model_dir)
 def main(
    config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Download all models listed in the benchmark config."""
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    # Reject a missing config up front so the user gets a CLI-style error.
    if not config.is_file():
        raise typer.BadParameter(f"Config file does not exist: {config}")
    download_all(BenchmarkConfig.from_toml(config))
 def cli() -> None:
    """Typer entry point."""
    # typer.run builds the command-line interface from main's annotated parameters.
    typer.run(main)
 # Allow running this module directly as a script.
 if __name__ == "__main__":
    cli()
@@ -0,0 +1,214 @@
 """Fine-tune Qwen 3.5 4B on bill summarization data using Unsloth.
 Loads a ChatML-style JSONL dataset (system/user/assistant messages),
 applies QLoRA with 4-bit quantization, and saves the merged model
 in HuggingFace format. Designed for a single RTX 3090 (24GB).
 Usage:
    python -m python.prompt_bench.finetune \
        --dataset output/finetune_dataset.jsonl \
        --output-dir output/qwen-bill-summarizer
 """
 from __future__ import annotations
 import json
 import logging
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Annotated
 import tomllib
 import typer
 from unsloth import FastLanguageModel
 from datasets import Dataset
 from transformers import TrainingArguments
 from trl import SFTTrainer
 logger = logging.getLogger(__name__)
@dataclass
 class LoraConfig:
    """LoRA adapter hyperparameters.
    Populated from the ``lora`` table of the finetune TOML config and forwarded
    to ``FastLanguageModel.get_peft_model`` by ``main``.
    """
    # Passed as `r`.
    rank: int
    # Passed as `lora_alpha`.
    alpha: int
    # Passed as `lora_dropout`.
    dropout: float
    # Passed as `target_modules`.
    targets: list[str]
@dataclass
 class TrainingConfig:
    """Training loop hyperparameters.
    Populated from the ``training`` table of the finetune TOML config and
    forwarded to ``TrainingArguments`` / ``SFTTrainer`` by ``main``.
    """
    learning_rate: float
    # Passed as `num_train_epochs`.
    epochs: int
    # Passed as `per_device_train_batch_size`.
    batch_size: int
    # Passed as `gradient_accumulation_steps`.
    gradient_accumulation: int
    # Used both when loading the base model and when building the SFTTrainer.
    max_seq_length: int
    warmup_ratio: float
    weight_decay: float
    logging_steps: int
    # Used for both `save_steps` and `eval_steps`.
    save_steps: int
@dataclass
 class FinetuneConfig:
    """Top-level finetune configuration."""
    base_model: str
    lora: LoraConfig
    training: TrainingConfig
    @classmethod
    def from_toml(cls, config_path: Path) -> FinetuneConfig:
        """Load finetune config from a TOML file."""
        raw = tomllib.loads(config_path.read_text())["finetune"]
        return cls(
            base_model=raw["base_model"],
            lora=LoraConfig(**raw["lora"]),
            training=TrainingConfig(**raw["training"]),
        )
 def _messages_to_chatml(messages: list[dict]) -> str:
    r"""Convert a message list to Qwen ChatML format.
    Produces:
        <|im_start|>system\n...\n<|im_end|>
        <|im_start|>user\n...\n<|im_end|>
        <|im_start|>assistant\n...\n<|im_end|>
    """
    parts = []
    for message in messages:
        role = message["role"]
        content = message["content"]
        parts.append(f"<|im_start|>{role}\n{content}<|im_end|>")
    return "\n".join(parts)
 def load_dataset_from_jsonl(path: Path) -> Dataset:
    """Build a HuggingFace Dataset from a chat-format JSONL file.
    Every non-blank line must be a JSON object with a ``messages`` list of
    ``{"role": ..., "content": ...}`` entries. Each line is rendered to ChatML
    and stored under a single ``text`` column; blank lines are skipped.
    """
    with path.open(encoding="utf-8") as source:
        payloads = [line.strip() for line in source]
    records = [
        {"text": _messages_to_chatml(json.loads(payload)["messages"])}
        for payload in payloads
        if payload
    ]
    logger.info("Loaded %d examples from %s", len(records), path)
    return Dataset.from_list(records)
 def main(
    dataset_path: Annotated[Path, typer.Option("--dataset", help="Fine-tuning JSONL")] = Path(
        "output/finetune_dataset.jsonl",
    ),
    validation_split: Annotated[float, typer.Option("--val-split", help="Fraction held out for validation")] = 0.1,
    output_dir: Annotated[Path, typer.Option("--output-dir", help="Where to save the merged model")] = Path(
        "output/qwen-bill-summarizer",
    ),
    config_path: Annotated[
        Path,
        typer.Option("--config", help="TOML config file"),
    ] = Path(__file__).parent / "config.toml",
    save_gguf: Annotated[bool, typer.Option("--save-gguf/--no-save-gguf", help="Also save GGUF")] = False,
 ) -> None:
    """Fine-tune Qwen 3.5 4B on bill summarization with Unsloth + QLoRA."""
    # Pipeline: load config -> load 4-bit base model -> attach LoRA adapters ->
    # split dataset -> train with periodic eval/checkpoints -> save merged model
    # (and optionally a GGUF export).
    logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    if not dataset_path.is_file():
        message = f"Dataset not found: {dataset_path}"
        raise typer.BadParameter(message)
    config = FinetuneConfig.from_toml(config_path)
    logger.info("Loading base model: %s", config.base_model)
    # load_in_4bit=True matches the 4-bit QLoRA setup described in the module docstring.
    # NOTE(review): dtype=None presumably lets Unsloth choose the compute dtype — confirm.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=config.base_model,
        max_seq_length=config.training.max_seq_length,
        load_in_4bit=True,
        dtype=None,
    )
    logger.info("Applying LoRA (rank=%d, alpha=%d)", config.lora.rank, config.lora.alpha)
    model = FastLanguageModel.get_peft_model(
        model,
        r=config.lora.rank,
        lora_alpha=config.lora.alpha,
        lora_dropout=config.lora.dropout,
        target_modules=config.lora.targets,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )
    full_dataset = load_dataset_from_jsonl(dataset_path)
    # Fixed seed so the train/validation split is reproducible across runs.
    split = full_dataset.train_test_split(test_size=validation_split, seed=42)
    train_dataset = split["train"]
    validation_dataset = split["test"]
    logger.info("Split: %d train, %d validation", len(train_dataset), len(validation_dataset))
    # Checkpoints go under <output_dir>/checkpoints; evaluation runs on the same
    # step interval as checkpoint saving (eval_steps == save_steps).
    training_args = TrainingArguments(
        output_dir=str(output_dir / "checkpoints"),
        num_train_epochs=config.training.epochs,
        per_device_train_batch_size=config.training.batch_size,
        gradient_accumulation_steps=config.training.gradient_accumulation,
        learning_rate=config.training.learning_rate,
        warmup_ratio=config.training.warmup_ratio,
        weight_decay=config.training.weight_decay,
        lr_scheduler_type="cosine",
        logging_steps=config.training.logging_steps,
        save_steps=config.training.save_steps,
        save_total_limit=3,
        eval_strategy="steps",
        eval_steps=config.training.save_steps,
        load_best_model_at_end=True,
        bf16=True,
        optim="adamw_8bit",
        seed=42,
        report_to="none",
    )
    # The datasets expose a single pre-formatted `text` column (see load_dataset_from_jsonl).
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        args=training_args,
        max_seq_length=config.training.max_seq_length,
        packing=True,
    )
    logger.info(
        "Starting training: %d train, %d val, %d epochs",
        len(train_dataset),
        len(validation_dataset),
        config.training.epochs,
    )
    trainer.train()
    # The merged model is written to <output_dir>/merged with save_method="merged_16bit".
    merged_path = str(output_dir / "merged")
    logger.info("Saving merged model to %s", merged_path)
    model.save_pretrained_merged(merged_path, tokenizer, save_method="merged_16bit")
    if save_gguf:
        # Optional GGUF export to <output_dir>/gguf using the q4_k_m quantization method.
        gguf_path = str(output_dir / "gguf")
        logger.info("Saving GGUF to %s", gguf_path)
        model.save_pretrained_gguf(gguf_path, tokenizer, quantization_method="q4_k_m")
    logger.info("Done! Model saved to %s", output_dir)
 def cli() -> None:
    """Typer entry point."""
    # typer.run builds the command-line interface from main's annotated parameters.
    typer.run(main)
 # Allow running this module directly as a script.
 if __name__ == "__main__":
    cli()
@@ -0,0 +1,215 @@
 """CLI entry point for the prompt benchmarking system."""
 from __future__ import annotations
 import json
 import logging
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Annotated
 import typer
 from python.prompt_bench.containers.lib import check_gpu_free
 from python.prompt_bench.containers.vllm import start_vllm, stop_vllm
 from python.prompt_bench.downloader import is_model_present
 from python.prompt_bench.models import BenchmarkConfig
 from python.prompt_bench.vllm_client import VLLMClient
 logger = logging.getLogger(__name__)
 def discover_prompts(input_dir: Path) -> list[Path]:
    """Collect the ``*.txt`` files located directly inside ``input_dir``.
    Raises:
        FileNotFoundError: If the directory contains no ``.txt`` files.
    """
    matches = [*input_dir.glob("*.txt")]
    if matches:
        return matches
    message = f"No .txt files found in {input_dir}"
    raise FileNotFoundError(message)
 def _run_prompt(
    client: VLLMClient,
    prompt_path: Path,
    *,
    repo: str,
    model_dir_name: str,
    model_output: Path,
    temperature: float,
 ) -> tuple[bool, float]:
    """Complete one prompt file and store the response next to the other outputs.
    The response is written to ``model_output/<prompt filename>``. On any
    exception the failure is logged, a ``<prompt filename>.error`` marker file
    is written instead, and ``False`` is returned rather than raising.
    Returns:
        ``(success, elapsed_seconds)``.
    """
    name = prompt_path.name
    began = time.monotonic()
    try:
        completion = client.complete(prompt_path.read_text(), model_dir_name, temperature=temperature)
        (model_output / name).write_text(completion)
        elapsed = time.monotonic() - began
        logger.info("Completed: %s / %s in %.2fs", repo, name, elapsed)
    except Exception:
        elapsed = time.monotonic() - began
        marker = model_output / f"{name}.error"
        logger.exception("Failed: %s / %s after %.2fs", repo, name, elapsed)
        marker.write_text(f"Error processing {name}")
        return False, elapsed
    else:
        return True, elapsed
 def benchmark_model(
    client: VLLMClient,
    prompts: list[Path],
    *,
    repo: str,
    model_dir_name: str,
    model_output: Path,
    temperature: float,
    concurrency: int,
 ) -> tuple[int, int]:
    """Send every not-yet-answered prompt to one model using a thread pool.
    Prompts that already have an output file in ``model_output`` are skipped.
    Requests are submitted concurrently because vLLM batches them internally,
    which is much faster than issuing them one by one. Timing statistics for
    the attempted prompts are written to ``model_output/_timing.json``.
    Returns:
        ``(completed, failed)`` counts; ``(0, 0)`` when nothing was pending.
    """
    todo = [path for path in prompts if not (model_output / path.name).exists()]
    already_done = len(prompts) - len(todo)
    if already_done:
        logger.info("Skipping %d prompts with existing output for %s", already_done, repo)
    if not todo:
        logger.info("Nothing to do for %s", repo)
        return 0, 0
    # (success, elapsed_seconds) per prompt, in completion order.
    outcomes: list[tuple[bool, float]] = []
    began = time.monotonic()
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        in_flight = [
            pool.submit(
                _run_prompt,
                client,
                path,
                repo=repo,
                model_dir_name=model_dir_name,
                model_output=model_output,
                temperature=temperature,
            )
            for path in todo
        ]
        for finished in as_completed(in_flight):
            outcomes.append(finished.result())
    wall_elapsed = time.monotonic() - began
    attempted = len(outcomes)
    completed = sum(1 for succeeded, _ in outcomes if succeeded)
    failed = attempted - completed
    # attempted is never zero here: the empty case returned early above.
    avg_latency = sum(seconds for _, seconds in outcomes) / attempted
    throughput = attempted / wall_elapsed if wall_elapsed > 0 else 0.0
    timing = {
        "repo": repo,
        "wall_seconds": wall_elapsed,
        "attempted": attempted,
        "completed": completed,
        "failed": failed,
        "avg_latency_seconds": avg_latency,
        "throughput_prompts_per_second": throughput,
        "concurrency": concurrency,
    }
    (model_output / "_timing.json").write_text(json.dumps(timing, indent=2))
    return completed, failed
 def run_benchmark(
    config: BenchmarkConfig,
    input_dir: Path,
    output_dir: Path,
 ) -> None:
    """Execute the benchmark across all models and prompts.
    Models are processed one at a time in config order. For each model a vLLM
    container is started, all prompts are run against it, and the container is
    stopped again. Models that are not downloaded, or whose container fails to
    start, are skipped with a log message.
    Args:
        config: Benchmark settings (models, port, timeouts, concurrency, ...).
        input_dir: Directory holding the ``.txt`` prompt files.
        output_dir: Root directory; each model writes to ``output_dir / repo``.
    Raises:
        FileNotFoundError: If ``input_dir`` contains no ``.txt`` prompts.
    """
    prompts = discover_prompts(input_dir)
    logger.info("Found %d prompts in %s", len(prompts), input_dir)
    check_gpu_free()
    total_completed = 0
    total_failed = 0
    for repo in config.models:
        if not is_model_present(repo, config.model_dir):
            logger.warning("Skipping (not downloaded): %s", repo)
            continue
        model_output = output_dir / repo
        model_output.mkdir(parents=True, exist_ok=True)
        logger.info("=== Benchmarking model: %s ===", repo)
        # Make sure no container from a previous model or run is still around.
        stop_vllm()
        try:
            start_vllm(
                model=repo,
                port=config.port,
                model_dir=config.model_dir,
                gpu_memory_utilization=config.gpu_memory_utilization,
            )
        except RuntimeError:
            logger.exception("Failed to start vLLM for %s, skipping", repo)
            continue
        logger.info("vLLM started for %s", repo)
        # The finally clause stops the container even when wait_ready or the
        # benchmark raises; such an exception still propagates to the caller.
        try:
            with VLLMClient(port=config.port, timeout=config.timeout) as client:
                client.wait_ready(max_wait=config.vllm_startup_timeout)
                completed, failed = benchmark_model(
                    client,
                    prompts,
                    repo=repo,
                    model_dir_name=repo,
                    model_output=model_output,
                    temperature=config.temperature,
                    concurrency=config.concurrency,
                )
                total_completed += completed
                total_failed += failed
        finally:
            stop_vllm()
    logger.info("=== Benchmark complete ===")
    logger.info("Completed: %d | Failed: %d", total_completed, total_failed)
 def main(
    input_dir: Annotated[Path, typer.Argument(help="Directory containing input .txt prompt files")],
    config: Annotated[Path, typer.Option(help="Path to TOML config file")] = Path("bench.toml"),
    output_dir: Annotated[Path, typer.Option(help="Output directory for results")] = Path("output"),
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Run prompts through multiple LLMs via vLLM and save results."""
    logging.basicConfig(level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    # Validate both inputs before creating any output on disk.
    if not input_dir.is_dir():
        raise typer.BadParameter(f"Input directory does not exist: {input_dir}")
    if not config.is_file():
        raise typer.BadParameter(f"Config file does not exist: {config}")
    settings = BenchmarkConfig.from_toml(config)
    output_dir.mkdir(parents=True, exist_ok=True)
    run_benchmark(settings, input_dir, output_dir)
 def cli() -> None:
    """Typer entry point."""
    # typer.run builds the command-line interface from main's annotated parameters.
    typer.run(main)
 # Allow running this module directly as a script.
 if __name__ == "__main__":
    cli()
@@ -0,0 +1,30 @@
 """Pydantic models for benchmark configuration."""
 from __future__ import annotations
 import tomllib
 from typing import TYPE_CHECKING
 from pydantic import BaseModel
 if TYPE_CHECKING:
    from pathlib import Path
 class BenchmarkConfig(BaseModel):
    """Top-level benchmark configuration loaded from the ``bench`` table of a TOML file.
    Only ``models`` is required; every other field has a default.
    """
    # Model identifiers to benchmark (presumably Hugging Face repo ids; confirm against the runner).
    models: list[str]
    # NOTE(review): looks like the local directory holding downloaded models; its use is not visible here.
    model_dir: str = "/zfs/models/hf"
    # Port handed to the vLLM client.
    port: int = 8000
    # NOTE(review): presumably forwarded to vLLM's GPU memory setting; confirm where vLLM is launched.
    gpu_memory_utilization: float = 0.90
    # Sampling temperature passed to the benchmark run.
    temperature: float = 0.0
    # Per-request timeout in seconds for the vLLM client.
    timeout: int = 300
    # Concurrency value passed to the benchmark run.
    concurrency: int = 4
    # Maximum number of seconds to wait for the vLLM server to become ready.
    vllm_startup_timeout: int = 900
    @classmethod
    def from_toml(cls, config_path: Path) -> BenchmarkConfig:
        """Load benchmark config from a TOML file.
        Args:
            config_path: Path to a TOML file containing a ``bench`` table.
        Returns:
            The validated configuration built from that table.
        Raises:
            KeyError: If the document has no ``bench`` table.
            tomllib.TOMLDecodeError: If the file is not valid TOML.
        """
        # TOML is UTF-8 by specification. tomllib.load() takes a binary handle and
        # decodes it itself, so parsing no longer depends on the platform's locale
        # encoding the way Path.read_text() without an encoding does.
        with config_path.open("rb") as handle:
            document = tomllib.load(handle)
        return cls(**document["bench"])
@@ -0,0 +1,34 @@
 # System prompt for bill summarization: tells the model what to ignore, what to
 # extract, and the plain-text section layout (operative actions, affected
 # populations, mechanisms, policy threads, symbolic-only flag) to answer in.
 SUMMARIZATION_SYSTEM_PROMPT = """You are a legislative analyst extracting policy substance from Congressional bill text.
 Your job is to compress a bill into a dense, neutral structured summary that captures every distinct policy action — including secondary effects that might be buried in subsections.
 EXTRACTION RULES:
 - IGNORE: whereas clauses, congressional findings that are purely political statements, recitals, preambles, citations of existing law by number alone, and procedural boilerplate.
 - FOCUS ON: operative verbs — what the bill SHALL do, PROHIBIT, REQUIRE, AUTHORIZE, AMEND, APPROPRIATE, or ESTABLISH.
 - SURFACE ALL THREADS: If the bill touches multiple policy areas, list each thread separately. Do not collapse them.
 - BE CONCRETE: Name the affected population, the mechanism, and the direction (expands/restricts/maintains).
 - STAY NEUTRAL: No political framing. Describe what the text does, not what its sponsors claim it does.
 OUTPUT FORMAT — plain structured text, not JSON:
 OPERATIVE ACTIONS:
 [Numbered list of what the bill actually does, one action per line, max 20 words each]
 AFFECTED POPULATIONS:
 [Who gains something, who loses something, or whose behavior is regulated]
 MECHANISMS:
 [How it works: new funding, mandate, prohibition, amendment to existing statute, grant program, study commission, etc.]
 POLICY THREADS:
 [List each distinct policy domain this bill touches, even minor ones. Use plain language, not domain codes.]
 SYMBOLIC/PROCEDURAL ONLY:
 [Yes or No — is this bill primarily a resolution, designation, or awareness declaration with no operative effect?]
 LENGTH TARGET: 150-250 words total. Be ruthless about cutting. Density over completeness."""
 # User-message template paired with the system prompt; ``{text_content}`` is the
 # placeholder for the bill text (presumably filled with str.format by the caller; confirm).
 SUMMARIZATION_USER_TEMPLATE = """Summarize the following Congressional bill according to your instructions.
 BILL TEXT:
 {text_content}"""
@@ -0,0 +1,114 @@
 """Build a fine-tuning JSONL dataset from batch request + output files.
 Joins the original request JSONL (system + user messages) with the batch
 output JSONL (assistant completions) by custom_id to produce a ChatML-style
 messages JSONL suitable for fine-tuning.
 """
 from __future__ import annotations
 import json
 import logging
 from pathlib import Path
 from typing import Annotated
 import typer
 logger = logging.getLogger(__name__)
 HTTP_OK = 200
 def load_requests(path: Path) -> dict[str, list[dict]]:
    """Index the chat messages of every batch request by its ``custom_id``.
    Args:
        path: Request JSONL file; lines that are empty after stripping are ignored.
    Returns:
        Mapping of ``custom_id`` to that record's ``body["messages"]`` list.
        When an id occurs more than once, the last occurrence wins.
    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``custom_id`` or ``body``/``messages``.
    """
    messages_by_id: dict[str, list[dict]] = {}
    with path.open(encoding="utf-8") as source:
        payloads = (line.strip() for line in source)
        # filter(None, ...) drops the blank lines.
        for payload in filter(None, payloads):
            entry = json.loads(payload)
            request_id = entry["custom_id"]
            messages_by_id[request_id] = entry["body"]["messages"]
    return messages_by_id
 def load_completions(path: Path) -> dict[str, str]:
    """Parse batch output JSONL into {custom_id: assistant_content}.
    A record is skipped with a warning when its response status is not 200,
    when it has no choices, or when the first choice has no message content.
    Args:
        path: Batch output JSONL file; blank lines are ignored.
    Returns:
        Mapping of ``custom_id`` to the content of the first choice's message.
    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``custom_id``.
    """
    results: dict[str, str] = {}
    with path.open(encoding="utf-8") as handle:
        for line_number, raw_line in enumerate(handle, 1):
            stripped = raw_line.strip()
            if not stripped:
                continue
            record = json.loads(stripped)
            custom_id = record["custom_id"]
            # A failed request has the key present with an explicit null value, so a
            # ``.get(key, {})`` default never applies; ``or {}`` covers both the
            # missing and the null case instead of raising AttributeError.
            response = record.get("response") or {}
            status_code = response.get("status_code")
            if status_code != HTTP_OK:
                logger.warning("Skipping %s (line %d): status %s", custom_id, line_number, status_code)
                continue
            body = response.get("body") or {}
            choices = body.get("choices") or []
            if not choices:
                logger.warning("Skipping %s (line %d): no choices", custom_id, line_number)
                continue
            # ``message`` and ``content`` may also be null; treat them as empty.
            content = (choices[0].get("message") or {}).get("content") or ""
            if not content:
                logger.warning("Skipping %s (line %d): empty content", custom_id, line_number)
                continue
            results[custom_id] = content
    return results
 def main(
    requests_path: Annotated[Path, typer.Option("--requests", help="Batch request JSONL")] = Path(
        "output/openai_batch/requests.jsonl",
    ),
    batch_output: Annotated[Path, typer.Option("--batch-output", help="Batch output JSONL")] = Path(
        "batch_69d84558d91c819091d53f08d78f9fd6_output.jsonl",
    ),
    output_path: Annotated[Path, typer.Option("--output", help="Fine-tuning JSONL output")] = Path(
        "output/finetune_dataset.jsonl",
    ),
    log_level: Annotated[str, typer.Option(help="Log level")] = "INFO",
 ) -> None:
    """Build fine-tuning dataset by joining request and output JSONL files."""
    # NOTE: the docstring above is what Typer shows as the command help.
    logging.basicConfig(format="%(asctime)s %(levelname)s %(name)s: %(message)s", level=log_level)
    logger.info("Loading requests from %s", requests_path)
    requests = load_requests(requests_path)
    logger.info("Loaded %d requests", len(requests))
    logger.info("Loading completions from %s", batch_output)
    completions = load_completions(batch_output)
    logger.info("Loaded %d completions", len(completions))
    output_path.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    unmatched = 0
    with output_path.open("w", encoding="utf-8") as sink:
        # Requests drive the join: one output line per request that has a usable completion.
        for request_id, prompt_messages in requests.items():
            reply = completions.get(request_id)
            if reply is not None:
                conversation = list(prompt_messages)
                conversation.append({"role": "assistant", "content": reply})
                sink.write(json.dumps({"messages": conversation}, ensure_ascii=False) + "\n")
                written += 1
            else:
                unmatched += 1
    logger.info("Wrote %d examples to %s (skipped %d unmatched)", written, output_path, unmatched)
 def cli() -> None:
    """Console entry point: hand ``main`` to ``typer.run`` so Typer builds the CLI from its signature."""
    typer.run(main)
 # Allow running this module directly as a script.
 if __name__ == "__main__":
    cli()
@@ -0,0 +1,97 @@
 """Sum token usage across compressed and uncompressed run directories."""
 from __future__ import annotations
 import json
 import logging
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Annotated
 import typer
 logger = logging.getLogger(__name__)
@dataclass
 class UsageTotals:
    """Aggregate usage counters for a directory of run records."""
    # Number of JSON records that were read.
    files: int = 0
    # Records that had no (or an empty) "usage" entry.
    errors: int = 0
    # Sum of usage["prompt_tokens"].
    prompt_tokens: int = 0
    # Sum of usage["prompt_tokens_details"]["cached_tokens"].
    cached_tokens: int = 0
    # Sum of usage["completion_tokens"].
    completion_tokens: int = 0
    # Sum of usage["completion_tokens_details"]["reasoning_tokens"].
    reasoning_tokens: int = 0
    # Sum of usage["total_tokens"].
    total_tokens: int = 0
    # One (file name, prompt tokens, completion tokens, total tokens) tuple per record with usage.
    per_file: list[tuple[str, int, int, int]] = field(default_factory=list)
 def tally_directory(directory: Path) -> UsageTotals:
    """Return aggregated usage stats for every JSON record in a directory.
    Files matching ``*.json`` are visited in sorted order. Only the first JSON
    value in each file is read, so trailing text after it is tolerated. A file
    counts as an error, and contributes no tokens, when it cannot be decoded,
    is not a JSON object, or has no ``usage`` entry.
    Args:
        directory: Directory containing one JSON run record per file.
    Returns:
        The summed counters plus a per-file breakdown for records with usage.
    """
    totals = UsageTotals()
    decoder = json.JSONDecoder()
    for path in sorted(directory.glob("*.json")):
        totals.files += 1
        try:
            # JSON is UTF-8; do not depend on the locale's default encoding.
            text = path.read_text(encoding="utf-8").lstrip()
            record, _ = decoder.raw_decode(text)
        except (UnicodeDecodeError, json.JSONDecodeError):
            # An empty, truncated or otherwise unreadable record is reported through
            # the error counter rather than aborting the tally for every other file.
            totals.errors += 1
            continue
        usage = record.get("usage") if isinstance(record, dict) else None
        if not usage:
            totals.errors += 1
            continue
        # ``or 0`` / ``or {}`` also cover keys that are present but null.
        prompt_tokens = usage.get("prompt_tokens") or 0
        completion_tokens = usage.get("completion_tokens") or 0
        total_tokens = usage.get("total_tokens") or 0
        cached_tokens = (usage.get("prompt_tokens_details") or {}).get("cached_tokens") or 0
        reasoning_tokens = (usage.get("completion_tokens_details") or {}).get("reasoning_tokens") or 0
        totals.prompt_tokens += prompt_tokens
        totals.completion_tokens += completion_tokens
        totals.total_tokens += total_tokens
        totals.cached_tokens += cached_tokens
        totals.reasoning_tokens += reasoning_tokens
        totals.per_file.append((path.name, prompt_tokens, completion_tokens, total_tokens))
    return totals
 def log_totals(label: str, totals: UsageTotals) -> None:
    """Emit a labelled block of usage counters at INFO level.
    Args:
        label: Heading printed in square brackets above the counters.
        totals: Counters to report.
    """
    with_usage = totals.files - totals.errors
    # Avoid dividing by zero when no record carried usage data.
    mean_total = totals.total_tokens / with_usage if with_usage else 0
    logger.info("[%s]", label)
    logger.info("  files          : %d (with usage: %d, errors: %d)", totals.files, with_usage, totals.errors)
    counter_rows = (
        ("  prompt tokens  : %d", totals.prompt_tokens),
        ("  cached tokens  : %d", totals.cached_tokens),
        ("  completion tok : %d", totals.completion_tokens),
        ("  reasoning tok  : %d", totals.reasoning_tokens),
        ("  total tokens   : %d", totals.total_tokens),
    )
    for template, value in counter_rows:
        logger.info(template, value)
    logger.info("  avg total/file : %.1f", mean_total)
 def main(
    runs_dir: Annotated[Path, typer.Option("--runs-dir")] = Path("output/openai_runs_temp_1"),
    log_level: Annotated[str, typer.Option("--log-level")] = "INFO",
 ) -> None:
    """Print token usage totals for the compressed and uncompressed run directories."""
    # NOTE: the docstring above is what Typer shows as the command help.
    logging.basicConfig(format="%(message)s", level=log_level)
    # Counters carried into the grand total; per_file is not aggregated.
    summed_counters = (
        "files",
        "errors",
        "prompt_tokens",
        "cached_tokens",
        "completion_tokens",
        "reasoning_tokens",
        "total_tokens",
    )
    grand = UsageTotals()
    for label in ("compressed", "uncompressed"):
        directory = runs_dir / label
        if directory.is_dir():
            totals = tally_directory(directory)
            log_totals(label, totals)
            for counter in summed_counters:
                setattr(grand, counter, getattr(grand, counter) + getattr(totals, counter))
        else:
            logger.warning("%s: directory not found at %s", label, directory)
    log_totals("grand total", grand)
 if __name__ == "__main__":
    typer.run(main)
@@ -0,0 +1,68 @@
 """OpenAI-compatible client for vLLM's API."""
 from __future__ import annotations
 import logging
 import time
 from typing import Self
 import httpx
 logger = logging.getLogger(__name__)
 READY_POLL_INTERVAL = 2.0
 class VLLMClient:
    """Talk to a vLLM server via its OpenAI-compatible API.
    Args:
        host: vLLM host.
        port: vLLM port.
        timeout: Per-request timeout in seconds.
    """
    def __init__(self, *, host: str = "localhost", port: int = 8000, timeout: int = 300) -> None:
        """Create a client connected to a vLLM server."""
        self._client = httpx.Client(base_url=f"http://{host}:{port}", timeout=timeout)
    def wait_ready(self, max_wait: int) -> None:
        """Poll /v1/models until the server is ready or timeout."""
        deadline = time.monotonic() + max_wait
        while time.monotonic() < deadline:
            try:
                response = self._client.get("/v1/models")
                if response.is_success:
                    logger.info("vLLM server is ready")
                    return
            except httpx.TransportError:
                pass
            time.sleep(READY_POLL_INTERVAL)
        msg = f"vLLM server not ready after {max_wait}s"
        raise TimeoutError(msg)
    def complete(self, prompt: str, model: str, *, temperature: float = 0.0, max_tokens: int = 4096) -> str:
        """Send a prompt to /v1/completions and return the response text."""
        payload = {
            "model": model,
            "prompt": prompt,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        logger.info("Sending prompt to %s (%d chars)", model, len(prompt))
        response = self._client.post("/v1/completions", json=payload)
        response.raise_for_status()
        data = response.json()
        return data["choices"][0]["text"]
    def close(self) -> None:
        """Close the HTTP client."""
        self._client.close()
    def __enter__(self) -> Self:
        """Enter the context manager."""
        return self
    def __exit__(self, *args: object) -> None:
        """Close the HTTP client on exit."""
        self.close()
@@ -0,0 +1 @@
 """Signal command and control bot."""
@@ -0,0 +1 @@
 """Signal bot commands."""
@@ -0,0 +1,137 @@
 """Van inventory command — parse receipts and item lists via LLM, push to API."""
 from __future__ import annotations
 import json
 import logging
 from typing import TYPE_CHECKING, Any
 import httpx
 from python.signal_bot.models import InventoryItem, InventoryUpdate
 if TYPE_CHECKING:
    from python.signal_bot.llm_client import LLMClient
    from python.signal_bot.models import SignalMessage
    from python.signal_bot.signal_client import SignalClient
 logger = logging.getLogger(__name__)
 # System prompt sent with every inventory request: asks the model to answer with
 # nothing but a JSON array of items carrying name/quantity/unit/category/notes.
 SYSTEM_PROMPT = """\
 You are an inventory assistant. Extract items from the input and return ONLY
 a JSON array. Each element must have these fields:
  - "name": item name (string)
  - "quantity": numeric count or amount (default 1)
  - "unit": unit of measure (e.g. "each", "lb", "oz", "gallon", "bag", "box")
  - "category": category like "food", "tools", "supplies", etc.
  - "notes": any extra detail (empty string if none)
 Example output:
 [{"name": "water bottles", "quantity": 6, "unit": "gallon", "category": "supplies", "notes": "1 gallon each"}]
 Return ONLY the JSON array, no other text.\
 """
 # User prompt used when the message carries an attachment (the image is sent alongside it).
 IMAGE_PROMPT = "Extract all items from this receipt or inventory photo."
 # User prompt prefix used for text messages; the message text is appended after a blank line.
 TEXT_PROMPT = "Extract all items from this inventory list."
 def parse_llm_response(raw: str) -> list[InventoryItem]:
    """Turn the model's JSON reply into validated inventory items.
    Args:
        raw: Reply text, optionally wrapped in a Markdown code fence.
    Returns:
        One ``InventoryItem`` per element of the decoded JSON value.
    Raises:
        json.JSONDecodeError: If the (unfenced) text is not valid JSON.
    """
    fence = "```"
    payload = raw.strip()
    if payload.startswith(fence):
        # Drop every line that opens with a fence marker (e.g. the leading "```json" and the closing fence).
        payload = "\n".join(row for row in payload.split("\n") if not row.startswith(fence))
    decoded: list[dict[str, Any]] = json.loads(payload)
    return list(map(InventoryItem.model_validate, decoded))
 def _upsert_item(api_url: str, item: InventoryItem) -> None:
    """Create the item in the van_inventory API, or top up an existing one.
    The full item list is fetched first. If an entry with the same name
    (compared case-insensitively) exists, its quantity is patched to the sum of
    the stored and the new quantity, and its category is overwritten when the
    new item has one. Otherwise a new item is posted.
    Args:
        api_url: Base URL of the van_inventory API; a trailing slash is ignored.
        item: Item to add.
    Raises:
        httpx.HTTPStatusError: If any of the API calls answers with an error status.
    """
    items_url = f"{api_url.rstrip('/')}/api/items"
    listing = httpx.get(items_url, timeout=10)
    listing.raise_for_status()
    wanted_name = item.name.lower()
    stored: dict[str, Any] | None = None
    for candidate in listing.json():
        if candidate["name"].lower() == wanted_name:
            stored = candidate
            break
    if not stored:
        created = httpx.post(
            items_url,
            json={
                "name": item.name,
                "quantity": item.quantity,
                "unit": item.unit,
                "category": item.category or None,
            },
            timeout=10,
        )
        created.raise_for_status()
        return
    changes = {"quantity": stored["quantity"] + item.quantity}
    if item.category:
        changes["category"] = item.category
    updated = httpx.patch(f"{items_url}/{stored['id']}", json=changes, timeout=10)
    updated.raise_for_status()
 def handle_inventory_update(
    message: SignalMessage,
    signal: SignalClient,
    llm: LLMClient,
    api_url: str,
 ) -> InventoryUpdate:
    """Turn a Signal message into inventory changes and report back to the sender.
    The first attachment (if any) is sent to the LLM as an image; otherwise the
    message text is sent. The extracted items are pushed to the van_inventory
    API one by one and a summary is replied. Any exception is logged, answered
    with a generic failure reply, and results in an empty ``InventoryUpdate``.
    Args:
        message: Incoming Signal message.
        signal: Client used to fetch attachments and send replies.
        llm: Client used to extract structured items.
        api_url: Base URL of the van_inventory API.
    Returns:
        The extracted items with the raw LLM reply and source type, or an empty
        ``InventoryUpdate`` when there was nothing to process or processing failed.
    """
    try:
        logger.info(f"Processing inventory update from {message.source}")
        has_photo = bool(message.attachments)
        if not has_photo and not message.message.strip():
            # Neither a photo nor any text: explain what the command expects.
            signal.reply(message, "Send a photo of a receipt or a text list of items to update inventory.")
            return InventoryUpdate()
        if has_photo:
            # Only the first attachment is looked at.
            photo = signal.get_attachment(message.attachments[0])
            raw_response = llm.chat(IMAGE_PROMPT, image_data=photo, system=SYSTEM_PROMPT)
            source_type = "receipt_photo"
        else:
            raw_response = llm.chat(f"{TEXT_PROMPT}\n\n{message.message}", system=SYSTEM_PROMPT)
            source_type = "text_list"
        logger.info(f"{raw_response=}")
        new_items = parse_llm_response(raw_response)
        logger.info(f"{new_items=}")
        for extracted in new_items:
            _upsert_item(api_url, extracted)
        overview = _format_summary(new_items)
        signal.reply(message, f"Inventory updated with {len(new_items)} item(s):\n{overview}")
        return InventoryUpdate(items=new_items, raw_response=raw_response, source_type=source_type)
    except Exception:
        logger.exception("Failed to process inventory update")
        signal.reply(message, "Failed to process inventory update. Check logs for details.")
        return InventoryUpdate()
 def _format_summary(items: list[InventoryItem]) -> str:
    """Render one indented bullet line per item: name, quantity, unit and category."""
    return "\n".join(f"  - {entry.name} x{entry.quantity} {entry.unit} [{entry.category}]" for entry in items)
@@ -0,0 +1,64 @@
 """Location command for the Signal bot."""
 from __future__ import annotations
 import logging
 from typing import TYPE_CHECKING, Any
 import httpx
 if TYPE_CHECKING:
    from python.signal_bot.models import SignalMessage
    from python.signal_bot.signal_client import SignalClient
 logger = logging.getLogger(__name__)
 def _get_entity_state(ha_url: str, ha_token: str, entity_id: str) -> dict[str, Any]:
    """Fetch an entity's state from Home Assistant.
    Args:
        ha_url: Base URL of the Home Assistant instance; a trailing slash is ignored.
        ha_token: Token sent as a ``Bearer`` authorization header.
        entity_id: Entity to look up, e.g. ``sensor.van_last_known_latitude``.
    Returns:
        The decoded JSON body of ``/api/states/<entity_id>``.
    Raises:
        httpx.HTTPError: On transport failures or an error status.
    """
    # Strip a trailing slash so a base URL configured as "http://host:8123/" does
    # not produce "//api/states/..." (the inventory API helper does the same).
    entity_url = f"{ha_url.rstrip('/')}/api/states/{entity_id}"
    logger.debug(f"Fetching {entity_url=}")
    response = httpx.get(
        entity_url,
        headers={"Authorization": f"Bearer {ha_token}"},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
 def _format_location(latitude: str, longitude: str) -> str:
    """Build the two-line reply: the coordinates, then a Google Maps link to them."""
    headline = f"Van location: {latitude}, {longitude}"
    map_link = f"https://maps.google.com/?q={latitude},{longitude}"
    return "\n".join((headline, map_link))
 def handle_location_request(
    message: SignalMessage,
    signal: SignalClient,
    ha_url: str | None,
    ha_token: str | None,
 ) -> None:
    """Reply with van location from Home Assistant.
    Exactly one reply is sent: the location, or an explanation of why it could
    not be determined (not configured, request failed, or no usable reading).
    Args:
        message: Incoming Signal message to reply to.
        signal: Client used to send the reply.
        ha_url: Home Assistant base URL, or None when not configured.
        ha_token: Home Assistant access token, or None when not configured.
    """
    if ha_url is None or ha_token is None:
        signal.reply(message, "Location command is not configured (missing HA_URL or HA_TOKEN).")
        return
    # Pre-initialised so the debug log below can show how far the two requests got.
    lat_payload = None
    lon_payload = None
    try:
        lat_payload = _get_entity_state(ha_url, ha_token, "sensor.van_last_known_latitude")
        lon_payload = _get_entity_state(ha_url, ha_token, "sensor.van_last_known_longitude")
    except httpx.HTTPError:
        logger.exception("Couldn't fetch van location from Home Assistant right now.")
        logger.debug(f"{ha_url=} {lat_payload=} {lon_payload=}")
        signal.reply(message, "Couldn't fetch van location from Home Assistant right now.")
        return
    # Home Assistant uses the placeholder states "unavailable" and "unknown" for an
    # entity without a usable value; neither must be sent out as a coordinate. A
    # missing or null state is treated the same way.
    placeholder_states = {"", "unavailable", "unknown"}
    latitude = lat_payload.get("state") or ""
    longitude = lon_payload.get("state") or ""
    if latitude in placeholder_states or longitude in placeholder_states:
        signal.reply(message, "Van location is unavailable in Home Assistant right now.")
        return
    signal.reply(message, _format_location(latitude, longitude))
@@ -0,0 +1,284 @@
 """Device registry — tracks verified/unverified devices by safety number."""
 from __future__ import annotations
 import logging
 from datetime import datetime, timedelta
 from typing import TYPE_CHECKING, NamedTuple
 from sqlalchemy import delete, select
 from sqlalchemy.orm import Session
 from python.common import utcnow
 from python.orm.signal_bot.models import RoleRecord, SignalDevice
 from python.signal_bot.models import Role, TrustLevel
 if TYPE_CHECKING:
    from sqlalchemy.engine import Engine
    from python.signal_bot.signal_client import SignalClient
 logger = logging.getLogger(__name__)
 _BLOCKED_TTL = timedelta(minutes=60)
 _DEFAULT_TTL = timedelta(minutes=5)
 class _CacheEntry(NamedTuple):
    """Snapshot of one device's trust data held in the registry's in-memory cache."""
    # Moment after which the entry is no longer served from the cache.
    expires: datetime
    # Trust level of the device when the entry was written.
    trust_level: TrustLevel
    # True when the device had a safety number on file.
    has_safety_number: bool
    # The safety number itself, or None when there is none.
    safety_number: str | None
    # Roles held by the device when the entry was written.
    roles: list[Role]
 class DeviceRegistry:
    """Manage device trust based on Signal safety numbers.
    Devices start as UNVERIFIED. An admin verifies them over SSH by calling
    ``verify(phone_number)`` which marks the device VERIFIED and also tells
    signal-cli to trust the identity.
    Only VERIFIED devices may execute commands.
    Reads consult an in-memory cache first. Cache entries are written only by
    the methods that modify a device and expire after a TTL (longer for
    blocked devices); on a cache miss the database is queried.
    """
    def __init__(self, signal_client: SignalClient, engine: Engine) -> None:
        """Keep the Signal client and database engine; the cache starts empty."""
        self.signal_client = signal_client
        self.engine = engine
        self._contact_cache: dict[str, _CacheEntry] = {}
    def is_verified(self, phone_number: str) -> bool:
        """Check if a phone number is verified."""
        if entry := self._cached(phone_number):
            return entry.trust_level == TrustLevel.VERIFIED
        device = self._load_device(phone_number)
        return device is not None and device.trust_level == TrustLevel.VERIFIED
    def record_contact(self, phone_number: str, safety_number: str | None = None) -> None:
        """Record seeing a device. Creates entry if new, updates last_seen.
        A safety number that differs from the stored one resets a non-blocked
        device to UNVERIFIED.
        """
        now = utcnow()
        entry = self._cached(phone_number)
        # Fast path: a live cache entry with the same safety number means there is
        # nothing to re-check, so the database (including last_seen) is not touched.
        if entry and entry.safety_number == safety_number:
            return
        with Session(self.engine) as session:
            device = self._find_device(session, phone_number)
            if device:
                if device.safety_number != safety_number and device.trust_level != TrustLevel.BLOCKED:
                    logger.warning(f"Safety number changed for {phone_number}, resetting to UNVERIFIED")
                    device.safety_number = safety_number
                    device.trust_level = TrustLevel.UNVERIFIED
                device.last_seen = now
            else:
                device = SignalDevice(
                    phone_number=phone_number,
                    safety_number=safety_number,
                    trust_level=TrustLevel.UNVERIFIED,
                    last_seen=now,
                )
                session.add(device)
                logger.info(f"New device registered: {phone_number}")
            session.commit()
            self._update_cache(phone_number, device)
    def has_safety_number(self, phone_number: str) -> bool:
        """Check if a device has a safety number on file."""
        if entry := self._cached(phone_number):
            return entry.has_safety_number
        device = self._load_device(phone_number)
        return device is not None and device.safety_number is not None
    def verify(self, phone_number: str) -> bool:
        """Mark a device as verified. Called by admin over SSH.
        Returns True if the device was found and verified.
        """
        with Session(self.engine) as session:
            device = self._find_device(session, phone_number)
            if not device:
                logger.warning(f"Cannot verify unknown device: {phone_number}")
                return False
            device.trust_level = TrustLevel.VERIFIED
            # signal-cli is told first; if that call raises, nothing is committed.
            self.signal_client.trust_identity(phone_number, trust_all_known_keys=True)
            session.commit()
            self._update_cache(phone_number, device)
            logger.info(f"Device verified: {phone_number}")
            return True
    def block(self, phone_number: str) -> bool:
        """Block a device. Returns False when the device is unknown."""
        return self._set_trust(phone_number, TrustLevel.BLOCKED, "Device blocked")
    def unverify(self, phone_number: str) -> bool:
        """Reset a device to unverified. Returns False when the device is unknown."""
        return self._set_trust(phone_number, TrustLevel.UNVERIFIED)
    # -- role management ------------------------------------------------------
    def get_roles(self, phone_number: str) -> list[Role]:
        """Return the roles for a device, defaulting to empty."""
        if entry := self._cached(phone_number):
            return entry.roles
        device = self._load_device(phone_number)
        return _extract_roles(device) if device else []
    def has_role(self, phone_number: str, role: Role) -> bool:
        """Check if a device has a specific role or is admin."""
        roles = self.get_roles(phone_number)
        return Role.ADMIN in roles or role in roles
    def grant_role(self, phone_number: str, role: Role) -> bool:
        """Add a role to a device. Called by admin over SSH.
        Returns True when the device holds the role afterwards, False when the
        device or the role is unknown.
        """
        with Session(self.engine) as session:
            device = self._find_device(session, phone_number)
            if not device:
                logger.warning(f"Cannot grant role for unknown device: {phone_number}")
                return False
            if any(record.name == role for record in device.roles):
                return True
            role_record = session.scalars(select(RoleRecord).where(RoleRecord.name == role)).one_or_none()
            if not role_record:
                logger.warning(f"Unknown role: {role}")
                return False
            device.roles.append(role_record)
            session.commit()
            self._update_cache(phone_number, device)
            logger.info(f"Device {phone_number} granted role {role}")
            return True
    def revoke_role(self, phone_number: str, role: Role) -> bool:
        """Remove a role from a device. Called by admin over SSH.
        Returns False only when the device is unknown.
        """
        with Session(self.engine) as session:
            device = self._find_device(session, phone_number)
            if not device:
                logger.warning(f"Cannot revoke role for unknown device: {phone_number}")
                return False
            device.roles = [record for record in device.roles if record.name != role]
            session.commit()
            self._update_cache(phone_number, device)
            logger.info(f"Device {phone_number} revoked role {role}")
            return True
    def set_roles(self, phone_number: str, roles: list[Role]) -> bool:
        """Replace all roles for a device. Called by admin over SSH.
        Roles without a matching row in the role table are silently left out.
        Returns False only when the device is unknown.
        """
        with Session(self.engine) as session:
            device = self._find_device(session, phone_number)
            if not device:
                logger.warning(f"Cannot set roles for unknown device: {phone_number}")
                return False
            # Match on the enum *value*, which is what the role table stores
            # (sync_roles writes role.value and _extract_roles reads Role(record.name)).
            # str(role) only equals the value for StrEnum-style enums; for a
            # ``(str, Enum)`` mix-in it is "Role.NAME" and would match nothing.
            role_names = [role.value for role in roles]
            records = session.scalars(select(RoleRecord).where(RoleRecord.name.in_(role_names))).all()
            device.roles = records
            session.commit()
            self._update_cache(phone_number, device)
            logger.info(f"Device {phone_number} roles set to {role_names}")
            return True
    # -- queries --------------------------------------------------------------
    def list_devices(self) -> list[SignalDevice]:
        """Return all known devices."""
        with Session(self.engine) as session:
            return list(session.scalars(select(SignalDevice)).all())
    def sync_identities(self) -> None:
        """Pull identity list from signal-cli and record any new ones."""
        identities = self.signal_client.get_identities()
        for identity in identities:
            number = identity.get("number", "")
            # Fall back to the fingerprint when no safety number is reported.
            safety = identity.get("safety_number", identity.get("fingerprint", ""))
            if number:
                self.record_contact(number, safety)
    # -- internals ------------------------------------------------------------
    @staticmethod
    def _find_device(session: Session, phone_number: str) -> SignalDevice | None:
        """Look up a device row by phone number inside an open session."""
        return session.scalars(select(SignalDevice).where(SignalDevice.phone_number == phone_number)).one_or_none()
    def _cached(self, phone_number: str) -> _CacheEntry | None:
        """Return the cache entry if it exists and hasn't expired."""
        entry = self._contact_cache.get(phone_number)
        if entry and utcnow() < entry.expires:
            return entry
        return None
    def _load_device(self, phone_number: str) -> SignalDevice | None:
        """Fetch a device by phone number in a short-lived session.
        NOTE(review): the object is returned after its session has closed, and
        ``get_roles`` reads ``device.roles`` on it; that presumably relies on the
        ORM model loading the relationship eagerly. Confirm against the model.
        """
        with Session(self.engine) as session:
            return self._find_device(session, phone_number)
    def _update_cache(self, phone_number: str, device: SignalDevice) -> None:
        """Refresh the cache entry for a device."""
        # Blocked devices are cached longer than everything else.
        ttl = _BLOCKED_TTL if device.trust_level == TrustLevel.BLOCKED else _DEFAULT_TTL
        self._contact_cache[phone_number] = _CacheEntry(
            expires=utcnow() + ttl,
            trust_level=device.trust_level,
            has_safety_number=device.safety_number is not None,
            safety_number=device.safety_number,
            roles=_extract_roles(device),
        )
    def _set_trust(self, phone_number: str, level: TrustLevel, log_msg: str | None = None) -> bool:
        """Update the trust level for a device.
        Returns False when the device is unknown. ``log_msg``, when given, is
        logged at INFO level followed by the phone number.
        """
        with Session(self.engine) as session:
            device = self._find_device(session, phone_number)
            if not device:
                return False
            device.trust_level = level
            session.commit()
            self._update_cache(phone_number, device)
            if log_msg:
                logger.info(f"{log_msg}: {phone_number}")
            return True
 def _extract_roles(device: SignalDevice) -> list[Role]:
    """Map the names of a device's role records onto ``Role`` enum members."""
    role_names = (record.name for record in device.roles)
    return list(map(Role, role_names))
 def sync_roles(engine: Engine) -> None:
    """Make the role table mirror the ``Role`` enum.
    Names present in the enum but not in the table are inserted; names present
    in the table but not in the enum are deleted. Everything is committed in
    one transaction.
    Args:
        engine: Database engine used to open the session.
    """
    wanted = {member.value for member in Role}
    with Session(engine) as session:
        present = set(session.scalars(select(RoleRecord.name)).all())
        missing = wanted.difference(present)
        stale = present.difference(wanted)
        for role_name in missing:
            session.add(RoleRecord(name=role_name))
            logger.info(f"Role added: {role_name}")
        if stale:
            # One bulk DELETE for all stale names, then one log line per name.
            session.execute(delete(RoleRecord).where(RoleRecord.name.in_(stale)))
            for role_name in stale:
                logger.info(f"Role removed: {role_name}")
        session.commit()
@@ -0,0 +1,80 @@
 """Flexible LLM client for ollama backends."""
 from __future__ import annotations
 import base64
 import logging
 from typing import Any, Self
 import httpx
 logger = logging.getLogger(__name__)
 class LLMClient:
    """Talk to an ollama instance.
    Args:
        model: Ollama model name.
        host: Ollama host.
        port: Ollama port.
        temperature: Sampling temperature.
    """
    def __init__(
        self,
        *,
        model: str,
        host: str,
        port: int = 11434,
        temperature: float = 0.1,
        timeout: int = 300,
    ) -> None:
        self.model = model
        self.temperature = temperature
        self._client = httpx.Client(base_url=f"http://{host}:{port}", timeout=timeout)
    def chat(self, prompt: str, image_data: bytes | None = None, system: str | None = None) -> str:
        """Send a text prompt and return the response."""
        messages: list[dict[str, Any]] = []
        if system:
            messages.append({"role": "system", "content": system})
        user_msg = {"role": "user", "content": prompt}
        if image_data:
            user_msg["images"] = [base64.b64encode(image_data).decode()]
        messages.append(user_msg)
        return self._generate(messages)
    def _generate(self, messages: list[dict[str, Any]]) -> str:
        """Call the ollama chat API."""
        payload = {
            "model": self.model,
            "messages": messages,
            "stream": False,
            "options": {"temperature": self.temperature},
        }
        logger.info(f"LLM request to {self.model}")
        response = self._client.post("/api/chat", json=payload)
        response.raise_for_status()
        data = response.json()
        return data["message"]["content"]
    def list_models(self) -> list[str]:
        """List available models on the ollama instance."""
        response = self._client.get("/api/tags")
        response.raise_for_status()
        return [m["name"] for m in response.json().get("models", [])]
    def __enter__(self) -> Self:
        """Enter the context manager."""
        return self
    def __exit__(self, *args: object) -> None:
        """Close the HTTP client on exit."""
        self.close()
    def close(self) -> None:
        """Close the HTTP client."""
        self._client.close()
@@ -0,0 +1,239 @@
 """Signal command and control bot — main entry point."""
 from __future__ import annotations
 import logging
 from dataclasses import dataclass
 from os import getenv
 from typing import TYPE_CHECKING, Annotated
 if TYPE_CHECKING:
    from collections.abc import Callable
 import typer
 from alembic.command import upgrade
 from sqlalchemy.orm import Session
 from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential
 from python.common import configure_logger, utcnow
 from python.database_cli import DATABASES
 from python.orm.common import get_postgres_engine
 from python.orm.signal_bot.models import DeadLetterMessage
 from python.signal_bot.commands.inventory import handle_inventory_update
 from python.signal_bot.commands.location import handle_location_request
 from python.signal_bot.device_registry import DeviceRegistry, sync_roles
 from python.signal_bot.llm_client import LLMClient
 from python.signal_bot.models import BotConfig, MessageStatus, Role, SignalMessage
 from python.signal_bot.signal_client import SignalClient
 logger = logging.getLogger(__name__)
@dataclass(frozen=True, slots=True)
 class Command:
    """A registered bot command.

    Instances are immutable (``frozen=True``) and are stored in
    ``Bot.commands`` keyed by the command word.
    """
    # Handler invoked with the incoming message and the resolved command word.
    action: Callable[[SignalMessage, str], None]
    # One-line description shown in the help listing.
    help_text: str
    role: Role | None  # None = no role required (always allowed)
 class Bot:
    """Holds shared resources and dispatches incoming messages to command handlers.

    Args:
        signal: Client used to receive messages and send replies.
        llm: LLM client used by the status and inventory commands.
        registry: Device registry consulted for verification and role checks.
        config: Bot configuration (API URLs, retry limits, database engine).
    """

    def __init__(
        self,
        signal: SignalClient,
        llm: LLMClient,
        registry: DeviceRegistry,
        config: BotConfig,
    ) -> None:
        """Store the shared resources and build the command table."""
        self.signal = signal
        self.llm = llm
        self.registry = registry
        self.config = config
        # Command word -> handler. The first word of a message is looked up here.
        self.commands: dict[str, Command] = {
            "help": Command(action=self._help, help_text="show this help message", role=None),
            "status": Command(action=self._status, help_text="show bot status", role=Role.STATUS),
            "inventory": Command(
                action=self._inventory,
                help_text="update van inventory from a text list or receipt photo",
                role=Role.INVENTORY,
            ),
            "location": Command(
                action=self._location,
                help_text="get current van location",
                role=Role.LOCATION,
            ),
        }

    # -- actions --------------------------------------------------------------
    def _help(self, message: SignalMessage, _cmd: str) -> None:
        """Reply with help text filtered to the sender's roles."""
        self.signal.reply(message, self._build_help(self.registry.get_roles(message.source)))

    def _status(self, message: SignalMessage, _cmd: str) -> None:
        """Reply with the active model, up to ten available models, and the device count."""
        models = self.llm.list_models()
        model_list = ", ".join(models[:10])
        device_count = len(self.registry.list_devices())
        self.signal.reply(
            message,
            f"Bot online.\nLLM: {self.llm.model}\nAvailable models: {model_list}\nKnown devices: {device_count}",
        )

    def _inventory(self, message: SignalMessage, _cmd: str) -> None:
        """Process an inventory update."""
        handle_inventory_update(message, self.signal, self.llm, self.config.inventory_api_url)

    def _location(self, message: SignalMessage, _cmd: str) -> None:
        """Reply with current van location."""
        handle_location_request(message, self.signal, self.config.ha_url, self.config.ha_token)

    # -- dispatch -------------------------------------------------------------
    def _build_help(self, roles: list[Role]) -> str:
        """Build help text showing only the commands the user can access.

        A command is listed when it needs no role, when the user holds
        ``Role.ADMIN``, or when the user holds the command's own role.
        """
        is_admin = Role.ADMIN in roles
        lines = ["Available commands:"]
        for name, cmd in self.commands.items():
            if cmd.role is None or is_admin or cmd.role in roles:
                lines.append(f"  {name:20s} — {cmd.help_text}")
        return "\n".join(lines)

    def dispatch(self, message: SignalMessage) -> None:
        """Route an incoming message to the right command handler.

        Messages from unverified devices, and from admin devices without a
        recorded safety number, are dropped. The first word of the text picks
        the command; an unrecognised (or missing) word falls back to the
        inventory command when the message carries attachments and is ignored
        otherwise. A sender lacking the command's role gets a denial reply.
        """
        source = message.source
        if not self.registry.is_verified(source):
            logger.info(f"Device {source} not verified, ignoring message")
            return
        if not self.registry.has_safety_number(source) and self.registry.has_role(source, Role.ADMIN):
            logger.warning(f"Admin device {source} missing safety number, ignoring message")
            return
        text = message.message.strip()
        parts = text.split()
        if not parts and not message.attachments:
            return
        cmd = parts[0].lower() if parts else ""
        # Fixed: the format string previously began with a stray literal "f",
        # so every entry was logged as "fsource=...".
        logger.info(f"{source=} running {cmd=} with {message=}")
        command = self.commands.get(cmd)
        if command is None:
            if message.attachments:
                # An attachment without a known command word is treated as a receipt photo.
                command = self.commands["inventory"]
                cmd = "inventory"
            else:
                return
        # NOTE(review): unlike _build_help there is no explicit ADMIN bypass here;
        # presumably registry.has_role treats admins as holding every role — confirm.
        if command.role is not None and not self.registry.has_role(source, command.role):
            logger.warning(f"Device {source} denied access to {cmd!r}")
            self.signal.reply(message, f"Permission denied: you do not have the '{command.role}' role.")
            return
        command.action(message, cmd)

    def process_message(self, message: SignalMessage) -> None:
        """Process a single message, sending it to the dead letter queue after repeated failures.

        Each attempt records the sender's safety number and then dispatches
        the message. Any exception is logged and the attempt is repeated, up
        to ``config.max_message_attempts`` times in total.
        """
        max_attempts = self.config.max_message_attempts
        for attempt in range(1, max_attempts + 1):
            try:
                safety_number = self.signal.get_safety_number(message.source)
                self.registry.record_contact(message.source, safety_number)
                self.dispatch(message)
            except Exception:
                logger.exception(f"Failed to process message (attempt {attempt}/{max_attempts})")
            else:
                return
        logger.error(f"Message from {message.source} failed {max_attempts} times, sending to dead letter queue")
        # Only the source and text are persisted; attachments are not stored.
        with Session(self.config.engine) as session:
            session.add(
                DeadLetterMessage(
                    source=message.source,
                    message=message.message,
                    received_at=utcnow(),
                    status=MessageStatus.UNPROCESSED,
                )
            )
            session.commit()

    def run(self) -> None:
        """Listen for messages via WebSocket, reconnecting on failure.

        The listen loop is retried with exponential backoff up to
        ``config.max_retries`` attempts; the final failure is logged as
        critical and re-raised.
        """
        logger.info("Bot started — listening via WebSocket")

        @retry(
            stop=stop_after_attempt(self.config.max_retries),
            wait=wait_exponential(multiplier=self.config.reconnect_delay, max=self.config.max_reconnect_delay),
            before_sleep=before_sleep_log(logger, logging.WARNING),
            reraise=True,
        )
        def _listen() -> None:
            for message in self.signal.listen():
                logger.info(f"Message from {message.source}: {message.message[:80]}")
                self.process_message(message)

        try:
            _listen()
        except Exception:
            logger.critical("Max retries exceeded, shutting down")
            raise
 def main(
    log_level: Annotated[str, typer.Option()] = "DEBUG",
    llm_timeout: Annotated[int, typer.Option()] = 600,
 ) -> None:
    """Run the Signal command and control bot.

    Reads connection settings from the environment, applies pending database
    migrations, syncs roles, then runs the bot until it stops.

    Raises:
        ValueError: If SIGNAL_API_URL, SIGNAL_PHONE_NUMBER, INVENTORY_API_URL
            or LLM_HOST is not set.
    """
    configure_logger(log_level)

    api_url = getenv("SIGNAL_API_URL")
    bot_number = getenv("SIGNAL_PHONE_NUMBER")
    inventory_url = getenv("INVENTORY_API_URL")
    if api_url is None:
        msg = "SIGNAL_API_URL environment variable not set"
        raise ValueError(msg)
    if bot_number is None:
        msg = "SIGNAL_PHONE_NUMBER environment variable not set"
        raise ValueError(msg)
    if inventory_url is None:
        msg = "INVENTORY_API_URL environment variable not set"
        raise ValueError(msg)

    # Bring the schema up to date before anything touches the database.
    alembic_config = DATABASES["signal_bot"].alembic_config()
    upgrade(alembic_config, "head")
    db_engine = get_postgres_engine(name="SIGNALBOT")
    sync_roles(db_engine)

    bot_config = BotConfig(
        signal_api_url=api_url,
        phone_number=bot_number,
        inventory_api_url=inventory_url,
        ha_url=getenv("HA_URL"),
        ha_token=getenv("HA_TOKEN"),
        engine=db_engine,
    )

    ollama_host = getenv("LLM_HOST")
    ollama_model = getenv("LLM_MODEL", "qwen3-vl:32b")
    ollama_port = int(getenv("LLM_PORT", "11434"))
    if ollama_host is None:
        msg = "LLM_HOST environment variable not set"
        raise ValueError(msg)

    with (
        SignalClient(bot_config.signal_api_url, bot_config.phone_number) as signal_client,
        LLMClient(model=ollama_model, host=ollama_host, port=ollama_port, timeout=llm_timeout) as llm_client,
    ):
        device_registry = DeviceRegistry(signal_client, db_engine)
        Bot(signal_client, llm_client, device_registry, bot_config).run()
 # Script entry point: typer builds the command-line interface from main's signature.
 if __name__ == "__main__":
    typer.run(main)
@@ -0,0 +1,97 @@
 """Models for the Signal command and control bot."""
 from __future__ import annotations
 from datetime import datetime  # noqa: TC003 - pydantic needs this at runtime
 from enum import StrEnum
 from typing import Any
 from pydantic import BaseModel, ConfigDict
 from sqlalchemy.engine import Engine  # noqa: TC002 - pydantic needs this at runtime
 class TrustLevel(StrEnum):
    """Device trust level."""
    # StrEnum members are strings, so each compares equal to its literal value.
    VERIFIED = "verified"
    UNVERIFIED = "unverified"
    BLOCKED = "blocked"
 class Role(StrEnum):
    """RBAC roles — one per command, plus admin which grants all."""
    ADMIN = "admin"
    # The remaining values match the command words they guard.
    STATUS = "status"
    INVENTORY = "inventory"
    LOCATION = "location"
 class MessageStatus(StrEnum):
    """Dead letter queue message status."""
    # Status given to a message when it is first written to the dead letter queue.
    UNPROCESSED = "unprocessed"
    PROCESSED = "processed"
 class Device(BaseModel):
    """A registered device tracked by safety number."""
    phone_number: str
    safety_number: str
    # New devices start unverified unless a level is supplied.
    trust_level: TrustLevel = TrustLevel.UNVERIFIED
    # Both timestamps are required; presumably first/last contact times — confirm with the registry.
    first_seen: datetime
    last_seen: datetime
 class SignalMessage(BaseModel):
    """An incoming Signal message."""
    # Sender identifier; replies to non-group messages are sent to this value.
    source: str
    timestamp: int
    # Message text; empty when the message has no text body.
    message: str = ""
    # Attachment IDs, downloadable through the Signal client.
    attachments: list[str] = []
    # Set for group messages; replies are then routed to the group.
    group_id: str | None = None
    is_receipt: bool = False
 class SignalEnvelope(BaseModel):
    """Raw envelope from signal-cli-rest-api."""
    # Unparsed envelope payload, kept as a plain dictionary.
    envelope: dict[str, Any]
    account: str | None = None
 class InventoryItem(BaseModel):
    """An item in the van inventory."""
    # Only the name is required; every other field has a default.
    name: str
    quantity: float = 1
    unit: str = "each"
    category: str = ""
    notes: str = ""
 class InventoryUpdate(BaseModel):
    """Result of processing an inventory update."""
    # Parsed items; empty when nothing was extracted.
    items: list[InventoryItem] = []
    # presumably the unparsed LLM output kept for debugging — confirm with the inventory command
    raw_response: str = ""
    source_type: str = ""  # "receipt_photo" or "text_list"
 class BotConfig(BaseModel):
    """Top-level bot configuration."""
    # Engine is not a pydantic-native type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    signal_api_url: str
    phone_number: str
    inventory_api_url: str
    # Passed to the location command; both are optional.
    ha_url: str | None = None
    ha_token: str | None = None
    engine: Engine
    # Backoff multiplier and cap for WebSocket reconnects (presumably seconds — confirm).
    reconnect_delay: int = 5
    max_reconnect_delay: int = 300
    # Maximum attempts of the listen loop before the bot gives up.
    max_retries: int = 10
    # Processing attempts per message before it goes to the dead letter queue.
    max_message_attempts: int = 3
@@ -0,0 +1,141 @@
 """Client for the signal-cli-rest-api."""
 from __future__ import annotations
 import json
 import logging
 from typing import TYPE_CHECKING, Any, Self
 import httpx
 import websockets.sync.client
 if TYPE_CHECKING:
    from collections.abc import Generator
 from python.signal_bot.models import SignalMessage
 logger = logging.getLogger(__name__)
 def _parse_envelope(envelope: dict[str, Any]) -> SignalMessage | None:
    """Parse a signal-cli envelope into a SignalMessage, or None if not a data message.

    Args:
        envelope: The ``"envelope"`` object of one received frame.

    Returns:
        The parsed message, or None when the envelope has no (or an empty)
        ``"dataMessage"``. Attachments without an ``"id"`` are skipped.
    """
    data_message = envelope.get("dataMessage")
    if not data_message:
        return None
    # "attachments" may be missing or explicitly null; both mean "no attachments".
    # The same tolerance is already applied to a null "message" below.
    attachments = data_message.get("attachments") or []
    attachment_ids = [att["id"] for att in attachments if "id" in att]
    group_info = data_message.get("groupInfo")
    group_id = group_info.get("groupId") if group_info else None
    return SignalMessage(
        source=envelope.get("source", ""),
        timestamp=envelope.get("timestamp", 0),
        message=data_message.get("message", "") or "",
        attachments=attachment_ids,
        group_id=group_id,
    )
 class SignalClient:
    """Communicate with signal-cli-rest-api.
    Args:
        base_url: URL of the signal-cli-rest-api (e.g. http://localhost:8989).
        phone_number: The registered phone number to send/receive as.
    """
    def __init__(self, base_url: str, phone_number: str) -> None:
        self.base_url = base_url.rstrip("/")
        self.phone_number = phone_number
        self._client = httpx.Client(base_url=self.base_url, timeout=30)
    def _ws_url(self) -> str:
        """Build the WebSocket URL from the base HTTP URL."""
        url = self.base_url.replace("http://", "ws://").replace("https://", "wss://")
        return f"{url}/v1/receive/{self.phone_number}"
    def listen(self) -> Generator[SignalMessage]:
        """Connect via WebSocket and yield messages as they arrive."""
        ws_url = self._ws_url()
        logger.info(f"Connecting to WebSocket: {ws_url}")
        with websockets.sync.client.connect(ws_url) as ws:
            for raw in ws:
                try:
                    data = json.loads(raw)
                    envelope = data.get("envelope", {})
                    message = _parse_envelope(envelope)
                    if message:
                        yield message
                except json.JSONDecodeError:
                    logger.warning(f"Non-JSON WebSocket frame: {raw[:200]}")
    def send(self, recipient: str, message: str) -> None:
        """Send a text message."""
        payload = {
            "message": message,
            "number": self.phone_number,
            "recipients": [recipient],
        }
        response = self._client.post("/v2/send", json=payload)
        response.raise_for_status()
    def send_to_group(self, group_id: str, message: str) -> None:
        """Send a message to a group."""
        payload = {
            "message": message,
            "number": self.phone_number,
            "recipients": [group_id],
        }
        response = self._client.post("/v2/send", json=payload)
        response.raise_for_status()
    def get_attachment(self, attachment_id: str) -> bytes:
        """Download an attachment by ID."""
        response = self._client.get(f"/v1/attachments/{attachment_id}")
        response.raise_for_status()
        return response.content
    def get_identities(self) -> list[dict[str, Any]]:
        """List known identities and their trust levels."""
        response = self._client.get(f"/v1/identities/{self.phone_number}")
        response.raise_for_status()
        return response.json()
    def get_safety_number(self, phone_number: str) -> str | None:
        """Look up the safety number for a contact from signal-cli's local store."""
        for identity in self.get_identities():
            if identity.get("number") == phone_number:
                return identity.get("safety_number", identity.get("fingerprint", ""))
        return None
    def trust_identity(self, number_to_trust: str, *, trust_all_known_keys: bool = False) -> None:
        """Trust an identity (verify safety number)."""
        payload: dict[str, Any] = {}
        if trust_all_known_keys:
            payload["trust_all_known_keys"] = True
        response = self._client.put(
            f"/v1/identities/{self.phone_number}/trust/{number_to_trust}",
            json=payload,
        )
        response.raise_for_status()
    def reply(self, message: SignalMessage, text: str) -> None:
        """Reply to a message, routing to group or individual."""
        if message.group_id:
            self.send_to_group(message.group_id, text)
        else:
            self.send(message.source, text)
    def __enter__(self) -> Self:
        """Enter the context manager."""
        return self
    def __exit__(self, *args: object) -> None:
        """Close the HTTP client on exit."""
        self.close()
    def close(self) -> None:
        """Close the HTTP client."""
        self._client.close()
@@ -1 +0,0 @@
 """Audiobook tools."""
@@ -1,471 +0,0 @@
 """Convert Audible AAX downloads into Audiobookshelf-friendly M4B files."""
 from __future__ import annotations
 import json
 import logging
 import re
 import shutil
 import subprocess
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import asdict, dataclass
 from os import getenv
 from pathlib import Path  # noqa: TC003 This is required for the typer CLI
 from typing import TYPE_CHECKING, Annotated, Any
 from uuid import uuid7
 import typer
 from python.common import configure_logger
 from python.orm.common import get_postgres_engine
 from python.tools.audiobook.metadata_agent import (
    AgentConfig,
    StandardBookMetadata,
    standard_book_metadata,
    write_agent_log,
 )
 if TYPE_CHECKING:
    from sqlalchemy.engine import Engine
 logger = logging.getLogger(__name__)
 # Command-line flags whose following argument is replaced with "<redacted>" before logging.
 SENSITIVE_COMMAND_ARGUMENTS = {"-activation_bytes"}
 # Matches a "book-<start>-<end>" or "books-<start>-<end>" segment delimited by
 # hyphens or the string boundaries; both numbers must not start with zero.
 BOOK_RANGE_PATTERN = re.compile(r"(?:^|-)books?-(?P<start>[1-9]\d*)-(?P<end>[1-9]\d*)(?:-|$)")
@dataclass(frozen=True)
 class ConversionConfig:
    """Runtime settings for one conversion command."""
    # Absolute output root; final books and the work directory live beneath it.
    resolved_output: Path
    ollama_api_key: str
    agent_config: AgentConfig
    engine: Engine
    # Optional Audible activation bytes passed to ffmpeg; treated as a secret.
    activation_bytes: str | None
    dry_run: bool
    overwrite: bool
    # Names of the work directory and its subdirectories under resolved_output.
    work_directory_name: str = ".audible_convert"
    dry_run_directory_name: str = "dry-run"
    temp_directory_name: str = "tmp"
    log_directory_name: str = "logs"
    review_directory_name: str = "review"
@dataclass(frozen=True)
 class ConcurrentConversionResult:
    """Result from running ffmpeg and metadata resolution together."""
    # Resolved metadata, or None when metadata resolution raised.
    metadata: StandardBookMetadata | None
    # Exception raised by the ffmpeg conversion task, if any.
    conversion_error: Exception | None
    # Exception raised by the metadata resolution task, if any.
    metadata_error: Exception | None
 class CommandExecutionError(RuntimeError):
    """A command exited non-zero; the message shows the command with secrets redacted."""

    def __init__(self, arguments: list[str], returncode: int) -> None:
        """Build the failure with a redacted command line in its message.

        The unredacted arguments are kept on ``self.arguments``; only the
        exception message is redacted.
        """
        self.arguments = tuple(arguments)
        self.returncode = returncode
        safe_command_line = " ".join(redact_command_arguments(arguments))
        summary = f"Command failed with exit code {returncode}: {safe_command_line}"
        super().__init__(summary)
 def main(
    input_directory: Annotated[Path, typer.Argument(help="Directory audible-cli downloads AAX files into.")],
    output_directory: Annotated[Path, typer.Argument(help="Audiobook output directory.")],
    *,
    dry_run: Annotated[
        bool,
        typer.Option("--dry-run", help="Print planned output files and write marker files without converting."),
    ] = False,
    overwrite: Annotated[bool, typer.Option("--overwrite", help="Overwrite existing M4B files.")] = False,
 ) -> None:
    """Convert every ``*.aax`` file in the input directory into an M4B file.

    The input directory must exist. The output directory is created unless
    this is a dry run. Files are processed in sorted order.

    Raises:
        RuntimeError: If OLLAMA_API_KEY is unset or empty.
    """
    configure_logger()
    source_root = input_directory.resolve(strict=True)
    output_root = output_directory.resolve()
    if not dry_run:
        output_root.mkdir(parents=True, exist_ok=True)

    api_key = getenv("OLLAMA_API_KEY")
    if not api_key:
        msg = "OLLAMA_API_KEY is required for audiobook metadata resolution"
        raise RuntimeError(msg)

    settings = ConversionConfig(
        resolved_output=output_root,
        ollama_api_key=api_key,
        agent_config=AgentConfig(),
        engine=get_postgres_engine(name="RICHIE"),
        activation_bytes=getenv("AUDIBLE_ACTIVATION_BYTES"),
        dry_run=dry_run,
        overwrite=overwrite,
    )

    pending = sorted(source_root.glob("*.aax"))
    if not pending:
        logger.info("No AAX files found in %s", source_root)
        return
    for source_file in pending:
        logger.info("Converting %s", source_file)
        convert_aax_file_with_agent(source_file, settings)
 def run_command(arguments: list[str], *, capture: bool = False) -> subprocess.CompletedProcess[str]:
    """Run a command and return the completed process.

    Args:
        arguments: Command and arguments to run.
        capture: Whether to capture stdout and stderr.

    Returns:
        The completed process.

    Raises:
        CommandExecutionError: If the command exits with a non-zero status.
            Its message contains the redacted command line only.
    """
    logger.debug("%s", " ".join(redact_command_arguments(arguments)))
    try:
        return subprocess.run(arguments, check=True, capture_output=capture, text=True)
    except subprocess.CalledProcessError as error:
        # CalledProcessError's message embeds the full, unredacted command.
        # Chaining it as the cause would print sensitive arguments in any
        # traceback, so the chain is suppressed. The original error stays
        # reachable through __context__.
        raise CommandExecutionError(arguments, error.returncode) from None
 def redact_command_arguments(arguments: list[str]) -> list[str]:
    """Return a copy of the arguments with the value after each sensitive flag masked.

    The argument immediately following any member of
    ``SENSITIVE_COMMAND_ARGUMENTS`` becomes ``"<redacted>"``. A masked value
    is never itself treated as a flag.
    """
    exhausted = object()
    remaining = iter(arguments)
    safe: list[str] = []
    for argument in remaining:
        safe.append(argument)
        if argument not in SENSITIVE_COMMAND_ARGUMENTS:
            continue
        # Consume the flag's value (if there is one) and mask it.
        if next(remaining, exhausted) is not exhausted:
            safe.append("<redacted>")
    return safe
 def read_metadata(aax_file: Path) -> dict[str, str]:
    """Read the format-level tags of an AAX file with ffprobe.

    Args:
        aax_file: AAX file to inspect.

    Returns:
        Tag names lower-cased and mapped to their values as strings; empty
        when ffprobe reports no format tags.
    """
    probe_command = ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", str(aax_file)]
    probe = run_command(probe_command, capture=True)
    parsed: dict[str, Any] = json.loads(probe.stdout)
    format_tags = parsed.get("format", {}).get("tags", {})
    lowered: dict[str, str] = {}
    for tag_name, tag_value in format_tags.items():
        lowered[str(tag_name).lower()] = str(tag_value)
    return lowered
 def output_stem(metadata: StandardBookMetadata) -> str:
    """Compose the file and directory stem for a book.

    Args:
        metadata: Book metadata.

    Returns:
        Stem in ``author-series_<index>-title`` form, where ``<index>`` comes
        from ``series_index_slug``.
    """
    index_part = series_index_slug(metadata.series_index, metadata.title)
    series_part = f"{metadata.series}_{index_part}"
    return f"{metadata.author}-{series_part}-{metadata.title}"
 def series_index_slug(series_index: float, title: str = "") -> str:
    """Return a filename-safe series index.

    An omnibus range found in the title takes precedence. Otherwise the whole
    part is zero-padded to two digits, and any non-integer index gets a
    ``.5`` suffix.
    """
    range_slug = title_series_range_slug(series_index, title)
    if range_slug:
        return range_slug
    numeric_index = float(series_index)
    whole_part = f"{int(numeric_index):02}"
    return whole_part if numeric_index.is_integer() else f"{whole_part}.5"
 def title_series_range_slug(series_index: float, title: str) -> str | None:
    """Return a ``start-end`` slug when the title names an omnibus range.

    Only whole-number indexes are considered. The first range in the title
    that begins at the series index and ends after it wins; otherwise the
    result is None.
    """
    numeric_index = float(series_index)
    if not numeric_index.is_integer():
        return None
    expected_start = int(numeric_index)
    ranges = (
        (int(found.group("start")), int(found.group("end")))
        for found in BOOK_RANGE_PATTERN.finditer(title)
    )
    matching = (
        f"{start:02}-{end:02}"
        for start, end in ranges
        if start == expected_start and end > start
    )
    return next(matching, None)
 def metadata_output_path(output_directory: Path, metadata: StandardBookMetadata) -> Path:
    """Return ``<output>/<stem>/<stem>.m4b`` for the resolved metadata."""
    book_stem = output_stem(metadata)
    book_directory = output_directory / book_stem
    return book_directory / f"{book_stem}.m4b"
 def convert_aax_file(
    aax_file: Path,
    destination: Path,
    activation_bytes: str | None,
    *,
    overwrite: bool,
 ) -> None:
    """Copy the streams of an AAX file into an M4B container with ffmpeg.

    Args:
        aax_file: Source AAX file.
        destination: Destination M4B file.
        activation_bytes: Optional Audible activation bytes for ffmpeg.
        overwrite: Whether to overwrite an existing M4B.
    """
    if destination.exists() and not overwrite:
        logger.info("Skipping existing file %s", destination)
        return
    destination.parent.mkdir(parents=True, exist_ok=True)

    overwrite_flag = "-y" if overwrite else "-n"
    command = ["ffmpeg", "-hide_banner", overwrite_flag]
    if activation_bytes:
        command += ["-activation_bytes", activation_bytes]
    # Stream copy with the source metadata carried over.
    command += ["-i", str(aax_file), "-map_metadata", "0", "-c", "copy", str(destination)]
    run_command(command)
 def write_review_file(
    *,
    destination: Path | None,
    ffprobe_metadata: dict[str, str],
    log_file: Path,
    metadata: StandardBookMetadata | None,
    reason: str,
    review_file: Path,
    source: Path,
    temp_file: Path | None,
 ) -> None:
    """Write a JSON file describing a conversion that needs manual review.

    The file records the source, the reason, whatever metadata and paths are
    known (missing ones as null), and is followed by a ``review_written``
    entry in the agent log.
    """
    review_file.parent.mkdir(parents=True, exist_ok=True)
    destination_text = str(destination) if destination else None
    temp_text = str(temp_file) if temp_file else None
    metadata_fields = asdict(metadata) if metadata else None
    record = {
        "destination": destination_text,
        "ffprobe_metadata": ffprobe_metadata,
        "metadata": metadata_fields,
        "reason": reason,
        "source": str(source),
        "temp_file": temp_text,
    }
    serialized = json.dumps(record, indent=2, sort_keys=True)
    review_file.write_text(serialized, encoding="utf-8")
    write_agent_log(log_file, "review_written", path=str(review_file), reason=reason)
 def cleanup_temp_output(temp_file: Path) -> None:
    """Delete the directory that contains ``temp_file``, ignoring any errors."""
    run_directory = temp_file.parent
    shutil.rmtree(run_directory, ignore_errors=True)
 def dry_run_aax_file_with_agent(
    aax_file: Path,
    ffprobe_metadata: dict[str, str],
    engine: Engine,
    config: ConversionConfig,
    log_file: Path,
    review_file: Path,
 ) -> None:
    """Resolve metadata and report the planned output path without converting.

    When the metadata needs review, a review file is written and announced.
    Otherwise a marker file containing the planned destination is written
    under the dry-run work directory and the mapping is echoed.
    """
    metadata = standard_book_metadata(
        aax_file.name,
        ffprobe_metadata,
        engine,
        log_file,
        config.ollama_api_key,
        config.agent_config,
    )
    if metadata.needs_review:
        write_review_file(
            destination=None,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=metadata,
            reason="metadata_needs_review",
            review_file=review_file,
            source=aax_file,
            temp_file=None,
        )
        typer.echo(f"{aax_file} -> REVIEW {review_file}")
        return

    planned_destination = metadata_output_path(config.resolved_output, metadata)
    book_stem = output_stem(metadata)
    marker_root = config.resolved_output / config.work_directory_name / config.dry_run_directory_name
    marker_file = marker_root / book_stem / f"{book_stem}.m4b"
    marker_file.parent.mkdir(parents=True, exist_ok=True)
    # The marker is a text file holding the path the real run would produce.
    marker_file.write_text(f"{planned_destination}\n", encoding="utf-8")
    write_agent_log(
        log_file,
        "dry_run_file_written",
        destination=str(planned_destination),
        path=str(marker_file),
    )
    typer.echo(f"{aax_file} -> {planned_destination}")
 def convert_temp_file_and_resolve_metadata(
    aax_file: Path,
    temp_file: Path,
    ffprobe_metadata: dict[str, str],
    config: ConversionConfig,
    log_file: Path,
 ) -> ConcurrentConversionResult:
    """Run the ffmpeg conversion and the metadata agent side by side.

    Both tasks are waited for. Failures are returned on the result rather
    than raised, so the caller can decide what to do with partial success.
    """
    with ThreadPoolExecutor(max_workers=2) as pool:
        ffmpeg_job = pool.submit(
            convert_aax_file,
            aax_file,
            temp_file,
            config.activation_bytes,
            overwrite=True,
        )
        agent_job = pool.submit(
            standard_book_metadata,
            aax_file.name,
            ffprobe_metadata,
            config.engine,
            log_file,
            config.ollama_api_key,
            config.agent_config,
        )
        # exception() blocks until the job has finished.
        ffmpeg_failure = ffmpeg_job.exception()
        agent_failure = agent_job.exception()
        resolved = agent_job.result() if agent_failure is None else None
    return ConcurrentConversionResult(
        metadata=resolved,
        conversion_error=ffmpeg_failure,
        metadata_error=agent_failure,
    )
 def convert_aax_file_with_agent(aax_file: Path, config: ConversionConfig) -> None:
    """Convert one AAX file using the metadata agent for the final path.

    Each call gets its own run ID, which names the log file, the review file
    and the temporary output directory under the work directory. Failures at
    any stage are recorded in a review file instead of being raised.

    Args:
        aax_file: Source AAX file.
        config: Settings for this conversion command.
    """
    run_id = uuid7().hex
    log_file = config.resolved_output / config.work_directory_name / config.log_directory_name / f"{run_id}.jsonl"
    review_file = config.resolved_output / config.work_directory_name / config.review_directory_name / f"{run_id}.json"
    write_agent_log(log_file, "conversion_start", source=str(aax_file), dry_run=config.dry_run)
    # Stage 1: read the source tags. Without them nothing else can proceed.
    try:
        ffprobe_metadata = read_metadata(aax_file)
    except Exception as error:
        logger.exception("ffprobe failed")
        write_review_file(
            destination=None,
            ffprobe_metadata={},
            log_file=log_file,
            metadata=None,
            reason=f"ffprobe_failed: {error}",
            review_file=review_file,
            source=aax_file,
            temp_file=None,
        )
        return
    # Dry runs resolve metadata only and never invoke ffmpeg.
    if config.dry_run:
        dry_run_aax_file_with_agent(
            aax_file,
            ffprobe_metadata,
            config.engine,
            config,
            log_file,
            review_file,
        )
        return
    # Stage 2: convert into a per-run temporary file while metadata is resolved.
    temp_file = (
        config.resolved_output / config.work_directory_name / config.temp_directory_name / run_id / "converted.m4b"
    )
    temp_file.parent.mkdir(parents=True, exist_ok=True)
    result = convert_temp_file_and_resolve_metadata(aax_file, temp_file, ffprobe_metadata, config, log_file)
    if result.conversion_error:
        reason = f"ffmpeg_failed: {result.conversion_error}"
        write_review_file(
            destination=None,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=result.metadata,
            reason=reason,
            review_file=review_file,
            source=aax_file,
            # A failed ffmpeg run may or may not have left a partial file behind.
            temp_file=temp_file if temp_file.exists() else None,
        )
        return
    # From here on the converted temp file exists and is left in place for review.
    if result.metadata_error:
        write_review_file(
            destination=None,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=None,
            reason=f"metadata_failed: {result.metadata_error}",
            review_file=review_file,
            source=aax_file,
            temp_file=temp_file,
        )
        return
    if result.metadata is None or result.metadata.needs_review:
        write_review_file(
            destination=None,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=result.metadata,
            reason="metadata_needs_review",
            review_file=review_file,
            source=aax_file,
            temp_file=temp_file,
        )
        return
    # Stage 3: move the converted file to its final, metadata-derived location.
    destination = metadata_output_path(config.resolved_output, result.metadata)
    if destination.exists() and not config.overwrite:
        # Keep the existing book and discard this run's conversion.
        write_agent_log(log_file, "destination_exists", destination=str(destination))
        cleanup_temp_output(temp_file)
        return
    destination.parent.mkdir(parents=True, exist_ok=True)
    try:
        temp_file.replace(destination)
    except Exception as error:  # noqa: BLE001
        write_review_file(
            destination=destination,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=result.metadata,
            reason=f"rename_failed: {error}",
            review_file=review_file,
            source=aax_file,
            temp_file=temp_file if temp_file.exists() else None,
        )
    else:
        # The move succeeded; remove the now-empty per-run temp directory.
        cleanup_temp_output(temp_file)
        write_agent_log(log_file, "conversion_complete", destination=str(destination))
 # Script entry point: typer builds the command-line interface from main's signature.
 if __name__ == "__main__":
    typer.run(main)
@@ -1,176 +0,0 @@
 """Import audiobook catalog authors and series from CSV files."""
 from __future__ import annotations
 import csv
 import logging
 from pathlib import Path  # noqa: TC003 This is required for the typer CLI
 from typing import Annotated
 import typer
 from sqlalchemy import select
 from sqlalchemy.orm import Session
 from python.common import configure_logger
 from python.orm.common import get_postgres_engine
 from python.orm.richie import AudiobookAuthor, AudiobookSeries
 logger = logging.getLogger(__name__)
 AUTHOR_NAME_COLUMN = "author_name"
 ID_COLUMN = "id"
 NAME_COLUMN = "name"
 class CatalogImportError(ValueError):
    """CSV catalog import failed validation.

    Raised by the CSV helpers in this module with a ``path:row: reason``
    message; ``main`` catches it, echoes the message to stderr, and exits
    with status 1.
    """
 def main(
    authors_csv: Annotated[Path, typer.Argument(help="CSV with name and optional id.")],
    series_csv: Annotated[Path, typer.Argument(help="CSV with name, author_name, and optional id.")],
 ) -> None:
    """Upsert audiobook authors and series from CSV files."""
    # NOTE: Typer shows the docstring above as CLI help text, so it is kept verbatim.
    configure_logger()
    try:
        with Session(get_postgres_engine(name="RICHIE")) as db:
            # Authors are imported first because every series row looks its author up by name.
            imported_authors = upsert_authors_from_csv(db, authors_csv)
            imported_series = upsert_series_from_csv(db, series_csv)
            # The commit is reached only when both files were processed without a validation error.
            db.commit()
    except CatalogImportError as failure:
        # Validation problems become a message on stderr and exit status 1.
        typer.echo(str(failure), err=True)
        raise typer.Exit(code=1) from failure
    else:
        logger.info("Upserted %s authors and %s series", imported_authors, imported_series)
 def upsert_authors_from_csv(session: Session, authors_csv: Path) -> int:
    """Upsert one author per data row of ``authors_csv``.

    Args:
        session: Open database session; rows are flushed but not committed here.
        authors_csv: CSV file with a required ``name`` column and optional ``id``.

    Returns:
        The number of data rows processed.

    Raises:
        CatalogImportError: If a row lacks a name or carries a non-integer id.
    """
    processed = 0
    for line_number, record in csv_rows(authors_csv):
        author_name = required_csv_value(record, authors_csv, line_number, NAME_COLUMN)
        explicit_id = csv_id(record, authors_csv, line_number)
        upsert_author(session, author_name, explicit_id)
        processed += 1
    return processed
 def upsert_series_from_csv(session: Session, series_csv: Path) -> int:
    """Upsert one series per data row of ``series_csv``.

    Every row must name an author that already exists under exactly that
    name; the import stops at the first row whose author cannot be found.

    Returns:
        The number of data rows processed.

    Raises:
        CatalogImportError: On a missing required value, a non-integer id,
            or an unknown author name.
    """
    handled = 0
    for line_number, record in csv_rows(series_csv):
        title = required_csv_value(record, series_csv, line_number, NAME_COLUMN)
        owner_name = required_csv_value(record, series_csv, line_number, AUTHOR_NAME_COLUMN)
        owner = find_author_by_name(session, owner_name)
        if owner is not None:
            upsert_series(session, title, owner, csv_id(record, series_csv, line_number))
            handled += 1
            continue
        msg = f"{series_csv}:{line_number}: author not found: {owner_name}"
        raise CatalogImportError(msg)
    return handled
 def upsert_author(session: Session, name: str, author_id: int | None) -> AudiobookAuthor:
    """Insert or update a single author.

    With an explicit ``author_id`` the row is looked up by primary key and
    either created with that id or renamed to ``name``. Without an id the
    author is matched by exact name and created only when no match exists.

    Returns:
        The existing or newly added author (flushed whenever it was written).
    """
    if author_id is None:
        # No id supplied: the exact name is the identity.
        by_name = find_author_by_name(session, name)
        if by_name is not None:
            return by_name
        by_name = AudiobookAuthor(name=name)
        session.add(by_name)
        session.flush()
        return by_name
    by_id = session.get(AudiobookAuthor, author_id)
    if by_id is not None:
        # The id wins over the name: an existing row is renamed.
        by_id.name = name
    else:
        by_id = AudiobookAuthor(id=author_id, name=name)
        session.add(by_id)
    session.flush()
    return by_id
 def upsert_series(
    session: Session,
    name: str,
    author: AudiobookAuthor,
    series_id: int | None,
 ) -> AudiobookSeries:
    """Insert or update a single series.

    With an explicit ``series_id`` the row is looked up by primary key and
    either created with that id or updated to the given name and author.
    Without an id the series is matched by exact name within ``author`` and
    created only when no match exists.

    Returns:
        The existing or newly added series (flushed whenever it was written).
    """
    if series_id is None:
        # No id supplied: (name, author) is the identity.
        matched = find_series_by_name_and_author(session, name, author.id)
        if matched is not None:
            return matched
        matched = AudiobookSeries(name=name, author=author)
        session.add(matched)
        session.flush()
        return matched
    by_id = session.get(AudiobookSeries, series_id)
    if by_id is not None:
        by_id.name = name
        by_id.author = author
    else:
        by_id = AudiobookSeries(id=series_id, name=name, author=author)
        session.add(by_id)
    session.flush()
    return by_id
 def find_author_by_name(session: Session, name: str) -> AudiobookAuthor | None:
    """Look up an author whose name equals ``name`` exactly, or return None."""
    exact_name_query = select(AudiobookAuthor).where(AudiobookAuthor.name == name)
    return session.scalar(exact_name_query)
 def find_series_by_name_and_author(
    session: Session,
    name: str,
    author_id: int,
 ) -> AudiobookSeries | None:
    """Look up a series by exact name within one author, or return None."""
    # Both conditions must hold; the name comparison is an exact equality.
    same_name = AudiobookSeries.name == name
    same_author = AudiobookSeries.author_id == author_id
    return session.scalar(select(AudiobookSeries).where(same_name, same_author))
 def csv_rows(csv_path: Path) -> list[tuple[int, dict[str, str | None]]]:
    """Load every data row of a UTF-8 CSV file together with its row number.

    Numbering starts at 2 because the header occupies the first row.

    Raises:
        CatalogImportError: If the file has no header row at all.
    """
    with csv_path.open(newline="", encoding="utf-8") as handle:
        parsed = csv.DictReader(handle)
        # ``fieldnames`` is None only when there is no header line to read.
        if parsed.fieldnames is None:
            msg = f"{csv_path}: missing CSV header"
            raise CatalogImportError(msg)
        # Materialise inside the ``with`` so the file is still open while reading.
        return list(enumerate(parsed, start=2))
 def required_csv_value(
    row: dict[str, str | None],
    csv_path: Path,
    row_number: int,
    column: str,
 ) -> str:
    """Return the whitespace-trimmed value of a mandatory CSV column.

    Raises:
        CatalogImportError: If the column is absent, empty, or only whitespace.
    """
    raw = row.get(column)
    trimmed = raw.strip() if raw else ""
    if trimmed:
        return trimmed
    msg = f"{csv_path}:{row_number}: missing required column value: {column}"
    raise CatalogImportError(msg)
 def csv_id(row: dict[str, str | None], csv_path: Path, row_number: int) -> int | None:
    """Read the optional id column from a CSV row.

    Args:
        row: One parsed CSV row.
        csv_path: Source file, used only for the error message.
        row_number: Row number in the file, used only for the error message.

    Returns:
        The id as an integer, or None when the column is absent or blank.

    Raises:
        CatalogImportError: If the column holds text that is not an integer.
    """
    value = row.get(ID_COLUMN)
    if value is None or not value.strip():
        return None
    try:
        return int(value)
    except ValueError as error:
        msg = f"{csv_path}:{row_number}: id must be an integer: {value}"
        raise CatalogImportError(msg) from error
 # Script entry point: expose ``main`` as a Typer CLI when this file is run directly.
 if __name__ == "__main__":
    typer.run(main)
@@ -1,599 +0,0 @@
 """LLM tool calling support for audiobook metadata resolution."""
 from __future__ import annotations
 import json
 import re
 import time
 from collections.abc import Callable
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 from sqlalchemy import or_, select
 from python.orm.richie import Audiobook, AudiobookAuthor, AudiobookSeries
 if TYPE_CHECKING:
    from pathlib import Path
    from sqlalchemy.orm import Session
    from python.tools.audiobook.metadata_agent import AgentConfig
 CATALOG_SLUG_PATTERN = re.compile(r"^[a-z0-9]+(?:_[a-z0-9]+)*$")
 TITLE_SLUG_PATTERN = re.compile(r"^[a-z0-9]+(?:-[a-z0-9]+)*$")
 LogWriter = Callable[..., None]
 class MetadataResolutionError(ValueError):
    """Metadata resolution failed validation.

    Raised by the catalog tools and argument readers in this module.
    ``run_tool_calls`` reports most of these back to the model as a tool
    error and stops only for the messages ``is_fatal_tool_error`` recognises.
    """
@dataclass(frozen=True)
 class EnsuredBook:
    """Book row plus whether it was created."""
    # The catalog row that was found or inserted.
    book: Audiobook
    # "created" when the row was inserted by this call, otherwise "existing".
    action: str
 class CatalogToolRegistry:
    """Controlled catalog tools exposed to the metadata model."""
    def __init__(
        self,
        session: Session,
        log_path: Path,
        config: AgentConfig,
        write_log: LogWriter,
    ) -> None:
        """Create a registry bound to one database session and audit log."""
        self.session = session
        self.log_path = log_path
        self.config = config
        self.write_log = write_log
        self.seen_author_ids: set[int] = set()
        self.seen_series_ids: set[int] = set()
        self.seen_book_ids: set[int] = set()
        self.created_author_ids: set[int] = set()
        self.created_series_ids: set[int] = set()
        self.created_book_ids: set[int] = set()
    def tool_schemas(self) -> list[dict[str, object]]:
        """Return Ollama tool schemas."""
        schemas = [
            {
                "type": "function",
                "function": {
                    "name": "search_authors",
                    "description": "Search canonical audiobook authors by slug or noisy source text.",
                    "parameters": {
                        "type": "object",
                        "properties": {"query": {"type": "string"}},
                        "required": ["query"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "search_series",
                    "description": "Search canonical audiobook series by slug or noisy source text.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {"type": "string"},
                            "author_id": {"type": ["integer", "null"]},
                        },
                        "required": ["query"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "search_books",
                    "description": "Search canonical audiobook titles with optional author and series filters.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {"type": "string"},
                            "author_id": {"type": ["integer", "null"]},
                            "series_id": {"type": ["integer", "null"]},
                        },
                        "required": ["query"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "ensure_author",
                    "description": "Normalize an author name to a catalog slug, then return or create that author.",
                    "parameters": {
                        "type": "object",
                        "properties": {"name": {"type": "string"}},
                        "required": ["name"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "ensure_series",
                    "description": "Normalize a series name to a catalog slug, then return or create it for an author.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "author_id": {"type": "integer"},
                        },
                        "required": ["name", "author_id"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "ensure_book",
                    "description": "Normalize a title to a book slug, then return or create it for an author/series.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "title": {"type": "string"},
                            "author_id": {"type": "integer"},
                            "series_id": {"type": ["integer", "null"]},
                            "series_index": {"type": "number", "multipleOf": 0.5},
                        },
                        "required": ["title", "author_id", "series_id", "series_index"],
                    },
                },
            },
        ]
        enabled_tool_names = set(self.config.tool_names)
        return [schema for schema in schemas if schema["function"]["name"] in enabled_tool_names]
    def run(self, name: str, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Run one catalog tool and audit the call."""
        handlers = {
            "search_authors": self.run_search_authors,
            "search_series": self.run_search_series,
            "search_books": self.run_search_books,
            "ensure_author": self.run_ensure_author,
            "ensure_series": self.run_ensure_series,
            "ensure_book": self.run_ensure_book,
        }
        handler = handlers.get(name)
        if handler is None:
            self.write_log(self.log_path, "tool_error", tool=name, arguments=arguments, error="unknown_tool")
            msg = f"Unknown audiobook metadata tool: {name}"
            raise MetadataResolutionError(msg)
        if name not in self.config.tool_names:
            self.write_log(self.log_path, "tool_error", tool=name, arguments=arguments, error="tool_not_enabled")
            msg = f"Audiobook metadata tool is not enabled: {name}"
            raise MetadataResolutionError(msg)
        started = time.perf_counter()
        self.write_log(self.log_path, "tool_call", tool=name, arguments=arguments)
        result = handler(arguments)
        duration_ms = round((time.perf_counter() - started) * 1000, 3)
        self.write_log(
            self.log_path,
            "tool_result",
            tool=name,
            duration_ms=duration_ms,
            result_count=len(result),
            preview=result[:3],
        )
        return result
    def get_author(self, author_id: int) -> AudiobookAuthor | None:
        """Fetch the author with primary key ``author_id``, or None when absent."""
        author = self.session.get(AudiobookAuthor, author_id)
        return author
    def get_book(self, book_id: int) -> Audiobook | None:
        """Fetch the book with primary key ``book_id``, or None when absent."""
        book = self.session.get(Audiobook, book_id)
        return book
    def get_series(self, series_id: int) -> AudiobookSeries | None:
        """Fetch the series with primary key ``series_id``, or None when absent."""
        series = self.session.get(AudiobookSeries, series_id)
        return series
    def prune_unused_created_rows(self, *, author_id: int, book_id: int | None, series_id: int | None) -> None:
        """Remove catalog rows created during this run but not used by final metadata.

        Only ids recorded in ``created_book_ids``, ``created_series_ids`` and
        ``created_author_ids`` are candidates; rows that existed before this
        registry was used are never touched. Deletion runs books first, then
        series, then authors. Nothing is committed here, and the author
        deletions are not flushed by this method.

        Args:
            author_id: Author referenced by the final metadata; always kept.
            book_id: Book referenced by the final metadata, or None.
            series_id: Series referenced by the final metadata, or None.
        """
        used_book_ids = {book_id} if book_id is not None else set()
        # Every book created in this run other than the final one is deleted outright.
        for created_book_id in self.created_book_ids - used_book_ids:
            if book := self.get_book(created_book_id):
                self.session.delete(book)
        # Flush the book deletions before series are inspected.
        # NOTE(review): presumably so ``series.books`` reflects them - confirm relationship loading.
        self.session.flush()
        used_series_ids = {series_id} if series_id is not None else set()
        for created_series_id in self.created_series_ids - used_series_ids:
            series = self.get_series(created_series_id)
            # A created series is removed only when no book still refers to it.
            if series and not series.books:
                self.session.delete(series)
        self.session.flush()
        for created_author_id in self.created_author_ids - {author_id}:
            author = self.get_author(created_author_id)
            # A created author is removed only when it owns neither books nor series.
            if author and not author.books and not author.series:
                self.session.delete(author)
    def run_search_authors(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Handle ``search_authors``: match author names and remember the ids returned.

        Each variant from ``query_terms`` is matched as a case-insensitive
        substring of the author name, and a row qualifies when any variant
        matches. Rows are ordered by name and capped at ``config.max_tool_results``.
        """
        query = required_string(arguments, "query")
        statement = select(AudiobookAuthor).order_by(AudiobookAuthor.name).limit(self.config.max_tool_results)
        terms = query_terms(query)
        if terms:
            name_matches = [AudiobookAuthor.name.ilike(f"%{term}%") for term in terms]
            statement = statement.where(or_(*name_matches))
        authors = self.session.scalars(statement).all()
        self.seen_author_ids.update({author.id for author in authors})
        payload = []
        for author in authors:
            payload.append({"id": author.id, "name": author.name})
        return payload
    def run_search_series(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Handle ``search_series``: match series names and remember the ids returned.

        Matching follows the same rule as the author search (any
        ``query_terms`` variant as a case-insensitive substring). An optional
        ``author_id`` narrows the search to one author. The ids of the owning
        authors are recorded as seen too.
        """
        query = required_string(arguments, "query")
        author_id = optional_int(arguments.get("author_id"), "author_id")
        statement = select(AudiobookSeries).order_by(AudiobookSeries.name).limit(self.config.max_tool_results)
        terms = query_terms(query)
        if terms:
            name_matches = [AudiobookSeries.name.ilike(f"%{term}%") for term in terms]
            statement = statement.where(or_(*name_matches))
        if author_id is not None:
            statement = statement.where(AudiobookSeries.author_id == author_id)
        found = self.session.scalars(statement).all()
        self.seen_series_ids.update({row.id for row in found})
        self.seen_author_ids.update({row.author_id for row in found})
        payload = []
        for row in found:
            payload.append(
                {
                    "id": row.id,
                    "name": row.name,
                    "author_id": row.author_id,
                    "author": row.author.name,
                },
            )
        return payload
    def run_search_books(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Handle ``search_books``: match book titles and remember the ids returned.

        Titles are matched against every ``query_terms`` variant as a
        case-insensitive substring. Optional ``author_id`` and ``series_id``
        narrow the search. Books without a series are reported under
        ``config.standalone_series``.
        """
        query = required_string(arguments, "query")
        author_id = optional_int(arguments.get("author_id"), "author_id")
        series_id = optional_int(arguments.get("series_id"), "series_id")
        statement = select(Audiobook).order_by(Audiobook.title).limit(self.config.max_tool_results)
        terms = query_terms(query)
        if terms:
            title_matches = [Audiobook.title.ilike(f"%{term}%") for term in terms]
            statement = statement.where(or_(*title_matches))
        if author_id is not None:
            statement = statement.where(Audiobook.author_id == author_id)
        if series_id is not None:
            statement = statement.where(Audiobook.series_id == series_id)
        found = self.session.scalars(statement).all()
        self.seen_book_ids.update({row.id for row in found})
        self.seen_author_ids.update({row.author_id for row in found})
        # Standalone books have no series id to remember.
        self.seen_series_ids.update({row.series_id for row in found if row.series_id is not None})
        payload = []
        for row in found:
            series_label = row.series.name if row.series else self.config.standalone_series
            payload.append(
                {
                    "id": row.id,
                    "title": row.title,
                    "author_id": row.author_id,
                    "author": row.author.name,
                    "series_id": row.series_id,
                    "series": series_label,
                    "series_index": row.series_index,
                },
            )
        return payload
    def run_ensure_author(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Handle ``ensure_author``: return the author for a name, creating it when missing.

        The name is first normalized to a catalog slug and validated; the
        lookup is an exact match on that slug. Newly inserted ids are recorded
        in ``created_author_ids``.

        Returns:
            A one-element list with the author's id, name, and an ``action``
            of ``"created"`` or ``"existing"``.
        """
        slug = normalize_catalog_slug(required_string(arguments, "name"))
        validate_catalog_slug(slug, "author")
        lookup = select(AudiobookAuthor).where(AudiobookAuthor.name == slug)
        author = self.session.scalar(lookup)
        if author is not None:
            action = "existing"
        else:
            author = AudiobookAuthor(name=slug)
            self.session.add(author)
            # Flush so the new row has an id before it is recorded.
            self.session.flush()
            self.created_author_ids.add(author.id)
            action = "created"
        self.seen_author_ids.add(author.id)
        return [{"id": author.id, "name": author.name, "action": action}]
    def run_ensure_series(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Handle ``ensure_series``: return an author's series for a name, creating it when missing.

        The name is normalized to a catalog slug and validated, and the author
        must exist. An existing series is found by exact slug or by a slug
        that differs only in underscores.

        Returns:
            A one-element list built by ``series_result``.
        """
        slug = normalize_catalog_slug(required_string(arguments, "name"))
        author_id = required_int(arguments, "author_id")
        validate_catalog_slug(slug, "series")
        author = self.required_author(author_id)
        series = self.find_series_by_catalog_slug(slug, author.id)
        if series is not None:
            action = "existing"
        else:
            series = AudiobookSeries(name=slug, author=author)
            self.session.add(series)
            # Flush so the new row has an id before it is recorded.
            self.session.flush()
            self.created_series_ids.add(series.id)
            action = "created"
        self.seen_author_ids.add(author.id)
        self.seen_series_ids.add(series.id)
        return [self.series_result(series, action)]
    def run_ensure_book(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Handle ``ensure_book``: validate the tool arguments and delegate to ``ensure_book``.

        Returns:
            A one-element list built by ``book_result``.
        """
        outcome = self.ensure_book(
            required_string(arguments, "title"),
            required_int(arguments, "author_id"),
            optional_int(arguments.get("series_id"), "series_id"),
            required_series_index(arguments, "series_index"),
        )
        return [self.book_result(outcome.book, outcome.action)]
    def ensure_book(
        self,
        title: str,
        author_id: int,
        series_id: int | None,
        series_index: float,
    ) -> EnsuredBook:
        """Find the book for a title/author/series combination, inserting it when missing.

        The title is normalized to a kebab-case slug and validated. Standalone
        books (``series_id`` is None) must use ``series_index`` 0. Series books
        need a series owned by the same author and a positive index.

        Returns:
            The book row together with ``"created"`` or ``"existing"``.

        Raises:
            MetadataResolutionError: On an invalid slug, an unknown author or
                series, a series owned by another author, or a bad index.
        """
        title = normalize_title_slug(title)
        validate_title_slug(title)
        author = self.required_author(author_id)
        series = None
        if series_id is not None:
            series = self.required_series(series_id)
            if series.author_id != author.id:
                msg = f"series_id {series_id} does not belong to author_id {author_id}"
                raise MetadataResolutionError(msg)
            if series_index <= 0:
                msg = "series books must use a positive series_index"
                raise MetadataResolutionError(msg)
        elif series_index != 0:
            msg = "standalone books must use series_index 0"
            raise MetadataResolutionError(msg)
        # Identity of a book: same title and author, and the same series (or no series at all).
        if series is None:
            series_filter = Audiobook.series_id.is_(None)
        else:
            series_filter = Audiobook.series_id == series.id
        lookup = select(Audiobook).where(
            Audiobook.title == title,
            Audiobook.author_id == author.id,
        )
        book = self.session.scalar(lookup.where(series_filter))
        action = "existing"
        if book is None:
            book = Audiobook(title=title, author=author, series=series, series_index=series_index)
            self.session.add(book)
            # Flush so the new row has an id before it is recorded.
            self.session.flush()
            self.created_book_ids.add(book.id)
            action = "created"
        self.seen_book_ids.add(book.id)
        self.seen_author_ids.add(author.id)
        if book.series_id is not None:
            self.seen_series_ids.add(book.series_id)
        return EnsuredBook(book=book, action=action)
    def required_author(self, author_id: int) -> AudiobookAuthor:
        """Return an author or fail metadata resolution."""
        author = self.get_author(author_id)
        if author is None:
            msg = f"author_id {author_id} does not exist"
            raise MetadataResolutionError(msg)
        return author
    def required_series(self, series_id: int) -> AudiobookSeries:
        """Return a series or fail metadata resolution."""
        series = self.get_series(series_id)
        if series is None:
            msg = f"series_id {series_id} does not exist"
            raise MetadataResolutionError(msg)
        return series
    def find_series_by_catalog_slug(self, name: str, author_id: int) -> AudiobookSeries | None:
        """Find an author's series by slug, tolerating differences in underscores.

        An exact name match wins. Otherwise the author's series are scanned
        in name order and the first whose underscore-free slug equals that of
        ``name`` is returned. Returns None when nothing matches.
        """
        exact_query = select(AudiobookSeries).where(
            AudiobookSeries.name == name,
            AudiobookSeries.author_id == author_id,
        )
        exact = self.session.scalar(exact_query)
        if exact is not None:
            return exact
        wanted = compact_catalog_slug(name)
        by_author = select(AudiobookSeries).where(AudiobookSeries.author_id == author_id).order_by(AudiobookSeries.name)
        candidates = self.session.scalars(by_author).all()
        return next(
            (candidate for candidate in candidates if compact_catalog_slug(candidate.name) == wanted),
            None,
        )
    def series_result(self, series: AudiobookSeries, action: str) -> dict[str, object]:
        """Build a normalized series tool result."""
        return {
            "id": series.id,
            "name": series.name,
            "author_id": series.author_id,
            "author": series.author.name,
            "action": action,
        }
    def book_result(self, book: Audiobook, action: str) -> dict[str, object]:
        """Build a normalized book tool result."""
        return {
            "id": book.id,
            "title": book.title,
            "author_id": book.author_id,
            "author": book.author.name,
            "series_id": book.series_id,
            "series": book.series.name if book.series else self.config.standalone_series,
            "series_index": book.series_index,
            "action": action,
        }
 def run_tool_calls(
    messages: list[dict[str, object]],
    message: dict[str, object],
    tool_calls: list[tuple[str, dict[str, object]]],
    registry: CatalogToolRegistry,
    log_path: Path,
    write_log: LogWriter,
 ) -> str | None:
    """Execute the model's tool calls in order and append the replies to the conversation.

    ``message`` is appended to ``messages`` first. Each call then adds one
    ``tool`` message holding either the JSON-encoded result or, for a
    recoverable failure, a JSON ``{"error": ...}`` object.

    Returns:
        None when every call was answered, or the error text of the first
        fatal error (see ``is_fatal_tool_error``), at which point processing stops.
    """
    messages.append(message)
    for tool_name, arguments in tool_calls:
        try:
            payload: object = registry.run(tool_name, arguments)
        except MetadataResolutionError as error:
            if is_fatal_tool_error(error):
                return str(error)
            # Recoverable: log it and report it to the model as the tool's reply.
            write_log(log_path, "tool_error", tool=tool_name, arguments=arguments, error=str(error))
            payload = {"error": str(error)}
        messages.append(
            {
                "role": "tool",
                "tool_name": tool_name,
                "content": json.dumps(payload, sort_keys=True),
            },
        )
    return None
 def parse_tool_calls(message: dict[str, object]) -> list[tuple[str, dict[str, object]]]:
    """Extract ``(tool name, arguments)`` pairs from an Ollama response message.

    A missing or empty ``tool_calls`` entry yields an empty list.

    Raises:
        MetadataResolutionError: If ``tool_calls`` is not a list, or a call
            is not an object with a ``function`` object carrying a non-empty name.
    """

    def parse_one(raw_call: object) -> tuple[str, dict[str, object]]:
        # Validate the shape of one call: {"function": {"name": ..., "arguments": ...}}.
        if not isinstance(raw_call, dict):
            msg = "tool call must be an object"
            raise MetadataResolutionError(msg)
        function = raw_call.get("function")
        if not isinstance(function, dict):
            msg = "tool call is missing function"
            raise MetadataResolutionError(msg)
        tool_name = function.get("name")
        if not (isinstance(tool_name, str) and tool_name):
            msg = "tool call is missing function name"
            raise MetadataResolutionError(msg)
        return tool_name, parse_tool_arguments(function.get("arguments", {}))

    declared = message.get("tool_calls") or []
    if not isinstance(declared, list):
        msg = "tool_calls must be a list"
        raise MetadataResolutionError(msg)
    return [parse_one(raw_call) for raw_call in declared]
 def parse_tool_arguments(raw_arguments: object) -> dict[str, object]:
    """Parse tool call arguments returned by Ollama.

    Args:
        raw_arguments: A mapping, or a JSON string encoding one. An empty
            string counts as "no arguments".

    Returns:
        The arguments as a dictionary with string keys.

    Raises:
        MetadataResolutionError: If the value is not an object, or is a
            string that is not valid JSON.
    """
    if isinstance(raw_arguments, str):
        try:
            raw_arguments = json.loads(raw_arguments) if raw_arguments else {}
        except json.JSONDecodeError as error:
            # Report malformed JSON as a validation error so handlers that
            # catch MetadataResolutionError also see this failure.
            msg = f"tool arguments must be valid JSON: {error}"
            raise MetadataResolutionError(msg) from error
    if isinstance(raw_arguments, dict):
        return {str(key): value for key, value in raw_arguments.items()}
    msg = "tool arguments must be an object"
    raise MetadataResolutionError(msg)
 def validate_title_slug(title: str) -> None:
    """Check that ``title`` is entirely a slug accepted by ``TITLE_SLUG_PATTERN``.

    Raises:
        MetadataResolutionError: If the title does not match the pattern.
    """
    if TITLE_SLUG_PATTERN.fullmatch(title) is not None:
        return
    msg = f"title slug is invalid: {title}"
    raise MetadataResolutionError(msg)
 def validate_catalog_slug(value: str, label: str) -> None:
    """Check that ``value`` is entirely a slug accepted by ``CATALOG_SLUG_PATTERN``.

    Args:
        value: Candidate slug.
        label: Kind of entity, used as the first word of the error message.

    Raises:
        MetadataResolutionError: If the value does not match the pattern.
    """
    if CATALOG_SLUG_PATTERN.fullmatch(value) is not None:
        return
    msg = f"{label} slug is invalid: {value}"
    raise MetadataResolutionError(msg)
 def normalize_catalog_slug(value: str) -> str:
    """Turn a noisy catalog name into a lower snake-case slug.

    The text is trimmed and case-folded, every run of characters outside
    ``a-z0-9`` becomes a single underscore, and leading/trailing underscores
    are dropped. The result may be empty.
    """
    folded = value.strip().casefold()
    underscored = re.sub(r"[^a-z0-9]+", "_", folded)
    return underscored.strip("_")
 def compact_catalog_slug(value: str) -> str:
    """Build a comparison key: the catalog slug of ``value`` with every underscore removed."""
    pieces = normalize_catalog_slug(value).split("_")
    return "".join(pieces)
 def normalize_title_slug(value: str) -> str:
    """Turn a noisy book title into a lower kebab-case slug.

    The text is trimmed and case-folded, every run of characters outside
    ``a-z0-9`` becomes a single hyphen, and leading/trailing hyphens are
    dropped. The result may be empty.
    """
    folded = value.strip().casefold()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", folded)
    return hyphenated.strip("-")
 def is_fatal_tool_error(error: MetadataResolutionError) -> bool:
    """Tell whether a tool error must stop the agent instead of being reported to the model.

    An error is fatal when its message starts with one of the two prefixes
    used for an unknown tool or a tool that is not enabled.
    """
    fatal_prefixes = (
        "Unknown audiobook metadata tool",
        "Audiobook metadata tool is not enabled",
    )
    text = str(error)
    return any(text.startswith(prefix) for prefix in fatal_prefixes)
 def query_terms(query: str) -> tuple[str, ...]:
    """Build the distinct search variants for a noisy query.

    Variants, in order: the trimmed case-folded query, its snake-case slug,
    that slug without underscores, and its kebab-case slug. Empty variants
    and repeats are dropped, keeping the first occurrence.
    """
    base = query.strip().casefold()
    candidates = (
        base,
        normalize_catalog_slug(base),
        compact_catalog_slug(base),
        normalize_title_slug(base),
    )
    distinct: list[str] = []
    for candidate in candidates:
        if candidate and candidate not in distinct:
            distinct.append(candidate)
    return tuple(distinct)
 def required_string(data: dict[str, object], key: str) -> str:
    """Return the trimmed text stored under ``key``.

    Raises:
        MetadataResolutionError: If the value is missing, not a string, or
            blank after trimming.
    """
    candidate = data.get(key)
    trimmed = candidate.strip() if isinstance(candidate, str) else ""
    if trimmed:
        return trimmed
    msg = f"{key} must be a non-empty string"
    raise MetadataResolutionError(msg)
 def required_int(data: dict[str, object], key: str) -> int:
    """Return the integer stored under ``key``.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.

    Raises:
        MetadataResolutionError: If the value is missing or not an integer.
    """
    candidate = data.get(key)
    if isinstance(candidate, int) and not isinstance(candidate, bool):
        return candidate
    msg = f"{key} must be an integer"
    raise MetadataResolutionError(msg)
 def required_series_index(data: dict[str, object], key: str) -> float:
    """Read a required whole-number or half-number series index."""
    value = data.get(key)
    if isinstance(value, bool) or not isinstance(value, int | float):
        msg = f"{key} must be a number"
        raise MetadataResolutionError(msg)
    series_index = float(value)
    if not (series_index * 2).is_integer():
        msg = f"{key} must be a whole number or .5 increment"
        raise MetadataResolutionError(msg)
    return series_index
 def optional_int(value: object, key: str) -> int | None:
    """Validate a value that may be an integer or None.

    Args:
        value: The raw value (already looked up by the caller).
        key: Field name, used only in the error message.

    Raises:
        MetadataResolutionError: If the value is neither None nor an integer
            (booleans count as invalid).
    """
    is_plain_int = isinstance(value, int) and not isinstance(value, bool)
    if value is None or is_plain_int:
        return value
    msg = f"{key} must be an integer or null"
    raise MetadataResolutionError(msg)
@@ -1,575 +0,0 @@
 """Resolve audiobook metadata with a controlled Ollama tool loop."""
 from __future__ import annotations
 import json
 import re
 from dataclasses import asdict, dataclass, is_dataclass, replace
 from os import PathLike
 from typing import TYPE_CHECKING
 import httpx
 from sqlalchemy.orm import Session
 from python.common import utcnow
 from python.tools.audiobook.llm_tool_calling import (
    CatalogToolRegistry,
    MetadataResolutionError,
    normalize_title_slug,
    optional_int,
    parse_tool_calls,
    required_int,
    required_series_index,
    required_string,
    run_tool_calls,
    validate_catalog_slug,
    validate_title_slug,
 )
 if TYPE_CHECKING:
    from pathlib import Path
    from sqlalchemy.engine import Engine
    from python.orm.richie import AudiobookAuthor
 FENCED_JSON_PATTERN = re.compile(r"^```(?:json)?\s*(?P<json>.*?)\s*```$", re.IGNORECASE | re.DOTALL)
@dataclass(frozen=True)
 class AgentConfig:
    """Runtime settings for the audiobook metadata agent.

    Instances are immutable (``frozen=True``).
    """
    # Model identifier and chat endpoint; the code that sends requests is not visible in this chunk.
    model: str = "deepseek-v4-flash:cloud"
    ollama_chat_url: str = "https://ollama.com/api/chat"
    # NOTE(review): presumably the HTTP client timeout in seconds - confirm at the call site.
    http_timeout_seconds: int = 300
    # NOTE(review): presumably caps model round-trips per file - confirm in the agent loop.
    max_agent_turns: int = 8
    # Row limit applied by the catalog search tools.
    max_tool_results: int = 10
    # NOTE(review): presumably the confidence threshold below which review is needed - confirm.
    min_confidence: float = 0.85
    # NOTE(review): presumably retries allowed after an invalid final answer - confirm.
    invalid_final_retries: int = 1
    # Series label the catalog tools report for books that have no series.
    standalone_series: str = "standalone"
    # Tools exposed to the model; the registry rejects calls to tools not listed here.
    tool_names: tuple[str, ...] = (
        "search_authors",
        "search_series",
        "search_books",
        "ensure_author",
        "ensure_series",
        "ensure_book",
    )
@dataclass(frozen=True)
 class StandardBookMetadata:
    """Canonical metadata for the final audiobook path.

    Returned by ``standard_book_metadata``, which uses ``needs_review``,
    ``author_id``, ``book_id`` and ``series_id`` to decide what to persist.
    """
    author_id: int
    author: str
    # May be None; then no created book row is treated as "used" when pruning.
    book_id: int | None
    title: str
    # May be None; then no created series row is treated as "used" when pruning.
    series_id: int | None
    series: str
    series_index: float
    confidence: float
    # When true, ``standard_book_metadata`` rolls the session back instead of committing.
    needs_review: bool
    evidence: list[str]
@dataclass(frozen=True)
 class FinalMetadataFields:
    """Raw model fields after schema validation.
    Built by ``parse_final_metadata_fields``; the ids have been type-checked but
    not yet verified against the catalog.
    """
    # Required integer author id from the model's final JSON.
    author_id: int
    # Optional book id; None makes the validator create or look up the book by title.
    book_id: int | None
    # Title string from the model (normalized to a slug before catalog validation).
    title: str
    # Optional series id; None means the model considers the book standalone.
    series_id: int | None
    # Series position as parsed by required_series_index.
    series_index: float
    # Confidence value already range-checked by required_float.
    confidence: float
    # Non-empty, whitespace-stripped evidence strings.
    evidence: list[str]
@dataclass(frozen=True)
 class ResolvedBookFields:
    """Book fields after optional catalog book resolution.
    Values are copied from the catalog book row (existing or newly ensured), so
    they may differ from what the model proposed.
    """
    # Id of the catalog book row.
    book_id: int | None
    # Title stored on the catalog book row.
    title: str
    # Series id stored on the catalog book row, or None.
    series_id: int | None
    # Series index stored on the catalog book row.
    series_index: float
@dataclass(frozen=True)
 class AgentStepResult:
    """Outcome from one model response.
    Tells the agent loop whether to run another turn or stop with ``metadata``.
    """
    # Final (or review) metadata when the loop should stop; None while the loop continues.
    metadata: StandardBookMetadata | None
    # Running count of invalid final answers, carried into the next turn.
    invalid_final_count: int
    # True when the agent loop should run another model turn.
    should_continue: bool
 def standard_book_metadata(
    aax_file_name: str,
    aax_metadata_from_ffprobe: dict[str, str],
    engine: Engine,
    log_path: Path,
    ollama_api_key: str,
    config: AgentConfig,
 ) -> StandardBookMetadata:
    """Run the metadata agent for one AAX file inside a single database session.
    Catalog changes made by the agent's tools are committed only when the result
    does not need review; otherwise the session is rolled back.
    Args:
        aax_file_name: Name of the source AAX file.
        aax_metadata_from_ffprobe: ffprobe format tags for the file.
        engine: SQLAlchemy engine used to open the session.
        log_path: JSONL audit log destination.
        ollama_api_key: Bearer token for the chat endpoint.
        config: Agent runtime settings.
    Returns:
        The resolved metadata, or a placeholder flagged ``needs_review``.
    """
    with Session(engine) as session:
        registry = CatalogToolRegistry(session, log_path, config, write_agent_log)
        resolver = AudiobookMetadataAgent(
            registry=registry,
            log_path=log_path,
            ollama_api_key=ollama_api_key,
            config=config,
        )
        resolved = resolver.run(aax_file_name, aax_metadata_from_ffprobe)
        if not resolved.needs_review:
            # Drop rows the agent created along the way but did not end up using.
            registry.prune_unused_created_rows(
                author_id=resolved.author_id,
                book_id=resolved.book_id,
                series_id=resolved.series_id,
            )
            session.commit()
            return resolved
        session.rollback()
        return resolved
 class AudiobookMetadataAgent:
    """Ollama-backed metadata resolver with a fixed local tool registry."""
    def __init__(
        self,
        *,
        registry: CatalogToolRegistry,
        log_path: Path,
        ollama_api_key: str,
        config: AgentConfig,
    ) -> None:
        """Create an Ollama metadata agent."""
        self._registry = registry
        self._log_path = log_path
        self._ollama_api_key = ollama_api_key
        self._config = config
    def run(self, aax_file_name: str, aax_metadata_from_ffprobe: dict[str, str]) -> StandardBookMetadata:
        """Drive the agent loop for one AAX file.
        Runs up to ``max_agent_turns`` model turns. If no turn produces a stopping
        result, one extra tool-free request is made to force a final answer.
        """
        conversation = [
            {"role": "system", "content": system_prompt()},
            {"role": "user", "content": user_prompt(aax_file_name, aax_metadata_from_ffprobe)},
        ]
        outcome: StandardBookMetadata | None = None
        invalid_final_count = 0
        for turn in range(1, self._config.max_agent_turns + 1):
            step = self.run_step(conversation, turn, invalid_final_count)
            invalid_final_count = step.invalid_final_count
            if not step.should_continue:
                outcome = step.metadata
                break
        if outcome is not None:
            return outcome
        return self.force_final_response(conversation)
    def run_step(
        self,
        messages: list[dict[str, object]],
        turn: int,
        invalid_final_count: int,
    ) -> AgentStepResult:
        """Execute a single model turn and decide what the loop does next.
        A response with tool calls runs them and asks for another turn; a response
        without tool calls is treated as the final answer. Malformed responses and
        fatal tool errors stop the loop with review metadata.
        """

        def stop_for_review(reason: str) -> AgentStepResult:
            # Terminal result carrying a manual-review placeholder.
            return AgentStepResult(
                metadata=review_metadata(reason, self._config),
                invalid_final_count=invalid_final_count,
                should_continue=False,
            )

        response = self.chat(messages, turn)
        message = response.get("message")
        if not isinstance(message, dict):
            return stop_for_review("Ollama response did not include a message")
        try:
            tool_calls = parse_tool_calls(message)
        except (json.JSONDecodeError, MetadataResolutionError) as error:
            return stop_for_review(str(error))
        if not tool_calls:
            return self.handle_final_message(messages, message, invalid_final_count)
        fatal_error = run_tool_calls(messages, message, tool_calls, self._registry, self._log_path, write_agent_log)
        if fatal_error is None:
            return AgentStepResult(metadata=None, invalid_final_count=invalid_final_count, should_continue=True)
        return stop_for_review(fatal_error)
    def handle_final_message(
        self,
        messages: list[dict[str, object]],
        message: dict[str, object],
        invalid_final_count: int,
    ) -> AgentStepResult:
        """Check a tool-free model message as the final answer.
        Valid answers are logged and returned; answers that fail parsing or
        validation are passed to ``handle_invalid_final``; non-string content
        ends the loop with review metadata.
        """
        content = message.get("content")
        if isinstance(content, str):
            try:
                resolved = self.validate_final(parse_final_json_content(content))
            except (json.JSONDecodeError, MetadataResolutionError) as error:
                return self.handle_invalid_final(messages, error, invalid_final_count)
            write_agent_log(self._log_path, "final_metadata", metadata=resolved)
            return AgentStepResult(metadata=resolved, invalid_final_count=invalid_final_count, should_continue=False)
        placeholder = review_metadata("Ollama final response did not include string content", self._config)
        return AgentStepResult(
            metadata=placeholder,
            invalid_final_count=invalid_final_count,
            should_continue=False,
        )
    def handle_invalid_final(
        self,
        messages: list[dict[str, object]],
        error: json.JSONDecodeError | MetadataResolutionError,
        invalid_final_count: int,
    ) -> AgentStepResult:
        """Record an invalid final answer, then retry or give up.
        While the failure count is within ``invalid_final_retries`` a corrective
        user message is appended and the loop continues; after that the loop
        stops with review metadata carrying the error text.
        """
        failures = invalid_final_count + 1
        reason = str(error)
        write_agent_log(
            self._log_path,
            "final_validation_error",
            error=reason,
            invalid_final_count=failures,
        )
        if failures <= self._config.invalid_final_retries:
            retry_request = (
                "Your previous final answer was invalid. Return only valid JSON matching the required "
                f"schema. Validation error: {error}"
            )
            messages.append({"role": "user", "content": retry_request})
            return AgentStepResult(metadata=None, invalid_final_count=failures, should_continue=True)
        return AgentStepResult(
            metadata=review_metadata(reason, self._config),
            invalid_final_count=failures,
            should_continue=False,
        )
    def force_final_response(self, messages: list[dict[str, object]]) -> StandardBookMetadata:
        """Ask once more, with tools disabled, after the turn limit is reached.
        Any malformed or invalid reply yields review metadata; there is no retry.
        """
        messages.append({"role": "user", "content": forced_final_prompt()})
        write_agent_log(self._log_path, "forced_final_request", reason="max_turns")
        extra_turn = self._config.max_agent_turns + 1
        reply = self.chat(messages, extra_turn, tools_enabled=False)
        message = reply.get("message")
        if not isinstance(message, dict):
            return review_metadata("Ollama forced final response did not include a message", self._config)
        content = message.get("content")
        if not isinstance(content, str):
            return review_metadata("Ollama forced final response did not include string content", self._config)
        try:
            parsed = parse_final_json_content(content)
            resolved = self.validate_final(parsed)
        except (json.JSONDecodeError, MetadataResolutionError) as error:
            return review_metadata(f"Ollama forced final response was invalid: {error}", self._config)
        write_agent_log(self._log_path, "final_metadata", metadata=resolved)
        return resolved
    def chat(self, messages: list[dict[str, object]], turn: int, *, tools_enabled: bool = True) -> dict[str, object]:
        """POST one chat request and audit-log both directions.
        Args:
            messages: Full conversation so far; sent as-is.
            turn: Turn number recorded in the audit events.
            tools_enabled: When true, the registry's tool schemas are attached.
        Returns:
            The decoded response with string keys, or an empty dict when the
            response body is not a JSON object.
        Raises:
            httpx.HTTPStatusError: If the endpoint answers with an error status.
        """
        settings = self._config
        payload = {
            "model": settings.model,
            "messages": messages,
            "stream": False,
            "options": {"temperature": 0.1},
        }
        if tools_enabled:
            payload["tools"] = self._registry.tool_schemas()
        tool_names = settings.tool_names if tools_enabled else []
        write_agent_log(
            self._log_path,
            "model_request",
            model=settings.model,
            turn=turn,
            message_count=len(messages),
            tool_names=tool_names,
            tools_enabled=tools_enabled,
        )
        write_agent_log(
            self._log_path,
            "llm_messages_sent",
            model=settings.model,
            turn=turn,
            messages=messages,
            tools_enabled=tools_enabled,
        )
        response = httpx.post(
            settings.ollama_chat_url,
            headers={"Authorization": f"Bearer {self._ollama_api_key}"},
            json=payload,
            timeout=settings.http_timeout_seconds,
        )
        response.raise_for_status()
        decoded = response.json()
        if not isinstance(decoded, dict):
            return {}
        data = {str(key): value for key, value in decoded.items()}
        message = data.get("message", {})
        if isinstance(message, dict):
            content = message.get("content")
            has_tool_calls = bool(message.get("tool_calls"))
        else:
            content = ""
            has_tool_calls = False
        write_agent_log(
            self._log_path,
            "llm_message_received",
            model=settings.model,
            turn=turn,
            message=message,
        )
        write_agent_log(
            self._log_path,
            "model_response",
            model=settings.model,
            turn=turn,
            has_tool_calls=has_tool_calls,
            content_chars=len(content) if isinstance(content, str) else 0,
        )
        return data
    def validate_final(self, raw_metadata: object) -> StandardBookMetadata:
        """Turn the model's final JSON into catalog-checked metadata.
        Steps, in order: schema parse, title slug normalization, author check,
        title slug check, book resolution, series check.
        Raises:
            MetadataResolutionError: If any field fails validation.
        """
        parsed = parse_final_metadata_fields(raw_metadata)
        fields = replace(parsed, title=normalize_title_slug(parsed.title))
        author = self.validate_author(fields.author_id)
        validate_title_slug(fields.title)
        book = self.resolve_book_fields(fields)
        series_slug = self.validate_series(fields.author_id, book.series_id, book.series_index)
        confidence = fields.confidence
        return StandardBookMetadata(
            author_id=fields.author_id,
            author=author.name,
            book_id=book.book_id,
            title=book.title,
            series_id=book.series_id,
            series=series_slug,
            series_index=book.series_index,
            confidence=confidence,
            needs_review=confidence < self._config.min_confidence,
            evidence=fields.evidence,
        )
    def validate_author(self, author_id: int) -> AudiobookAuthor:
        """Return the catalog author for an id the registry has already seen.
        Raises:
            MetadataResolutionError: If the id was never seen, has no catalog
                row, or the row's name fails the catalog slug check.
        """
        registry = self._registry
        if author_id not in registry.seen_author_ids:
            msg = f"author_id {author_id} was not returned by search_authors"
            raise MetadataResolutionError(msg)
        row = registry.get_author(author_id)
        if row is not None:
            validate_catalog_slug(row.name, "author")
            return row
        msg = f"author_id {author_id} does not exist"
        raise MetadataResolutionError(msg)
    def resolve_book_fields(self, fields: FinalMetadataFields) -> ResolvedBookFields:
        """Resolve the book row behind the final answer.
        With no ``book_id`` the registry's ``ensure_book`` supplies the row; with
        a ``book_id`` the row must have been seen, exist, and belong to the
        selected author. Returned values come from the catalog row.
        Raises:
            MetadataResolutionError: If a supplied ``book_id`` fails those checks.
        """
        registry = self._registry
        book_id = fields.book_id
        if book_id is None:
            ensured = registry.ensure_book(
                fields.title,
                fields.author_id,
                fields.series_id,
                fields.series_index,
            )
            return ResolvedBookFields(
                book_id=ensured.book.id,
                title=ensured.book.title,
                series_id=ensured.book.series_id,
                series_index=ensured.book.series_index,
            )
        if book_id not in registry.seen_book_ids:
            msg = f"book_id {book_id} was not returned by search_books"
            raise MetadataResolutionError(msg)
        row = registry.get_book(book_id)
        if row is None:
            msg = f"book_id {book_id} does not exist"
            raise MetadataResolutionError(msg)
        if row.author_id != fields.author_id:
            msg = f"book_id {book_id} does not belong to author_id {fields.author_id}"
            raise MetadataResolutionError(msg)
        return ResolvedBookFields(
            book_id=book_id,
            title=row.title,
            series_id=row.series_id,
            series_index=row.series_index,
        )
    def validate_series(self, author_id: int, series_id: int | None, series_index: float) -> str:
        """Validate final series fields and return the canonical series slug."""
        if series_id is None:
            if series_index != 0:
                msg = "standalone books must use series_index 0"
                raise MetadataResolutionError(msg)
            return self._config.standalone_series
        if series_id not in self._registry.seen_series_ids:
            msg = f"series_id {series_id} was not returned by search_series"
            raise MetadataResolutionError(msg)
        series = self._registry.get_series(series_id)
        if series is None:
            msg = f"series_id {series_id} does not exist"
            raise MetadataResolutionError(msg)
        if series.author_id != author_id:
            msg = f"series_id {series_id} does not belong to author_id {author_id}"
            raise MetadataResolutionError(msg)
        if series_index <= 0:
            msg = "series books must use a positive series_index"
            raise MetadataResolutionError(msg)
        validate_catalog_slug(series.name, "series")
        return series.name
 def write_agent_log(log_path: Path, event: str, **fields: object) -> None:
    """Append a single audit record to the JSONL log.
    The log's parent directory is created on demand. Each record holds a
    ``created`` timestamp, the ``event`` name and the JSON-safe ``fields``.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    record = {"created": utcnow().isoformat(), "event": event}
    for name, raw in fields.items():
        record[name] = json_log_value(raw)
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(record, sort_keys=True))
        handle.write("\n")
 def json_log_value(value: object) -> object:
    """Return a JSON-serializable value for audit logs."""
    if is_dataclass(value) and not isinstance(value, type):
        return json_log_value(asdict(value))
    if isinstance(value, dict):
        return {str(key): json_log_value(item) for key, item in value.items()}
    if isinstance(value, list | tuple):
        return [json_log_value(item) for item in value]
    if isinstance(value, set):
        return [json_log_value(item) for item in sorted(value, key=str)]
    if isinstance(value, PathLike):
        return str(value)
    return value
 def system_prompt() -> str:
    """Build the fixed system message that states the agent's rules."""
    prompt = """You standardize Audible audiobook metadata against a private catalog.
 Rules:
 - You must use the provided tools before returning final metadata.
 - Only use author_id, series_id, or book_id values returned by tools.
 - Return final metadata as JSON only. Do not wrap it in Markdown.
 - The final JSON object must contain author_id, book_id, title, series_id, series_index, confidence, and evidence.
 - title must be a canonical title slug using lower-case words separated by hyphens.
 - Use series_id null and series_index 0 for standalone books.
 - If you use a series_id, series_index must be a whole number or .5 value greater than 0.
 - Treat series slugs that differ only by underscores as the same series. Prefer the existing catalog row instead of
  creating a new series.
 - Detect omnibus or box-set editions that contain multiple numbered novels, books, or novellas.
 - For an omnibus, make a best-effort range from the filename, tags, and catalog rows. Keep series_index as the
  first covered book number and include the range in the title when the source title includes it, for example
  books-1-3.
 - Be careful with omnibuses of novels or novellas later published as one book: keep the omnibus as the audiobook's
  book record unless catalog rows clearly identify a better match.
 - Do not create publisher collections or author collections as series unless the book metadata clearly gives a
  numbered series.
 - Series belong to authors. Use a series_id only when it belongs to the selected author_id.
 - Always search for the author before creating one. If no exact author slug exists, call ensure_author.
 - Always search for a series with author_id before creating one. If no exact series slug exists, call ensure_series.
 - Always search for a book before creating one. If no exact title slug exists, call ensure_book.
 - If a tool returns an error, correct your tool arguments or final metadata before continuing.
 - confidence must be a number from 0 to 1.
 - evidence must be a short list of strings explaining which filename, tags, and catalog rows support the answer."""
    return prompt
 def forced_final_prompt() -> str:
    """Build the user message that demands a final answer without further tool calls."""
    parts = (
        "Stop calling tools. Return final metadata as JSON only using the tool results already provided. ",
        "If search_books returned no matching rows but author and series are known, use book_id null and resolve ",
        "the title slug from the AAX filename and ffprobe tags. The validator will create the missing book. ",
        "Use only author_id and series_id values returned by earlier tool results.",
    )
    return "".join(parts)
 def user_prompt(aax_file_name: str, metadata: dict[str, str]) -> str:
    """Compose the first user message from the file name and its ffprobe tags.
    The tags are rendered as indented JSON with sorted keys.
    """
    tags_json = json.dumps(metadata, indent=2, sort_keys=True)
    sections = (
        "Resolve this Audible audiobook.\n\n",
        f"AAX file name: {aax_file_name}\n\n",
        "ffprobe format tags:\n",
        tags_json,
    )
    return "".join(sections)
 def parse_final_json_content(content: str) -> object:
    """Decode the model's final answer, tolerating a Markdown code fence.
    Raises:
        json.JSONDecodeError: If the (unfenced) text is not valid JSON.
    """
    text = content.strip()
    fenced = FENCED_JSON_PATTERN.fullmatch(text)
    if fenced is not None:
        # Keep only what sits between the opening and closing fence.
        text = fenced.group("json").strip()
    return json.loads(text)
 def parse_final_metadata_fields(raw_metadata: object) -> FinalMetadataFields:
    """Type-check the model's final JSON object field by field.
    Fields are read in declaration order, so the first invalid one decides
    which error is raised.
    Raises:
        MetadataResolutionError: If the value is not an object or a field is invalid.
    """
    if not isinstance(raw_metadata, dict):
        msg = "Final metadata must be a JSON object"
        raise MetadataResolutionError(msg)
    data = {str(name): item for name, item in raw_metadata.items()}
    author_id = required_int(data, "author_id")
    book_id = optional_int(data.get("book_id"), "book_id")
    title = required_string(data, "title")
    series_id = optional_int(data.get("series_id"), "series_id")
    series_index = required_series_index(data, "series_index")
    confidence = required_float(data, "confidence")
    evidence = required_string_list(data, "evidence")
    return FinalMetadataFields(
        author_id=author_id,
        book_id=book_id,
        title=title,
        series_id=series_id,
        series_index=series_index,
        confidence=confidence,
        evidence=evidence,
    )
 def review_metadata(reason: str, config: AgentConfig) -> StandardBookMetadata:
    """Build the placeholder result used when resolution fails.
    The placeholder is always flagged ``needs_review`` and carries ``reason``
    as its only evidence entry.
    """
    placeholder = StandardBookMetadata(
        author_id=0,
        author="unknown_author",
        book_id=None,
        title="unknown-title",
        series_id=None,
        series=config.standalone_series,
        series_index=0,
        confidence=0,
        needs_review=True,
        evidence=[reason],
    )
    return placeholder
 def required_float(data: dict[str, object], key: str) -> float:
    """Read a required float field."""
    value = data.get(key)
    if isinstance(value, bool) or not isinstance(value, int | float):
        msg = f"{key} must be a number"
        raise MetadataResolutionError(msg)
    confidence = float(value)
    if confidence < 0 or confidence > 1:
        msg = f"{key} must be between 0 and 1"
        raise MetadataResolutionError(msg)
    return confidence
 def required_string_list(data: dict[str, object], key: str) -> list[str]:
    """Read a required, non-empty list of strings and strip each entry.
    Entries that are empty after stripping are dropped; at least one entry
    must remain.
    Raises:
        MetadataResolutionError: If the field is not a non-empty list of
            strings, or every entry is blank.
    """
    entries = data.get(key)
    is_string_list = (
        isinstance(entries, list) and bool(entries) and all(isinstance(entry, str) for entry in entries)
    )
    if not is_string_list:
        msg = f"{key} must be a non-empty list of strings"
        raise MetadataResolutionError(msg)
    cleaned = [text for text in (entry.strip() for entry in entries) if text]
    if cleaned:
        return cleaned
    msg = f"{key} must include at least one non-empty string"
    raise MetadataResolutionError(msg)
@@ -1,17 +0,0 @@
 # CUDA 12.4.1 + cuDNN runtime base image on Ubuntu 22.04.
 FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
 # Keep apt non-interactive; stop Python from writing .pyc files and from buffering its output.
 ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
 # Install Python, pip and ffmpeg, then remove the apt package lists so they are not kept in the layer.
 RUN apt-get update \
    && apt-get install -y --no-install-recommends python3 python3-pip ffmpeg \
    && rm -rf /var/lib/apt/lists/*
 # Upgrade pip, then install the Python dependencies without keeping pip's download cache.
 # NOTE(review): inference.py only imports faster_whisper directly; confirm requests is still needed.
 RUN pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir faster-whisper requests
 WORKDIR /app
 # Only the entrypoint script is copied; the source path is relative to the build context.
 COPY python/tools/whisper/inference.py /app/inference.py
 # Exec-form entrypoint: arguments given to `docker run` are passed to inference.py.
 ENTRYPOINT ["python3", "/app/inference.py"]
@@ -1,2 +0,0 @@
 *
 !python/tools/whisper/inference.py
@@ -1 +0,0 @@
 """Whisper transcription tools (host orchestrator and container entrypoint)."""
@@ -1,136 +0,0 @@
 """Container entrypoint that transcribes a directory of audio files with faster-whisper.
 Run inside the whisper-transcribe docker image; segment timestamps are grouped
 into one-minute buckets so the output reads as ``[HH:MM:00] text``.
 """
 from __future__ import annotations
 import argparse
 import logging
 from pathlib import Path
 from faster_whisper import WhisperModel
 logger = logging.getLogger(__name__)
 AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus", ".mp4", ".mkv", ".webm", ".aac"}
 BUCKET_SECONDS = 60
 BEAM_SIZE = 5
 SECONDS_PER_HOUR = 3600
 SECONDS_PER_MINUTE = 60
 def format_timestamp(total_seconds: float) -> str:
    """Format an offset as ``HH:MM:00``, discarding the seconds component.
    Args:
        total_seconds: Offset in seconds from the start of the audio.
    Returns:
        A zero-padded ``HH:MM:00`` string.
    """
    whole_hours, remainder = divmod(total_seconds, SECONDS_PER_HOUR)
    whole_minutes = remainder // SECONDS_PER_MINUTE
    return f"{int(whole_hours):02d}:{int(whole_minutes):02d}:00"
 def transcribe_file(model: WhisperModel, audio_path: Path, output_path: Path) -> None:
    """Transcribe a single audio file into a minute-bucketed text transcript.
    Segments are grouped by the minute in which they start; each bucket becomes
    one ``[HH:MM:00] text`` paragraph, and paragraphs are separated by a blank line.
    Args:
        model: Loaded faster-whisper model.
        audio_path: Source audio file.
        output_path: Destination ``.txt`` path.
    """
    logger.info("Transcribing %s", audio_path)
    segments, info = model.transcribe(
        str(audio_path),
        language="en",
        beam_size=BEAM_SIZE,
        vad_filter=True,
    )
    logger.info("Duration %.1fs", info.duration)
    grouped: dict[int, list[str]] = {}
    for segment in segments:
        minute_index = int(segment.start // BUCKET_SECONDS)
        if minute_index not in grouped:
            grouped[minute_index] = []
        grouped[minute_index].append(segment.text.strip())
    paragraphs = []
    for minute_index in sorted(grouped):
        stamp = format_timestamp(minute_index * BUCKET_SECONDS)
        paragraphs.append(f"[{stamp}] {' '.join(grouped[minute_index])}")
    output_path.write_text("\n\n".join(paragraphs) + "\n", encoding="utf-8")
    logger.info("Wrote %s", output_path)
 def find_audio_files(input_directory: Path) -> list[Path]:
    """Recursively list the audio files below ``input_directory``.
    A file counts as audio when its lower-cased suffix is in ``AUDIO_EXTENSIONS``.
    Args:
        input_directory: Directory to walk recursively.
    Returns:
        Sorted list of audio file paths.
    """
    matches = [
        candidate
        for candidate in input_directory.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in AUDIO_EXTENSIONS
    ]
    matches.sort()
    return matches
 def configure_container_logger() -> None:
    """Configure root logging for the container at INFO level.
    No ``stream`` or ``filename`` is passed, so ``logging.basicConfig`` attaches
    its default stream handler, which writes to stderr (not stdout).
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
 def parse_arguments() -> argparse.Namespace:
    """Read the container entrypoint's command-line options.
    Returns:
        Namespace with ``input``, ``output``, ``model`` and ``download_only``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    path_options = (("--input", Path("/audio")), ("--output", Path("/output")))
    for flag, default_path in path_options:
        parser.add_argument(flag, type=Path, default=default_path)
    parser.add_argument("--model", default="large-v3")
    parser.add_argument(
        "--download-only",
        action="store_true",
        help="Download the model into the cache volume and exit without transcribing.",
    )
    arguments = parser.parse_args()
    return arguments
 def main() -> None:
    """Load the model, then stop (download-only) or transcribe every audio file found.
    Transcripts mirror the input directory layout under the output directory;
    files whose transcript already exists are skipped.
    """
    configure_container_logger()
    arguments = parse_arguments()
    logger.info("Loading model %s on CUDA", arguments.model)
    model = WhisperModel(arguments.model, device="cuda", compute_type="float16")
    if arguments.download_only:
        logger.info("Model ready; exiting (download-only mode)")
        return
    input_root = arguments.input
    output_root = arguments.output
    output_root.mkdir(parents=True, exist_ok=True)
    audio_files = find_audio_files(input_root)
    if not audio_files:
        logger.warning("No audio files found in %s", input_root)
        return
    logger.info("Found %d audio file(s)", len(audio_files))
    for audio_path in audio_files:
        relative = audio_path.relative_to(input_root)
        transcript_path = output_root / relative.with_suffix(".txt")
        transcript_path.parent.mkdir(parents=True, exist_ok=True)
        if not transcript_path.exists():
            transcribe_file(model, audio_path, transcript_path)
        else:
            logger.info("Skip %s (already transcribed)", relative)
 if __name__ == "__main__":
    main()
@@ -1,167 +0,0 @@
 """Build and run the whisper transcription docker container on demand.
 The container is started fresh for each invocation and removed on exit
 (``docker run --rm``). The model is cached in a named docker volume so
 only the first run pays the download cost.
 """
 from __future__ import annotations
 import logging
 import subprocess
 from pathlib import Path
 from typing import Annotated
 import typer
 from python.common import configure_logger
 logger = logging.getLogger(__name__)
 class Config:
    """Paths and names for the whisper-transcribe Docker workflow.
    All values are class attributes and are read directly as ``Config.<name>``.
    """
    # Tag given to the built image and used to run it.
    image_tag = "whisper-transcribe:latest"
    # Named docker volume that holds the downloaded models.
    model_volume = "whisper-models"
    # Docker build context: parents[3] is three levels above this file's directory.
    # NOTE(review): presumably the repository root, matching the Dockerfile's COPY path - confirm.
    repo_root = Path(__file__).resolve().parents[3]
    # Dockerfile located next to this module.
    dockerfile = Path(__file__).resolve().parent / "Dockerfile"
    # Mount point of the model volume inside the container.
    huggingface_cache = "/root/.cache/huggingface"
 def run_docker(arguments: list[str]) -> None:
    """Invoke ``docker`` with the given arguments and fail loudly on error.
    The command inherits this process's stdout/stderr, so output is streamed.
    Args:
        arguments: Arguments to pass to the ``docker`` binary.
    Raises:
        subprocess.CalledProcessError: If docker exits non-zero.
    """
    rendered = " ".join(arguments)
    logger.info("docker %s", rendered)
    command = ["docker", *arguments]
    subprocess.run(command, check=True)
 def build_image() -> None:
    """Run ``docker build`` for the whisper-transcribe image with ``Config.repo_root`` as context."""
    logger.info("Building image %s", Config.image_tag)
    build_arguments = ["build"]
    build_arguments += ["--tag", Config.image_tag]
    build_arguments += ["--file", str(Config.dockerfile)]
    build_arguments.append(str(Config.repo_root))
    run_docker(build_arguments)
 def model_cache_present(model: str) -> bool:
    """Report whether the model's cache directory exists in the model volume.
    Runs ``test -d`` in a throwaway ``alpine`` container with the volume mounted
    at ``/cache``.
    Args:
        model: faster-whisper model name (e.g. ``large-v3``).
    Returns:
        True if the probe container exits with status 0.
    """
    relative_cache = f"hub/models--Systran--faster-whisper-{model}"
    volume_mount = f"{Config.model_volume}:/cache"
    probe_command = [
        "docker",
        "run",
        "--rm",
        "--volume",
        volume_mount,
        "alpine",
        "test",
        "-d",
        f"/cache/{relative_cache}",
    ]
    # check=False: a non-zero exit is the expected "not cached" answer, not an error.
    probe = subprocess.run(probe_command, check=False)
    return probe.returncode == 0
 def download_model(model: str) -> None:
    """Run the image in ``--download-only`` mode so the model lands in the cache volume.
    Args:
        model: faster-whisper model name.
    """
    logger.info("Downloading model %s into volume %s", model, Config.model_volume)
    cache_mount = f"{Config.model_volume}:{Config.huggingface_cache}"
    container_flags = ["run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host", "--volume", cache_mount]
    entrypoint_flags = ["--model", model, "--download-only"]
    run_docker([*container_flags, Config.image_tag, *entrypoint_flags])
 def transcribe(input_directory: Path, output_directory: Path, model: str) -> None:
    """Start the transcription container for every audio file under ``input_directory``.
    Args:
        input_directory: Host path containing audio files (mounted read-only).
        output_directory: Host path for ``.txt`` transcripts.
        model: faster-whisper model name.
    """
    logger.info("Transcribing %s -> %s (model=%s)", input_directory, output_directory, model)
    mounts = (
        f"{input_directory}:/audio:ro",
        f"{output_directory}:/output",
        f"{Config.model_volume}:{Config.huggingface_cache}",
    )
    docker_arguments = ["run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host"]
    for mount in mounts:
        docker_arguments += ["--volume", mount]
    docker_arguments += [Config.image_tag, "--model", model]
    run_docker(docker_arguments)
 def main(
    input_directory: Annotated[Path, typer.Argument(help="Directory of audio files to transcribe.")],
    output_directory: Annotated[Path, typer.Argument(help="Directory to write .txt transcripts to.")],
    model: Annotated[str, typer.Option(help="faster-whisper model name.")] = "large-v3",
    *,
    force_download: Annotated[
        bool,
        typer.Option("--force-download", help="Re-download the model even if already cached."),
    ] = False,
 ) -> None:
    """Build the image, make sure the model is cached, then run one transcription pass."""
    configure_logger()
    # strict=True: fail early when the input directory does not exist.
    resolved_input = input_directory.resolve(strict=True)
    output_directory.mkdir(parents=True, exist_ok=True)
    resolved_output = output_directory.resolve()
    build_image()
    # The cache probe is skipped entirely when a re-download is forced.
    needs_download = force_download or not model_cache_present(model)
    if needs_download:
        download_model(model)
    else:
        logger.info("Model %s already cached in volume %s", model, Config.model_volume)
    transcribe(resolved_input, resolved_output, model)
    logger.info("Done. Container stopped.")
 if __name__ == "__main__":
    typer.run(main)
@@ -1,13 +1,11 @@
 { inputs, pkgs, ... }:
 {
  imports = [
    "${inputs.self}/users/math"
    "${inputs.self}/users/richie"
-    "${inputs.self}/users/steve"
+    "${inputs.self}/users/math"
    "${inputs.self}/common/global"
    "${inputs.self}/common/optional/docker.nix"
    "${inputs.self}/common/optional/scanner.nix"
    "${inputs.self}/common/optional/monitoring-agent.nix"
    "${inputs.self}/common/optional/steam.nix"
    "${inputs.self}/common/optional/syncthing_base.nix"
    "${inputs.self}/common/optional/systemd-boot.nix"
@@ -28,12 +26,7 @@
  networking = {
    hostName = "bob";
    hostId = "7c678a41";
-    firewall = {
+    firewall.enable = true;
      enable = true;
      allowedTCPPorts = [
        8000
      ];
    };
    networkmanager.enable = true;
  };
@@ -28,13 +28,9 @@
        allowDiscards = true;
        keyFileSize = 4096;
        keyFile = "/dev/disk/by-id/usb-Samsung_Flash_Drive_FIT_0374620080067131-0:0";
        fallbackToPassword = true;
      };
    };
    zfs.extraPools = [
      "storage"
    ];
    kernelModules = [ "kvm-amd" ];
    extraModulePackages = [ ];
  };
@@ -42,14 +42,11 @@
      "qwen3:8b"
      "qwen3.5:27b"
      "qwen3.5:35b"
      "qwen3.6:27b"
      "qwen3.6:35b"
      "rinex20/translategemma3:12b"
      "translategemma:12b"
      "translategemma:27b"
      "translategemma:4b"
    ];
-    models = "/zfs/storage/models";
+    models = "/zfs/models";
    openFirewall = true;
  };
 }
@@ -0,0 +1,11 @@
 #!/bin/bash
 # zpools
 # storage
 sudo zpool create -f -o ashift=12 -O acltype=posixacl -O atime=off -O dnodesize=auto -O xattr=sa -O compression=zstd -m /zfs/storage storage mirror
 sudo zpool create -o ashift=12 -O acltype=posixacl -O atime=off -O dnodesize=auto -O xattr=sa -O compression=zstd -m /zfs/storage storage
 # storage datasets
 sudo zfs create storage/models -o recordsize=1M
@@ -24,6 +24,6 @@ monthly = 0
 ["root_pool/models"]
 15_min = 4
-hourly = 24
+hourly = 2
 daily = 0
 monthly = 0
@@ -31,15 +31,5 @@
      ];
      fsWatcherEnabled = true;
    };
    "recordings" = {
      path = "/home/richie/recordings";
      devices = [
        "jeeves"
        "phone"
        "rhapsody-in-green"
      ];
      fsWatcherEnabled = true;
    };
  };
 }
@@ -26,6 +26,7 @@
        allowDiscards = true;
        keyFileSize = 4096;
        keyFile = "/dev/disk/by-id/usb-USB_SanDisk_3.2Gen1_03021630090925173333-0:0";
        fallbackToPassword = true;
      };
    };
    kernelModules = [ "kvm-intel" ];
@@ -4,21 +4,17 @@ let
 in
 {
  imports = [
    "${inputs.self}/users/dov"
    "${inputs.self}/users/math"
    "${inputs.self}/users/richie"
-    "${inputs.self}/users/steve"
+    "${inputs.self}/users/math"
    "${inputs.self}/users/dov"
    "${inputs.self}/common/global"
    "${inputs.self}/common/optional/docker.nix"
    "${inputs.self}/common/optional/monitoring-agent.nix"
    "${inputs.self}/common/optional/ssh_decrypt.nix"
    "${inputs.self}/common/optional/syncthing_base.nix"
    "${inputs.self}/common/optional/update.nix"
    "${inputs.self}/common/optional/zerotier.nix"
    ./monitoring
    ./docker
    ./services
    ./web_services
    ./hardware.nix
    ./networking.nix
    ./programs.nix
@@ -39,10 +35,5 @@ in
    zerotierone.joinNetworks = [ "a09acf02330d37b9" ];
  };
  users.groups = {
    nornsight = { };
    nornsight-admin = { };
  };
  system.stateVersion = "24.05";
 }
@@ -9,6 +9,7 @@ let
    inherit device;
    keyFileSize = 4096;
    keyFile = "/dev/disk/by-id/usb-XIAO_USB_Drive_24587CE29074-0:0";
    fallbackToPassword = true;
  };
  makeLuksSSD =
    device:
@@ -1,426 +0,0 @@
 {
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "links": [],
  "panels": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prom-main"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 6,
        "x": 0,
        "y": 0
      },
      "id": 1,
      "options": {
        "legend": {
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
          "legendFormat": "{{instance}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "CPU Used",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prom-main"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 6,
        "x": 6,
        "y": 0
      },
      "id": 2,
      "options": {
        "legend": {
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
          "legendFormat": "{{instance}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "RAM Used",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prom-main"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 6,
        "x": 12,
        "y": 0
      },
      "id": 3,
      "options": {
        "legend": {
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "100 * (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))",
          "legendFormat": "{{instance}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Swap Used",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prom-main"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 6,
        "x": 18,
        "y": 0
      },
      "id": 4,
      "options": {
        "legend": {
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "node_load1",
          "legendFormat": "{{instance}} load1",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "node_load5",
          "legendFormat": "{{instance}} load5",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "node_load15",
          "legendFormat": "{{instance}} load15",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Load",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prom-main"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "Bps"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 9,
        "w": 12,
        "x": 0,
        "y": 8
      },
      "id": 5,
      "options": {
        "legend": {
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "sum by (instance) (rate(node_disk_read_bytes_total[5m]))",
          "legendFormat": "{{instance}} read",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "sum by (instance) (rate(node_disk_written_bytes_total[5m]))",
          "legendFormat": "{{instance}} write",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "Disk Throughput",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prom-main"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 9,
        "w": 12,
        "x": 12,
        "y": 8
      },
      "id": 6,
      "options": {
        "cellHeight": "sm",
        "showHeader": true,
        "sortBy": [
          {
            "desc": true,
            "displayName": "Value"
          }
        ]
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"} / node_filesystem_size_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"}))",
          "format": "table",
          "instant": true,
          "legendFormat": "{{instance}} {{mountpoint}}",
          "refId": "A"
        }
      ],
      "title": "Filesystem Usage",
      "type": "table"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prom-main"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "percentunit"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 10,
        "w": 12,
        "x": 0,
        "y": 17
      },
      "id": 7,
      "options": {
        "cellHeight": "sm",
        "showHeader": true,
        "sortBy": [
          {
            "desc": true,
            "displayName": "Value"
          }
        ]
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))",
          "format": "table",
          "instant": true,
          "legendFormat": "{{instance}} {{groupname}}",
          "refId": "A"
        }
      ],
      "title": "Top Grouped CPU",
      "type": "table"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prom-main"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "bytes"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 10,
        "w": 12,
        "x": 12,
        "y": 17
      },
      "id": 8,
      "options": {
        "cellHeight": "sm",
        "showHeader": true,
        "sortBy": [
          {
            "desc": true,
            "displayName": "Value"
          }
        ]
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prom-main"
          },
          "editorMode": "code",
          "expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
          "format": "table",
          "instant": true,
          "legendFormat": "{{instance}} {{groupname}}",
          "refId": "A"
        }
      ],
      "title": "Top Grouped Memory",
      "type": "table"
    }
  ],
  "refresh": "30s",
  "schemaVersion": 39,
  "style": "dark",
  "tags": [
    "monitoring"
  ],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "",
  "title": "Overview",
  "uid": "monitor-overview",
  "version": 1,
  "weekStart": ""
 }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Richie	a80de99175	adding math to bob	2026-04-12 10:08:23 -04:00
Richie	50d56a8a39	added config.toml to git ignore	2026-04-12 10:08:23 -04:00
Richie	30dc36588c	updated BenchmarkConfig to have from_toml	2026-04-12 10:08:23 -04:00
Richie	68190901cb	setup FinetuneConfig	2026-04-12 10:08:23 -04:00
Richie	275762843f	deleted train.sh	2026-04-12 10:08:23 -04:00
Richie	face93262f	added containers dir	2026-04-12 10:08:23 -04:00
Richie	ee34a0986b	converted to summarization_prompts	2026-04-12 10:08:23 -04:00
Richie	e8b20bc7df	moved and renamed container.py to vllm_container.py	2026-04-12 10:08:23 -04:00
Richie	6c459985fa	created working finetuning pipeline	2026-04-12 10:08:23 -04:00
Richie	20a204612f	added data dir for training	2026-04-12 10:08:23 -04:00
Richie	27b609052c	updated spell check	2026-04-12 10:08:23 -04:00
Richie	20fb24e244	added storage pool	2026-04-12 10:08:23 -04:00
Richie	230ab1d7f6	added tiktoken	2026-04-12 10:08:23 -04:00
Richie	9ffaa1b755	added summarization_prompts.py to store the prompts	2026-04-12 10:08:23 -04:00
Richie	c6b4ed4814	added tools dir for one-off scripts I used	2026-04-12 10:08:23 -04:00
Richie	88ceeb55a1	added batch_bill_summarizer.py; the batch bill summarizer sends a batch API call to GPT	2026-04-12 10:08:23 -04:00
Richie	6c57d74644	decreased root_pool/models snapshot life	2026-04-12 10:08:23 -04:00
Richie	cb98090f95	added bill_token_compression.py; tested on a sample of 100 bills matching the distribution of our data. Compression saves ~11.5% on prompt tokens; completion/reasoning are roughly equal across the two sets. Tokens (prompt / completion / reasoning / total): compressed 349,460 / 157,110 / 112,128 / 506,570; uncompressed 394,948 / 154,710 / 110,080 / 549,658; delta −45,488 / +2,400 / +2,048 / −43,088	2026-04-12 10:08:23 -04:00
Richie	63cb48a3dd	created main prompt bench	2026-04-12 10:08:23 -04:00
Richie	6f6d247d3e	fixed sunshine.nix	2026-04-12 10:08:23 -04:00
Richie	6b63315579	converting bob to a server	2026-04-12 10:08:23 -04:00
Richie	a093c72eb9	creating prompt_bench downloader	2026-04-12 10:08:23 -04:00
		`@@ -0,0 +1,3 @@`
							`"""Data science CLI tools."""`

							`from __future__ import annotations`
		`@@ -0,0 +1 @@`
							`"""Prompt benchmarking system for evaluating LLMs via vLLM."""`
		`@@ -1 +0,0 @@`
			`"""Whisper transcription tools (host orchestrator and container entrypoint)."""`