Compare commits

..

9 Commits

Author SHA1 Message Date
Richie 0ed7be036e setting up bluesky firehose 2026-03-26 08:09:32 -04:00
Richie e4d5f342be making more generic exception handling 2026-03-26 08:07:39 -04:00
Richie f8976f690f ran ingest_posts 2026-03-25 17:21:44 -04:00
Richie d1d6a540e5 adding tables for 2023 2026-03-25 09:22:10 -04:00
Richie 794dff4e13 added ingest_posts.py 2026-03-24 23:47:04 -04:00
Richie 21c4a38d81 adding 2026 partitions 2026-03-24 23:18:28 -04:00
Richie 31c6e2cb69 adding post table 2026-03-24 23:03:58 -04:00
Richie 65dacb6089 added media/temp for fast dir when working with data 2026-03-24 21:37:51 -04:00
Richie b0ed4900e4 adding data_science_dev 2026-03-24 21:36:50 -04:00
120 changed files with 1362 additions and 8010 deletions
+1 -1
View File
@@ -23,6 +23,6 @@ jobs:
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Build default package - name: Build default package
run: "nixos-rebuild build --accept-flake-config --flake ./#${{ matrix.system }}" run: "nixos-rebuild build --flake ./#${{ matrix.system }}"
- name: copy to nix-cache - name: copy to nix-cache
run: nix copy --accept-flake-config --to unix:///host-nix/var/nix/daemon-socket/socket .#nixosConfigurations.${{ matrix.system }}.config.system.build.toplevel run: nix copy --accept-flake-config --to unix:///host-nix/var/nix/daemon-socket/socket .#nixosConfigurations.${{ matrix.system }}.config.system.build.toplevel
+30
View File
@@ -0,0 +1,30 @@
# Follow-up workflow: once "build_systems" finishes, run the eval-warning
# fixer (python.eval_warnings.main) against that run.
name: fix_eval_warnings

on:
  workflow_run:
    workflows: ["build_systems"]
    types: [completed]

jobs:
  check-warnings:
    # Only react to builds of main that were triggered by a push or the
    # schedule and that were not cancelled.
    if: >-
      github.event.workflow_run.conclusion != 'cancelled' &&
      github.event.workflow_run.head_branch == 'main' &&
      (github.event.workflow_run.event == 'push' || github.event.workflow_run.event == 'schedule')
    runs-on: self-hosted
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
      - name: Fix eval warnings
        # Expression values and secrets are handed to the shell through
        # environment variables instead of being pasted into the script text,
        # so their contents can never be parsed as shell syntax and the secret
        # is not written into the generated script.
        env:
          GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
          FIX_RUN_ID: ${{ github.event.workflow_run.id }}
          FIX_REPO: ${{ github.repository }}
          FIX_OLLAMA_URL: ${{ secrets.OLLAMA_URL }}
          FIX_RUN_URL: ${{ github.event.workflow_run.html_url }}
        # ">-" folds the lines below into one command line.
        run: >-
          nix develop .#devShells.x86_64-linux.default -c
          python -m python.eval_warnings.main
          --run-id "$FIX_RUN_ID"
          --repo "$FIX_REPO"
          --ollama-url "$FIX_OLLAMA_URL"
          --run-url "$FIX_RUN_URL"
+13 -7
View File
@@ -6,18 +6,24 @@ on:
jobs: jobs:
merge: merge:
runs-on: self-hosted runs-on: ubuntu-latest
permissions: permissions:
contents: write contents: write
pull-requests: write pull-requests: write
steps: steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: merge_flake_lock_update - name: merge_flake_lock_update
run: >- run: |
nix develop .#devShells.x86_64-linux.default -c pr_number=$(gh pr list --state open --author RichieCahill --label flake_lock_update --json number --jq '.[0].number')
python -m python.gitea_flake_lock merge echo "pr_number=$pr_number" >> $GITHUB_ENV
--repo "${{ github.repository }}" if [ -n "$pr_number" ]; then
gh pr merge "$pr_number" --rebase
else
echo "No open PR found with label flake_lock_update"
fi
env: env:
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} GITHUB_TOKEN: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
GITEA_URL: https://gitea.tmmworkshop.com
+1 -1
View File
@@ -1,13 +1,13 @@
name: pytest name: pytest
on: on:
workflow_dispatch:
push: push:
branches: branches:
- main - main
pull_request: pull_request:
branches: branches:
- main - main
merge_group:
jobs: jobs:
pytest: pytest:
+11 -13
View File
@@ -6,20 +6,18 @@ on:
jobs: jobs:
lockfile: lockfile:
runs-on: self-hosted runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock - name: Update flake.lock
run: nix flake update uses: DeterminateSystems/update-flake-lock@main
- name: Create or update flake.lock PR with:
env: token: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} pr-title: "Update flake.lock"
GITEA_URL: https://gitea.tmmworkshop.com pr-labels: |
run: >- dependencies
nix develop .#devShells.x86_64-linux.default -c automated
python -m python.gitea_flake_lock update flake_lock_update
--repo "${{ github.repository }}"
-3
View File
@@ -169,6 +169,3 @@ test.*
# Frontend build output # Frontend build output
frontend/dist/ frontend/dist/
frontend/node_modules/ frontend/node_modules/
# data from testing llms
data/*
+1 -2
View File
@@ -40,6 +40,7 @@
"cgroupdriver", "cgroupdriver",
"charliermarsh", "charliermarsh",
"Checkpointing", "Checkpointing",
"cloudflared",
"codellama", "codellama",
"codezombiech", "codezombiech",
"compactmode", "compactmode",
@@ -203,7 +204,6 @@
"peerconnection", "peerconnection",
"PESKYFOX", "PESKYFOX",
"PGID", "PGID",
"pgvector",
"pipewire", "pipewire",
"pkgs", "pkgs",
"plugdev", "plugdev",
@@ -308,7 +308,6 @@
"usernamehw", "usernamehw",
"userprefs", "userprefs",
"vaninventory", "vaninventory",
"vdev",
"vfat", "vfat",
"victron", "victron",
"virt", "virt",
+2 -12
View File
@@ -23,10 +23,7 @@
boot = { boot = {
tmp.useTmpfs = true; tmp.useTmpfs = true;
kernelPackages = lib.mkDefault pkgs.linuxPackages_6_12; kernelPackages = lib.mkDefault pkgs.linuxPackages_6_12;
zfs = { zfs.package = lib.mkDefault pkgs.zfs_2_4;
package = lib.mkDefault pkgs.zfs_2_4;
forceImportRoot = lib.mkDefault false;
};
}; };
hardware.enableRedistributableFirmware = true; hardware.enableRedistributableFirmware = true;
@@ -40,17 +37,10 @@
nixpkgs = { nixpkgs = {
overlays = builtins.attrValues outputs.overlays; overlays = builtins.attrValues outputs.overlays;
config = { config.allowUnfree = true;
allowUnfree = true;
permittedInsecurePackages = [
"openssl-1.1.1w" # This is for discord-canary
];
};
}; };
services = { services = {
dbus.implementation = "dbus";
# firmware update # firmware update
fwupd.enable = true; fwupd.enable = true;
-1
View File
@@ -34,7 +34,6 @@ in
warn-dirty = false; warn-dirty = false;
flake-registry = ""; # disable global flake registries flake-registry = ""; # disable global flake registries
connect-timeout = 10; connect-timeout = 10;
download-buffer-size = 536870912;
fallback = true; fallback = true;
}; };
-256
View File
@@ -1,256 +0,0 @@
{
config,
lib,
pkgs,
...
}:
let
# Interface whose firewall rules get the exporter ports opened (used in the
# module body).
# NOTE(review): the "zt" prefix suggests a ZeroTier interface — confirm.
monitoringInterface = "ztwfunumly";
# Directory passed to node_exporter's textfile collector and written to by the
# zpool latency script.
nodeTextfileDir = "/var/lib/prometheus-node-exporter-textfile";
# Returns the process-exporter group-name template. When perPid is true the
# PID and start time are appended to the given template; otherwise the
# template is returned unchanged.
mkProcessNameTemplate =
  perPid: template: if perPid then "${template}:{{.PID}}:{{.StartTime}}" else template;
# Builds the process_names matcher list for process-exporter. Every entry
# names the group "<username>:<something>", where <something> comes from a
# named regex capture or the executable base name. Entries go from the most
# specific command-line pattern to a catch-all.
mkProcessMatchers = perPid: [
  {
    # "python -m <module>" started from the Nix store: group by module name.
    name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Module}}";
    cmdline = [ "^/nix/store[^ ]*/bin/python[^ ]* -m (?P<Module>[^ ]+)" ];
  }
  {
    # Python running a script from a store bin/ directory: group by the script
    # name, without a leading "." or a trailing "-wrapped".
    name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
    cmdline = [
      "^/nix/store[^ ]*/bin/python[^ ]* /nix/store[^ ]*/bin/\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)"
    ];
  }
  {
    # Node running something inside a store path: group by the package name
    # taken from the "<hash>-<name>-<version>" store directory.
    name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
    cmdline = [
      "^/nix/store[^ ]*/bin/node /nix/store[^ ]*-(?P<Wrapped>[A-Za-z0-9._+-]+)-[0-9][^ /]*/"
    ];
  }
  {
    # Any other store executable (directly in the store path, under bin/, or
    # under lib/<dir>/): group by its file name, without a leading "." or a
    # trailing "-wrapped".
    name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
    cmdline = [ "^/nix/store[^ ]*/(?:bin/|lib/[^ ]*/)?\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)" ];
  }
  {
    # Catch-all: any non-empty command line, grouped by executable base name.
    name = mkProcessNameTemplate perPid "{{.Username}}:{{.ExeBase}}";
    cmdline = [ ".+" ];
  }
];
# Config file for the per-PID exporter instance. It is serialised with toJSON
# but named ".yaml".
# NOTE(review): presumably relies on JSON being valid YAML — confirm.
perPidConfig = pkgs.writeText "process-exporter-per-pid.yaml" (
  builtins.toJSON {
    process_names = mkProcessMatchers true;
  }
);
# Script run by zpool-latency-exporter.service. It samples `zpool iostat`
# latency columns for one second and writes them as Prometheus gauges to
# <nodeTextfileDir>/zpool.prom for node_exporter's textfile collector.
#
# Output labels: host (this machine's hostName), pool, and vdev. Rows that
# describe the pool itself use vdev="_pool".
zpoolLatencyScript = pkgs.writeShellScript "zpool-latency-exporter" ''
  set -euo pipefail
  out_dir=${lib.escapeShellArg nodeTextfileDir}
  host=${lib.escapeShellArg config.networking.hostName}
  # The temp file lives in the target directory so the final mv is a rename
  # within one directory and readers never see a half-written file.
  tmp_file="$(mktemp "$out_dir/zpool.prom.XXXXXX")"
  trap 'rm -f "$tmp_file"' EXIT
  # Comma-separated pool names; awk uses them to tell pool rows from vdev rows.
  pools="$(zpool list -H -o name | paste -sd, -)"
  cat >"$tmp_file" <<'EOF'
  # HELP zpool_iostat_total_wait_read_ns Average total read wait time reported by zpool iostat.
  # TYPE zpool_iostat_total_wait_read_ns gauge
  # HELP zpool_iostat_total_wait_write_ns Average total write wait time reported by zpool iostat.
  # TYPE zpool_iostat_total_wait_write_ns gauge
  # HELP zpool_iostat_disk_wait_read_ns Average disk read wait time reported by zpool iostat.
  # TYPE zpool_iostat_disk_wait_read_ns gauge
  # HELP zpool_iostat_disk_wait_write_ns Average disk write wait time reported by zpool iostat.
  # TYPE zpool_iostat_disk_wait_write_ns gauge
  # HELP zpool_iostat_syncq_wait_read_ns Average synchronous queue read wait time reported by zpool iostat.
  # TYPE zpool_iostat_syncq_wait_read_ns gauge
  # HELP zpool_iostat_syncq_wait_write_ns Average synchronous queue write wait time reported by zpool iostat.
  # TYPE zpool_iostat_syncq_wait_write_ns gauge
  # HELP zpool_iostat_asyncq_wait_read_ns Average asynchronous queue read wait time reported by zpool iostat.
  # TYPE zpool_iostat_asyncq_wait_read_ns gauge
  # HELP zpool_iostat_asyncq_wait_write_ns Average asynchronous queue write wait time reported by zpool iostat.
  # TYPE zpool_iostat_asyncq_wait_write_ns gauge
  EOF
  # -H tab-separated, -p exact numbers, -l latency columns, -v per-vdev rows,
  # -y skip the since-boot summary; "1 1" is a single one-second sample.
  zpool iostat -Hplvy 1 1 | awk -F '\t' -v host="$host" -v pools="$pools" '
    # Escape backslashes and double quotes for use inside a label value.
    function esc(str, out) {
      out = str
      gsub(/\\/, "\\\\", out)
      gsub(/"/, "\\\"", out)
      return out
    }

    # Print one sample line; empty and "-" values are skipped.
    function emit(metric, pool, vdev, value) {
      if (value == "" || value == "-") {
        return
      }

      printf "%s{host=\"%s\",pool=\"%s\",vdev=\"%s\"} %s\n",
        metric,
        esc(host),
        esc(pool),
        esc(vdev),
        value
    }

    BEGIN {
      split(pools, pool_names, ",")
      for (idx in pool_names) {
        if (pool_names[idx] != "") {
          known_pools[pool_names[idx]] = 1
        }
      }
    }

    NF == 0 {
      next
    }

    {
      row_name = $1

      # A row named after a known pool starts that pool; every other row is
      # attributed as a vdev of the most recently seen pool.
      if (row_name in known_pools) {
        current_pool = row_name
        current_vdev = "_pool"
      } else if (current_pool == "") {
        next
      } else {
        current_vdev = row_name
      }

      emit("zpool_iostat_total_wait_read_ns", current_pool, current_vdev, $8)
      emit("zpool_iostat_total_wait_write_ns", current_pool, current_vdev, $9)
      emit("zpool_iostat_disk_wait_read_ns", current_pool, current_vdev, $10)
      emit("zpool_iostat_disk_wait_write_ns", current_pool, current_vdev, $11)
      emit("zpool_iostat_syncq_wait_read_ns", current_pool, current_vdev, $12)
      emit("zpool_iostat_syncq_wait_write_ns", current_pool, current_vdev, $13)
      emit("zpool_iostat_asyncq_wait_read_ns", current_pool, current_vdev, $14)
      emit("zpool_iostat_asyncq_wait_write_ns", current_pool, current_vdev, $15)
    }
  ' >>"$tmp_file"

  # mktemp creates the file with mode 0600 and this script runs as root, so
  # without this a node_exporter running as an unprivileged user could not
  # read the published metrics.
  chmod 0644 "$tmp_file"
  mv "$tmp_file" "$out_dir/zpool.prom"
  # The temp name no longer exists after the rename; disarm the cleanup trap.
  trap - EXIT
'';
in
# Module body: opens the exporter ports on the monitoring interface, enables
# the Prometheus exporters and atop, and defines two extra systemd units (a
# per-PID process exporter and the periodic zpool latency exporter).
{
  # 9257 is the per-PID process exporter defined below.
  # NOTE(review): the other ports are presumably the default listen ports of
  # the node (9100), zfs (9134), process (9256) and smartctl (9633) exporters
  # enabled below — confirm.
  networking.firewall.interfaces.${monitoringInterface}.allowedTCPPorts = [
    9100
    9134
    9256
    9257
    9633
  ];

  services.prometheus.exporters = {
    node = {
      enable = true;
      enabledCollectors = [
        "pressure"
        "processes"
        "systemd"
      ];
      # Pick up the *.prom files written by zpool-latency-exporter.
      extraFlags = [ "--collector.textfile.directory=${nodeTextfileDir}" ];
    };
    process = {
      enable = true;
      user = "root";
      group = "root";
      # Grouped (not per-PID) process names.
      settings.process_names = mkProcessMatchers false;
      extraFlags = [
        "-gather-smaps=false"
        "-remove-empty-groups=true"
        "-threads=false"
      ];
    };
    smartctl.enable = true;
    zfs.enable = true;
  };

  programs.atop = {
    enable = true;
    atopService.enable = true;
    atopRotateTimer.enable = true;
    atopacctService.enable = true;
    settings.interval = 30;
  };

  systemd = {
    services = {
      # Second process-exporter instance using the per-PID config, listening
      # on port 9257.
      prometheus-process-pid-exporter = {
        description = "Prometheus process exporter with per-PID naming";
        wantedBy = [ "multi-user.target" ];
        after = [ "network.target" ];
        serviceConfig = {
          # All lines share one indentation level so the rendered command is
          # unchanged once Nix strips the common leading whitespace.
          ExecStart = ''
            ${pkgs.prometheus-process-exporter}/bin/process-exporter \
            --web.listen-address 0.0.0.0:9257 \
            --config.path ${perPidConfig} \
            -children=false \
            -gather-smaps=false \
            -remove-empty-groups=true \
            -threads=false
          '';
          User = "root";
          Group = "root";
          Restart = "always";
          WorkingDirectory = "/tmp";
          # systemd sandboxing options for the unit.
          CapabilityBoundingSet = [ "" ];
          DeviceAllow = [ "" ];
          LockPersonality = true;
          MemoryDenyWriteExecute = true;
          NoNewPrivileges = true;
          PrivateDevices = true;
          PrivateTmp = true;
          ProtectClock = true;
          ProtectControlGroups = true;
          ProtectHome = true;
          ProtectHostname = true;
          ProtectKernelLogs = true;
          ProtectKernelModules = true;
          ProtectKernelTunables = true;
          ProtectSystem = "strict";
          RemoveIPC = true;
          RestrictAddressFamilies = [
            "AF_INET"
            "AF_INET6"
          ];
          RestrictNamespaces = true;
          RestrictRealtime = true;
          RestrictSUIDSGID = true;
          SystemCallArchitectures = "native";
          UMask = "0077";
        };
      };
      # One-shot unit that runs the zpool latency script; it is started by the
      # timer defined below.
      zpool-latency-exporter = {
        description = "Exports ZFS latency metrics for node_exporter textfile collection";
        after = [ "zfs-import.target" ];
        requires = [ "zfs-import.target" ];
        # Tools put on the script's PATH.
        path = [
          config.boot.zfs.package
          pkgs.coreutils
          pkgs.gawk
        ];
        serviceConfig = {
          Type = "oneshot";
          ExecStart = zpoolLatencyScript;
        };
      };
    };
    # First run 2 minutes after boot, then 60 seconds after each activation of
    # the service.
    timers.zpool-latency-exporter = {
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnBootSec = "2m";
        OnUnitActiveSec = "60s";
        Unit = "zpool-latency-exporter.service";
      };
    };
    # Make sure the textfile directory exists (root-owned, mode 0755).
    tmpfiles.rules = [ "d ${nodeTextfileDir} 0755 root root - -" ];
  };
}
+1 -1
View File
@@ -12,7 +12,7 @@
brain.id = "SSCGIPI-IV3VYKB-TRNIJE3-COV4T2H-CDBER7F-I2CGHYA-NWOEUDU-3T5QAAN"; # cspell:disable-line brain.id = "SSCGIPI-IV3VYKB-TRNIJE3-COV4T2H-CDBER7F-I2CGHYA-NWOEUDU-3T5QAAN"; # cspell:disable-line
ipad.id = "KI76T3X-SFUGV2L-VSNYTKR-TSIUV5L-SHWD3HE-GQRGRCN-GY4UFMD-CW6Z6AX"; # cspell:disable-line ipad.id = "KI76T3X-SFUGV2L-VSNYTKR-TSIUV5L-SHWD3HE-GQRGRCN-GY4UFMD-CW6Z6AX"; # cspell:disable-line
jeeves.id = "ICRHXZW-ECYJCUZ-I4CZ64R-3XRK7CG-LL2HAAK-FGOHD22-BQA4AI6-5OAL6AG"; # cspell:disable-line jeeves.id = "ICRHXZW-ECYJCUZ-I4CZ64R-3XRK7CG-LL2HAAK-FGOHD22-BQA4AI6-5OAL6AG"; # cspell:disable-line
phone.id = "JPVQKQW-CFXOJXT-Q5G5F3H-QIDHDRE-GKHPTQB-GXZUQSP-U7FR7F7-INP3AAH"; # cspell:disable-line phone.id = "TBRULKD-7DZPGGZ-F6LLB7J-MSO54AY-7KLPBIN-QOFK6PX-W2HBEWI-PHM2CQI"; # cspell:disable-line
rhapsody-in-green.id = "ASL3KC4-3XEN6PA-7BQBRKE-A7JXLI6-DJT43BY-Q4WPOER-7UALUAZ-VTPQ6Q4"; # cspell:disable-line rhapsody-in-green.id = "ASL3KC4-3XEN6PA-7BQBRKE-A7JXLI6-DJT43BY-Q4WPOER-7UALUAZ-VTPQ6Q4"; # cspell:disable-line
}; };
}; };
+1 -1
View File
@@ -4,7 +4,7 @@
flags = [ "--accept-flake-config" ]; flags = [ "--accept-flake-config" ];
randomizedDelaySec = "1h"; randomizedDelaySec = "1h";
persistent = true; persistent = true;
flake = "git+https://gitea.tmmworkshop.com/richie/dotfiles?ref=main"; flake = "github:RichieCahill/dotfiles";
allowReboot = true; allowReboot = true;
dates = "Sat *-*-* 06:00:00"; dates = "Sat *-*-* 06:00:00";
}; };
-76
View File
@@ -1,76 +0,0 @@
# ZFS failed root import recovery
## Fast path
If the machine fails to boot because ZFS refuses to import `root_pool`:
### GRUB
1. At the bootloader menu, select the normal NixOS entry.
2. Press `e`.
3. Find the line that starts with `linux`.
4. Append this to the end of that line:
```text
zfs_force=1
```
5. Boot once with `Ctrl+x` or `F10`.
### systemd-boot
1. At the bootloader menu, highlight the normal NixOS entry.
2. Press `e`.
3. Append this to the end of the options line:
```text
zfs_force=1
```
4. Press `Enter` to boot once.
## After boot
Run:
```bash
sudo zpool status
sudo zpool import
journalctl -b | rg "ZFS|zfs|import|root_pool"
```
## Expected result
`sudo zpool status` should show `root_pool` as `ONLINE`.
## Reboot test
Run:
```bash
sudo reboot
```
Do not add `zfs_force=1` the second time.
## If it still fails
Boot once more with:
```text
zfs_force=1
```
Then run:
```bash
sudo zpool status -v
sudo zpool history | tail -n 50
journalctl -b | rg "ZFS|zfs|import|root_pool"
```
## Notes
- Root pool name is `root_pool`.
- This is a one-time recovery path after disk moves, controller changes, dirty exports, or interrupted imports.
- Some hosts also need the LUKS unlock USB key inserted before boot.
Generated
+26 -42
View File
@@ -8,11 +8,11 @@
}, },
"locked": { "locked": {
"dir": "pkgs/firefox-addons", "dir": "pkgs/firefox-addons",
"lastModified": 1780733803, "lastModified": 1773979456,
"narHash": "sha256-QBJPq12P1DAXFGezoEJaSO/xPUrPlnaI3ddSaMG2JpM=", "narHash": "sha256-9kBMJ5IvxqNlkkj/swmE8uK1Sc7TL/LIRUI958m7uBM=",
"owner": "rycee", "owner": "rycee",
"repo": "nur-expressions", "repo": "nur-expressions",
"rev": "c80b0aa94392c5f3612ac797108f6d952752036d", "rev": "81e28f47ac18d9e89513929c77e711e657b64851",
"type": "gitlab" "type": "gitlab"
}, },
"original": { "original": {
@@ -29,11 +29,11 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1780679734, "lastModified": 1774007980,
"narHash": "sha256-KmRNvpNOb7QEORa06bVgjW9kITcx0VhsI7w0vhmZyD8=", "narHash": "sha256-FOnZjElEI8pqqCvB6K/1JRHTE8o4rer8driivTpq2uo=",
"owner": "nix-community", "owner": "nix-community",
"repo": "home-manager", "repo": "home-manager",
"rev": "b2b7db486e06e098711dc291bb25db82850e1d16", "rev": "9670de2921812bc4e0452f6e3efd8c859696c183",
"type": "github" "type": "github"
}, },
"original": { "original": {
@@ -43,15 +43,12 @@
} }
}, },
"nixos-hardware": { "nixos-hardware": {
"inputs": {
"nixpkgs": "nixpkgs"
},
"locked": { "locked": {
"lastModified": 1780310866, "lastModified": 1774018263,
"narHash": "sha256-fPBRVf6A5xlACYcOI59shGrjURuvwu0lRsDoSCEXt/I=", "narHash": "sha256-HHYEwK1A22aSaxv2ibhMMkKvrDGKGlA/qObG4smrSqc=",
"owner": "nixos", "owner": "nixos",
"repo": "nixos-hardware", "repo": "nixos-hardware",
"rev": "4ed851c979641e28597a05086332d75cdc9e395f", "rev": "2d4b4717b2534fad5c715968c1cece04a172b365",
"type": "github" "type": "github"
}, },
"original": { "original": {
@@ -63,24 +60,27 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1767892417, "lastModified": 1773821835,
"narHash": "sha256-8bW3q88CEg2u4hSP66Vf4lpbLonHz7hqDNBMcCY7E9U=", "narHash": "sha256-TJ3lSQtW0E2JrznGVm8hOQGVpXjJyXY2guAxku2O9A4=",
"rev": "3497aa5c9457a9d88d71fa93a4a8368816fbeeba", "owner": "nixos",
"type": "tarball", "repo": "nixpkgs",
"url": "https://releases.nixos.org/nixos/unstable/nixos-26.05pre924538.3497aa5c9457/nixexprs.tar.xz" "rev": "b40629efe5d6ec48dd1efba650c797ddbd39ace0",
"type": "github"
}, },
"original": { "original": {
"type": "tarball", "owner": "nixos",
"url": "https://channels.nixos.org/nixos-unstable/nixexprs.tar.xz" "ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
} }
}, },
"nixpkgs-master": { "nixpkgs-master": {
"locked": { "locked": {
"lastModified": 1780798858, "lastModified": 1774051532,
"narHash": "sha256-4KLc5ZMjfMQosXA2JasUgZTk3i+c/i1zMH4custtmI0=", "narHash": "sha256-d3CGMweyYIcPuTj5BKq+1Lx4zwlgL31nVtN647tOZKo=",
"owner": "nixos", "owner": "nixos",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "92840095e65b9970125843175f4be974b71a92ad", "rev": "8620c0b5cc8fbe76502442181be1d0514bc3a1b7",
"type": "github" "type": "github"
}, },
"original": { "original": {
@@ -106,28 +106,12 @@
"type": "github" "type": "github"
} }
}, },
"nixpkgs_2": {
"locked": {
"lastModified": 1780243769,
"narHash": "sha256-x5UQuRsH3MqI0U9afaXSNqzTPSeZlRLvFAav2Ux1pNw=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "331800de5053fcebacf6813adb5db9c9dca22a0c",
"type": "github"
},
"original": {
"owner": "nixos",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": { "root": {
"inputs": { "inputs": {
"firefox-addons": "firefox-addons", "firefox-addons": "firefox-addons",
"home-manager": "home-manager", "home-manager": "home-manager",
"nixos-hardware": "nixos-hardware", "nixos-hardware": "nixos-hardware",
"nixpkgs": "nixpkgs_2", "nixpkgs": "nixpkgs",
"nixpkgs-master": "nixpkgs-master", "nixpkgs-master": "nixpkgs-master",
"nixpkgs-stable": "nixpkgs-stable", "nixpkgs-stable": "nixpkgs-stable",
"sops-nix": "sops-nix", "sops-nix": "sops-nix",
@@ -141,11 +125,11 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1780547341, "lastModified": 1773889674,
"narHash": "sha256-Gq8KNx5A7hBB3uGJaj6eQfLDIz5YdLu92gqBcvHvoUo=", "narHash": "sha256-+ycaiVAk3MEshJTg35cBTUa0MizGiS+bgpYw/f8ohkg=",
"owner": "Mic92", "owner": "Mic92",
"repo": "sops-nix", "repo": "sops-nix",
"rev": "9ed65852b6257fbeae4355bc24ecfea307ca759a", "rev": "29b6519f3e0780452bca0ac0be4584f04ac16cc5",
"type": "github" "type": "github"
}, },
"original": { "original": {
+24
View File
@@ -0,0 +1,24 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
+1 -1
View File
@@ -21,6 +21,7 @@
alembic alembic
apprise apprise
apscheduler apscheduler
confluent-kafka
fastapi fastapi
fastapi-cli fastapi-cli
httpx httpx
@@ -41,7 +42,6 @@
sqlalchemy sqlalchemy
tenacity tenacity
textual textual
tiktoken
tinytuya tinytuya
typer typer
websockets websockets
+3 -3
View File
@@ -26,7 +26,6 @@ dependencies = [
[project.scripts] [project.scripts]
database = "python.database_cli:app" database = "python.database_cli:app"
van-inventory = "python.van_inventory.main:serve" van-inventory = "python.van_inventory.main:serve"
whisper-transcribe = "python.tools.whisper.transcribe:main"
[dependency-groups] [dependency-groups]
dev = [ dev = [
@@ -51,7 +50,6 @@ lint.ignore = [
"COM812", # (TEMP) conflicts when used with the formatter "COM812", # (TEMP) conflicts when used with the formatter
"ISC001", # (TEMP) conflicts when used with the formatter "ISC001", # (TEMP) conflicts when used with the formatter
"S603", # (PERM) This is known to cause a false positive "S603", # (PERM) This is known to cause a false positive
"S607", # (PERM) This is becoming a consistent annoyance
] ]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
@@ -80,7 +78,9 @@ lint.ignore = [
"python/congress_tracker/**" = [ "python/congress_tracker/**" = [
"TC003", # (perm) this creates issues because sqlalchemy uses these at runtime "TC003", # (perm) this creates issues because sqlalchemy uses these at runtime
] ]
"python/eval_warnings/**" = [
"S607", # (perm) gh and git are expected on PATH in the runner environment
]
"python/alembic/**" = [ "python/alembic/**" = [
"INP001", # (perm) this creates LSP issues for alembic "INP001", # (perm) this creates LSP issues for alembic
] ]
@@ -46,7 +46,12 @@ ALREADY_ATTACHED_QUERY = text("""
def upgrade() -> None: def upgrade() -> None:
"""Attach all weekly partition tables to the posts parent table.""" """Attach all weekly partition tables to the posts parent table."""
connection = op.get_bind() connection = op.get_bind()
already_attached = {row[0] for row in connection.execute(ALREADY_ATTACHED_QUERY, {"parent": f"{schema}.posts"})} already_attached = {
row[0]
for row in connection.execute(
ALREADY_ATTACHED_QUERY, {"parent": f"{schema}.posts"}
)
}
for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1): for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1):
for week in range(1, iso_weeks_in_year(year) + 1): for week in range(1, iso_weeks_in_year(year) + 1):
@@ -69,4 +74,7 @@ def downgrade() -> None:
for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1): for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1):
for week in range(1, iso_weeks_in_year(year) + 1): for week in range(1, iso_weeks_in_year(year) + 1):
table_name = f"posts_{year}_{week:02d}" table_name = f"posts_{year}_{week:02d}"
op.execute(f"ALTER TABLE {schema}.posts DETACH PARTITION {schema}.{table_name}") op.execute(
f"ALTER TABLE {schema}.posts "
f"DETACH PARTITION {schema}.{table_name}"
)
@@ -1,153 +0,0 @@
"""adding congress data.
Revision ID: 83bfc8af92d8
Revises: a1b2c3d4e5f6
Create Date: 2026-03-27 10:43:02.324510
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from python.orm import DataScienceDevBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
# `revision` is this migration's id; `down_revision` is the migration it is
# applied on top of.
revision: str = "83bfc8af92d8"
down_revision: str | None = "a1b2c3d4e5f6"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Database schema that every table and index in this migration is created in.
schema = DataScienceDevBase.schema_name
def upgrade() -> None:
    """Create the congress tables: bill, legislator, bill_text, vote and vote_record."""

    def audit_columns() -> list[sa.Column]:
        """Return fresh id/created/updated columns (a Column can belong to one table only)."""
        return [
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
            sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        ]

    # Foreign-key targets, qualified with the schema.
    bill_pk = f"{schema}.bill.id"
    legislator_pk = f"{schema}.legislator.id"
    vote_pk = f"{schema}.vote.id"

    # --- bill -------------------------------------------------------------
    op.create_table(
        "bill",
        sa.Column("congress", sa.Integer(), nullable=False),
        sa.Column("bill_type", sa.String(), nullable=False),
        sa.Column("number", sa.Integer(), nullable=False),
        sa.Column("title", sa.String(), nullable=True),
        sa.Column("title_short", sa.String(), nullable=True),
        sa.Column("official_title", sa.String(), nullable=True),
        sa.Column("status", sa.String(), nullable=True),
        sa.Column("status_at", sa.Date(), nullable=True),
        sa.Column("sponsor_bioguide_id", sa.String(), nullable=True),
        sa.Column("subjects_top_term", sa.String(), nullable=True),
        *audit_columns(),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_bill")),
        sa.UniqueConstraint("congress", "bill_type", "number", name="uq_bill_congress_type_number"),
        schema=schema,
    )
    op.create_index("ix_bill_congress", "bill", ["congress"], unique=False, schema=schema)

    # --- legislator -------------------------------------------------------
    op.create_table(
        "legislator",
        sa.Column("bioguide_id", sa.Text(), nullable=False),
        sa.Column("thomas_id", sa.String(), nullable=True),
        sa.Column("lis_id", sa.String(), nullable=True),
        sa.Column("govtrack_id", sa.Integer(), nullable=True),
        sa.Column("opensecrets_id", sa.String(), nullable=True),
        sa.Column("fec_ids", sa.String(), nullable=True),
        sa.Column("first_name", sa.String(), nullable=False),
        sa.Column("last_name", sa.String(), nullable=False),
        sa.Column("official_full_name", sa.String(), nullable=True),
        sa.Column("nickname", sa.String(), nullable=True),
        sa.Column("birthday", sa.Date(), nullable=True),
        sa.Column("gender", sa.String(), nullable=True),
        sa.Column("current_party", sa.String(), nullable=True),
        sa.Column("current_state", sa.String(), nullable=True),
        sa.Column("current_district", sa.Integer(), nullable=True),
        sa.Column("current_chamber", sa.String(), nullable=True),
        *audit_columns(),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator")),
        schema=schema,
    )
    op.create_index(op.f("ix_legislator_bioguide_id"), "legislator", ["bioguide_id"], unique=True, schema=schema)

    # --- bill_text (rows are removed together with their bill) -------------
    op.create_table(
        "bill_text",
        sa.Column("bill_id", sa.Integer(), nullable=False),
        sa.Column("version_code", sa.String(), nullable=False),
        sa.Column("version_name", sa.String(), nullable=True),
        sa.Column("text_content", sa.String(), nullable=True),
        sa.Column("date", sa.Date(), nullable=True),
        *audit_columns(),
        sa.ForeignKeyConstraint(["bill_id"], [bill_pk], name=op.f("fk_bill_text_bill_id_bill"), ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_bill_text")),
        sa.UniqueConstraint("bill_id", "version_code", name="uq_bill_text_bill_id_version_code"),
        schema=schema,
    )

    # --- vote -------------------------------------------------------------
    op.create_table(
        "vote",
        sa.Column("congress", sa.Integer(), nullable=False),
        sa.Column("chamber", sa.String(), nullable=False),
        sa.Column("session", sa.Integer(), nullable=False),
        sa.Column("number", sa.Integer(), nullable=False),
        sa.Column("vote_type", sa.String(), nullable=True),
        sa.Column("question", sa.String(), nullable=True),
        sa.Column("result", sa.String(), nullable=True),
        sa.Column("result_text", sa.String(), nullable=True),
        sa.Column("vote_date", sa.Date(), nullable=False),
        sa.Column("yea_count", sa.Integer(), nullable=True),
        sa.Column("nay_count", sa.Integer(), nullable=True),
        sa.Column("not_voting_count", sa.Integer(), nullable=True),
        sa.Column("present_count", sa.Integer(), nullable=True),
        sa.Column("bill_id", sa.Integer(), nullable=True),
        *audit_columns(),
        sa.ForeignKeyConstraint(["bill_id"], [bill_pk], name=op.f("fk_vote_bill_id_bill")),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_vote")),
        sa.UniqueConstraint("congress", "chamber", "session", "number", name="uq_vote_congress_chamber_session_number"),
        schema=schema,
    )
    op.create_index("ix_vote_congress_chamber", "vote", ["congress", "chamber"], unique=False, schema=schema)
    op.create_index("ix_vote_date", "vote", ["vote_date"], unique=False, schema=schema)

    # --- vote_record (composite key; no id/created/updated columns) --------
    op.create_table(
        "vote_record",
        sa.Column("vote_id", sa.Integer(), nullable=False),
        sa.Column("legislator_id", sa.Integer(), nullable=False),
        sa.Column("position", sa.String(), nullable=False),
        sa.ForeignKeyConstraint(
            ["legislator_id"],
            [legislator_pk],
            name=op.f("fk_vote_record_legislator_id_legislator"),
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(["vote_id"], [vote_pk], name=op.f("fk_vote_record_vote_id_vote"), ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("vote_id", "legislator_id", name=op.f("pk_vote_record")),
        schema=schema,
    )
def downgrade() -> None:
    """Drop the congress tables and their indexes, referencing tables before referenced ones."""
    op.drop_table("vote_record", schema=schema)

    for vote_index in ("ix_vote_date", "ix_vote_congress_chamber"):
        op.drop_index(vote_index, table_name="vote", schema=schema)
    for bill_dependent in ("vote", "bill_text"):
        op.drop_table(bill_dependent, schema=schema)

    op.drop_index(op.f("ix_legislator_bioguide_id"), table_name="legislator", schema=schema)
    op.drop_table("legislator", schema=schema)

    op.drop_index("ix_bill_congress", table_name="bill", schema=schema)
    op.drop_table("bill", schema=schema)
@@ -1,58 +0,0 @@
"""adding LegislatorSocialMedia.
Revision ID: 5cd7eee3549d
Revises: 83bfc8af92d8
Create Date: 2026-03-29 11:53:44.224799
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from python.orm import DataScienceDevBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
# `revision` is this migration's id; `down_revision` is the migration it is
# applied on top of.
revision: str = "5cd7eee3549d"
down_revision: str | None = "83bfc8af92d8"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Database schema that the table in this migration is created in.
schema = DataScienceDevBase.schema_name
def upgrade() -> None:
    """Create the legislator_social_media table, linked to legislator by foreign key."""
    table_columns = [
        sa.Column("legislator_id", sa.Integer(), nullable=False),
        sa.Column("platform", sa.String(), nullable=False),
        sa.Column("account_name", sa.String(), nullable=False),
        sa.Column("url", sa.String(), nullable=True),
        sa.Column("source", sa.String(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
    ]
    table_constraints = [
        sa.ForeignKeyConstraint(
            ["legislator_id"],
            [f"{schema}.legislator.id"],
            name=op.f("fk_legislator_social_media_legislator_id_legislator"),
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator_social_media")),
    ]
    op.create_table("legislator_social_media", *table_columns, *table_constraints, schema=schema)
def downgrade() -> None:
    """Drop the legislator_social_media table created by upgrade()."""
    social_media_table = "legislator_social_media"
    op.drop_table(social_media_table, schema=schema)
+1
View File
@@ -3,6 +3,7 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
import re
import sys import sys
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal from typing import TYPE_CHECKING, Any, Literal
@@ -1,187 +0,0 @@
"""removed ds table from richie DB.
Revision ID: c8a794340928
Revises: 6b275323f435
Create Date: 2026-03-29 15:29:23.643146
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
from python.orm import RichieBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "c8a794340928"
down_revision: str | None = "6b275323f435"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = RichieBase.schema_name
def upgrade() -> None:
    """Drop the congress tables (vote_record, legislator, vote, bill) and their indexes."""

    def drop_index(index_name: str, table_name: str) -> None:
        """Drop one named index of ``table_name`` in this migration's schema."""
        op.drop_index(op.f(index_name), table_name=table_name, schema=schema)

    def drop_table(table_name: str) -> None:
        """Drop ``table_name`` from this migration's schema."""
        op.drop_table(table_name, schema=schema)

    # ### commands auto generated by Alembic - please adjust! ###
    # vote_record goes first: it holds foreign keys to both vote and legislator.
    drop_table("vote_record")
    drop_index("ix_vote_congress_chamber", "vote")
    drop_index("ix_vote_date", "vote")
    drop_index("ix_legislator_bioguide_id", "legislator")
    drop_table("legislator")
    # vote references bill, so it must be removed before bill.
    drop_table("vote")
    drop_index("ix_bill_congress", "bill")
    drop_table("bill")
    # ### end Alembic commands ###
def downgrade() -> None:
    """Recreate the congress tables dropped by :func:`upgrade`.

    Tables are created parents-first so every foreign key targets a table that
    already exists: ``bill`` and ``legislator`` (no outgoing foreign keys), then
    ``vote`` (references ``bill``), then ``vote_record`` (references ``vote`` and
    ``legislator``). The auto-generated order created ``vote`` and ``vote_record``
    before the tables they reference.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "bill",
        sa.Column("congress", sa.INTEGER(), autoincrement=False, nullable=False),
        sa.Column("bill_type", sa.VARCHAR(), autoincrement=False, nullable=False),
        sa.Column("number", sa.INTEGER(), autoincrement=False, nullable=False),
        sa.Column("title", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("title_short", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("official_title", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("status", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("status_at", sa.DATE(), autoincrement=False, nullable=True),
        sa.Column("sponsor_bioguide_id", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("subjects_top_term", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("id", sa.INTEGER(), autoincrement=True, nullable=False),
        sa.Column(
            "created",
            postgresql.TIMESTAMP(timezone=True),
            server_default=sa.text("now()"),
            autoincrement=False,
            nullable=False,
        ),
        sa.Column(
            "updated",
            postgresql.TIMESTAMP(timezone=True),
            server_default=sa.text("now()"),
            autoincrement=False,
            nullable=False,
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_bill")),
        sa.UniqueConstraint(
            "congress",
            "bill_type",
            "number",
            name=op.f("uq_bill_congress_type_number"),
            postgresql_include=[],
            postgresql_nulls_not_distinct=False,
        ),
        schema=schema,
    )
    op.create_index(op.f("ix_bill_congress"), "bill", ["congress"], unique=False, schema=schema)
    op.create_table(
        "legislator",
        sa.Column("bioguide_id", sa.TEXT(), autoincrement=False, nullable=False),
        sa.Column("thomas_id", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("lis_id", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("govtrack_id", sa.INTEGER(), autoincrement=False, nullable=True),
        sa.Column("opensecrets_id", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("fec_ids", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("first_name", sa.VARCHAR(), autoincrement=False, nullable=False),
        sa.Column("last_name", sa.VARCHAR(), autoincrement=False, nullable=False),
        sa.Column("official_full_name", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("nickname", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("birthday", sa.DATE(), autoincrement=False, nullable=True),
        sa.Column("gender", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("current_party", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("current_state", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("current_district", sa.INTEGER(), autoincrement=False, nullable=True),
        sa.Column("current_chamber", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("id", sa.INTEGER(), autoincrement=True, nullable=False),
        sa.Column(
            "created",
            postgresql.TIMESTAMP(timezone=True),
            server_default=sa.text("now()"),
            autoincrement=False,
            nullable=False,
        ),
        sa.Column(
            "updated",
            postgresql.TIMESTAMP(timezone=True),
            server_default=sa.text("now()"),
            autoincrement=False,
            nullable=False,
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator")),
        schema=schema,
    )
    op.create_index(op.f("ix_legislator_bioguide_id"), "legislator", ["bioguide_id"], unique=True, schema=schema)
    # vote has a foreign key to bill, which now exists.
    op.create_table(
        "vote",
        sa.Column("congress", sa.INTEGER(), autoincrement=False, nullable=False),
        sa.Column("chamber", sa.VARCHAR(), autoincrement=False, nullable=False),
        sa.Column("session", sa.INTEGER(), autoincrement=False, nullable=False),
        sa.Column("number", sa.INTEGER(), autoincrement=False, nullable=False),
        sa.Column("vote_type", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("question", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("result", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("result_text", sa.VARCHAR(), autoincrement=False, nullable=True),
        sa.Column("vote_date", sa.DATE(), autoincrement=False, nullable=False),
        sa.Column("yea_count", sa.INTEGER(), autoincrement=False, nullable=True),
        sa.Column("nay_count", sa.INTEGER(), autoincrement=False, nullable=True),
        sa.Column("not_voting_count", sa.INTEGER(), autoincrement=False, nullable=True),
        sa.Column("present_count", sa.INTEGER(), autoincrement=False, nullable=True),
        sa.Column("bill_id", sa.INTEGER(), autoincrement=False, nullable=True),
        sa.Column("id", sa.INTEGER(), autoincrement=True, nullable=False),
        sa.Column(
            "created",
            postgresql.TIMESTAMP(timezone=True),
            server_default=sa.text("now()"),
            autoincrement=False,
            nullable=False,
        ),
        sa.Column(
            "updated",
            postgresql.TIMESTAMP(timezone=True),
            server_default=sa.text("now()"),
            autoincrement=False,
            nullable=False,
        ),
        sa.ForeignKeyConstraint(["bill_id"], [f"{schema}.bill.id"], name=op.f("fk_vote_bill_id_bill")),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_vote")),
        sa.UniqueConstraint(
            "congress",
            "chamber",
            "session",
            "number",
            name=op.f("uq_vote_congress_chamber_session_number"),
            postgresql_include=[],
            postgresql_nulls_not_distinct=False,
        ),
        schema=schema,
    )
    op.create_index(op.f("ix_vote_date"), "vote", ["vote_date"], unique=False, schema=schema)
    op.create_index(op.f("ix_vote_congress_chamber"), "vote", ["congress", "chamber"], unique=False, schema=schema)
    # vote_record references both vote and legislator, so it is created last.
    op.create_table(
        "vote_record",
        sa.Column("vote_id", sa.INTEGER(), autoincrement=False, nullable=False),
        sa.Column("legislator_id", sa.INTEGER(), autoincrement=False, nullable=False),
        sa.Column("position", sa.VARCHAR(), autoincrement=False, nullable=False),
        sa.ForeignKeyConstraint(
            ["legislator_id"],
            [f"{schema}.legislator.id"],
            name=op.f("fk_vote_record_legislator_id_legislator"),
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["vote_id"], [f"{schema}.vote.id"], name=op.f("fk_vote_record_vote_id_vote"), ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("vote_id", "legislator_id", name=op.f("pk_vote_record")),
        schema=schema,
    )
    # ### end Alembic commands ###
@@ -1,93 +0,0 @@
"""adding audiobook libreary metadata.
Revision ID: d7864d1ffc17
Revises: c8a794340928
Create Date: 2026-06-03 20:24:09.200837
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from python.orm import RichieBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "d7864d1ffc17"
down_revision: str | None = "c8a794340928"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = RichieBase.schema_name
def upgrade() -> None:
    """Create the audiobook_author, audiobook_series and audiobook tables (in that order)."""

    def trailing_columns() -> list[sa.Column]:
        """Build fresh ``id``/``created``/``updated`` columns; each table ends with these."""
        return [
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
            sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
        ]

    author_pk = f"{schema}.audiobook_author.id"
    series_pk = f"{schema}.audiobook_series.id"

    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "audiobook_author",
        sa.Column("name", sa.String(), nullable=False),
        *trailing_columns(),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook_author")),
        sa.UniqueConstraint("name", name=op.f("uq_audiobook_author_name")),
        schema=schema,
    )
    op.create_table(
        "audiobook_series",
        sa.Column("name", sa.String(), nullable=False),
        sa.Column("author_id", sa.Integer(), nullable=False),
        *trailing_columns(),
        sa.ForeignKeyConstraint(
            ["author_id"],
            [author_pk],
            name=op.f("fk_audiobook_series_author_id_audiobook_author"),
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook_series")),
        sa.UniqueConstraint("author_id", "name", name=op.f("uq_audiobook_series_author_id")),
        schema=schema,
    )
    op.create_table(
        "audiobook",
        sa.Column("title", sa.String(), nullable=False),
        sa.Column("author_id", sa.Integer(), nullable=False),
        sa.Column("series_id", sa.Integer(), nullable=True),
        sa.Column("series_index", sa.Integer(), nullable=False),
        *trailing_columns(),
        sa.ForeignKeyConstraint(
            ["author_id"],
            [author_pk],
            name=op.f("fk_audiobook_author_id_audiobook_author"),
            ondelete="CASCADE",
        ),
        sa.ForeignKeyConstraint(
            ["series_id"],
            [series_pk],
            name=op.f("fk_audiobook_series_id_audiobook_series"),
            ondelete="SET NULL",
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook")),
        schema=schema,
    )
    # ### end Alembic commands ###
def downgrade() -> None:
    """Drop the audiobook tables, children before the tables they reference."""
    # ### commands auto generated by Alembic - please adjust! ###
    for table_name in ("audiobook", "audiobook_series", "audiobook_author"):
        op.drop_table(table_name, schema=schema)
    # ### end Alembic commands ###
+104
View File
@@ -0,0 +1,104 @@
"""Utilities for converting Bluesky identifiers to numeric database IDs.
Handles DID-to-user_id hashing, TID-to-post_id decoding, and AT-URI parsing.
"""
from __future__ import annotations
import hashlib
# Base32 alphabet used to decode TIDs; a character's position in this string is its 5-bit value.
TID_CHARSET = "234567abcdefghijklmnopqrstuvwxyz"
# Exact number of characters tid_to_integer requires.
_TID_LENGTH = 13
# Keeps the low 63 bits so hashed IDs stay in the non-negative signed 64-bit range.
_BIGINT_MASK = 0x7FFFFFFFFFFFFFFF
# Number of "/"-separated parts parse_at_uri expects after the "at://" prefix (did, collection, rkey).
_AT_URI_SEGMENT_COUNT = 3
def did_to_user_id(did: str) -> int:
    """Map a DID string to a deterministic, non-negative 63-bit user_id.

    The DID is hashed with SHA-256; the leading 8 bytes of the digest are read
    as a big-endian integer and masked down to 63 bits.

    Args:
        did: A Bluesky DID string, e.g. "did:plc:abc123".

    Returns:
        A non-negative 63-bit integer suitable for BigInteger storage.
    """
    # 16 hex characters == the first 8 digest bytes, most significant first.
    leading_hex = hashlib.sha256(did.encode()).hexdigest()[:16]
    return int(leading_hex, 16) & _BIGINT_MASK
def tid_to_integer(tid: str) -> int:
    """Decode a Bluesky TID (base32-sortable) into an integer for post_id.

    Each of the 13 characters contributes 5 bits, most significant first, so a
    syntactically valid string can carry up to 65 bits. Values that do not fit
    in the non-negative signed 64-bit range are rejected, because they cannot
    be stored in a BigInteger column.

    Args:
        tid: A 13-character TID string, e.g. "3abc2defghijk".

    Returns:
        A non-negative integer no larger than 2**63 - 1.

    Raises:
        ValueError: If the TID has the wrong length, contains a character
            outside TID_CHARSET, or decodes to a value above 2**63 - 1.
    """
    if len(tid) != _TID_LENGTH:
        message = f"TID must be {_TID_LENGTH} characters, got {len(tid)}: {tid!r}"
        raise ValueError(message)

    result = 0
    for char in tid:
        index = TID_CHARSET.find(char)
        if index == -1:
            message = f"Invalid character {char!r} in TID {tid!r}"
            raise ValueError(message)
        result = result * 32 + index

    # 13 * 5 = 65 bits of capacity: guard the documented BigInteger contract.
    if result > _BIGINT_MASK:
        message = f"TID {tid!r} decodes to a value outside the signed 64-bit range"
        raise ValueError(message)
    return result
def parse_at_uri(uri: str) -> tuple[str, str, str]:
    """Split an AT-URI into ``(did, collection, rkey)``.

    A leading "at://" is removed if present, then the remainder is split on
    the first two "/" characters; anything after the second "/" stays in rkey.

    Args:
        uri: An AT-URI string, e.g. "at://did:plc:abc123/app.bsky.feed.post/3abc2defghijk".

    Returns:
        A tuple of (did, collection, rkey).

    Raises:
        ValueError: If the split does not yield exactly three parts.
    """
    segments = uri.removeprefix("at://").split("/", maxsplit=2)
    if len(segments) == _AT_URI_SEGMENT_COUNT:
        did, collection, rkey = segments
        return did, collection, rkey
    message = f"Expected {_AT_URI_SEGMENT_COUNT} path segments in AT-URI, got {len(segments)}: {uri!r}"
    raise ValueError(message)
def post_id_from_uri(uri: str) -> int:
    """Decode the record key of an AT-URI into a numeric post_id.

    Args:
        uri: An AT-URI pointing to a post.

    Returns:
        The post_id as an integer (the rkey decoded by ``tid_to_integer``).
    """
    record_key = parse_at_uri(uri)[2]
    return tid_to_integer(record_key)
def user_id_from_uri(uri: str) -> int:
    """Hash the DID portion of an AT-URI into a numeric user_id.

    Args:
        uri: An AT-URI pointing to a post.

    Returns:
        The user_id as an integer (the DID hashed by ``did_to_user_id``).
    """
    author_did = parse_at_uri(uri)[0]
    return did_to_user_id(author_did)
+143
View File
@@ -0,0 +1,143 @@
"""Transform Bluesky Jetstream messages into rows matching the Posts table schema."""
from __future__ import annotations
import json
import logging
from datetime import datetime
from python.data_science.bluesky_ids import (
did_to_user_id,
post_id_from_uri,
tid_to_integer,
user_id_from_uri,
)
logger = logging.getLogger(__name__)
# Value written to the "instance" column of every transformed row.
INSTANCE = "bsky"
# Collection name that identifies post records (used for event filtering and quote detection).
POST_COLLECTION = "app.bsky.feed.post"
# Embed "$type" whose "record" field is the quoted record reference.
EMBED_RECORD_TYPE = "app.bsky.embed.record"
# Embed "$type" whose quoted record reference is nested one level deeper, at record.record.
EMBED_RECORD_WITH_MEDIA_TYPE = "app.bsky.embed.recordWithMedia"
def transform_jetstream_post(message: dict) -> dict:
    """Build a Posts-table row dict from a Jetstream post-creation commit.

    The message is expected to have kind=commit, operation=create and
    collection=app.bsky.feed.post; this function does not re-check that.

    Args:
        message: The full Jetstream JSON message.

    Returns:
        A dict with keys matching the Posts table columns. Engagement counters
        start at 0; reply and quote references are filled in when the record
        carries them and are None otherwise.
    """
    did = message["did"]
    commit = message["commit"]
    record = commit["record"]

    content_fields = {
        "post_id": tid_to_integer(commit["rkey"]),
        "user_id": did_to_user_id(did),
        "instance": INSTANCE,
        "date": datetime.fromisoformat(record["createdAt"]),
        "text": record.get("text", ""),
        "langs": _extract_langs(record),
    }
    counter_fields = dict.fromkeys(("like_count", "reply_count", "repost_count"), 0)
    # Reference columns default to None; the reply/quote extractors below overwrite them.
    reference_fields = dict.fromkeys(
        (
            "reply_to",
            "replied_author",
            "thread_root",
            "thread_root_author",
            "repost_from",
            "reposted_author",
            "quotes",
            "quoted_author",
        ),
        None,
    )
    row: dict = {
        **content_fields,
        **counter_fields,
        **reference_fields,
        "labels": _extract_labels(record),
        "sent_label": None,
        "sent_score": None,
    }

    _extract_reply_refs(record, row)
    _extract_quote_refs(record, row)
    return row
def is_post_create(message: dict) -> bool:
    """Tell whether a Jetstream message is a post creation event.

    Args:
        message: The full Jetstream JSON message.

    Returns:
        True only for kind=commit messages whose commit has operation=create
        and collection equal to POST_COLLECTION.
    """
    if message.get("kind") == "commit":
        commit = message.get("commit", {})
        return commit.get("operation") == "create" and commit.get("collection") == POST_COLLECTION
    return False
def _extract_langs(record: dict) -> str | None:
"""Extract langs array as a JSON string, or None if absent."""
langs = record.get("langs")
if langs is None:
return None
return json.dumps(langs)
def _extract_labels(record: dict) -> str | None:
"""Extract self-labels as a JSON string, or None if absent."""
labels_obj = record.get("labels")
if labels_obj is None:
return None
values = labels_obj.get("values", [])
if not values:
return None
label_strings = [label.get("val", "") for label in values]
return json.dumps(label_strings)
def _extract_reply_refs(record: dict, row: dict) -> None:
"""Populate reply_to, replied_author, thread_root, thread_root_author from record.reply."""
reply = record.get("reply")
if reply is None:
return
parent = reply.get("parent", {})
parent_uri = parent.get("uri")
if parent_uri:
row["reply_to"] = post_id_from_uri(parent_uri)
row["replied_author"] = user_id_from_uri(parent_uri)
root = reply.get("root", {})
root_uri = root.get("uri")
if root_uri:
row["thread_root"] = post_id_from_uri(root_uri)
row["thread_root_author"] = user_id_from_uri(root_uri)
def _extract_quote_refs(record: dict, row: dict) -> None:
"""Populate quotes and quoted_author from embed record references."""
embed = record.get("embed")
if embed is None:
return
embed_type = embed.get("$type", "")
if embed_type == EMBED_RECORD_TYPE:
_set_quote_from_record(embed.get("record", {}), row)
elif embed_type == EMBED_RECORD_WITH_MEDIA_TYPE:
inner_record = embed.get("record", {}).get("record", {})
_set_quote_from_record(inner_record, row)
def _set_quote_from_record(record_ref: dict, row: dict) -> None:
"""Set quotes and quoted_author from a record reference object."""
uri = record_ref.get("uri")
if uri and POST_COLLECTION in uri:
row["quotes"] = post_id_from_uri(uri)
row["quoted_author"] = user_id_from_uri(uri)
+203
View File
@@ -0,0 +1,203 @@
"""Kafka consumer that ingests Bluesky posts into the partitioned Posts table.
Consumes Jetstream messages from Kafka, transforms them into Posts rows,
and batch-inserts them into PostgreSQL with manual offset commits.
Usage:
firehose-consumer
firehose-consumer --kafka-servers kafka:9092 --batch-size 500
"""
from __future__ import annotations
import json
import logging
import signal
from os import getenv
from threading import Event
from typing import Annotated
import typer
from confluent_kafka import Consumer, KafkaError, KafkaException
from sqlalchemy.orm import Session
from python.data_science.bluesky_transform import is_post_create, transform_jetstream_post
from python.data_science.ingest_posts import ingest_batch
from python.orm.common import get_postgres_engine
from python.orm.data_science_dev.posts.failed_ingestion import FailedIngestion
logger = logging.getLogger(__name__)
# Fallbacks used by main() when neither the CLI option nor the environment variable is set.
DEFAULT_TOPIC = "bluesky.firehose.posts"
DEFAULT_KAFKA_SERVERS = "localhost:9092"
DEFAULT_GROUP_ID = "bluesky-posts-ingestor"
# Default for the --batch-size option: maximum messages requested per consume() call.
DEFAULT_BATCH_SIZE = 500
# Timeout passed to consumer.consume() for each batch.
POLL_TIMEOUT_SECONDS = 5.0
# Set by the signal handler; the main loop exits once it is set.
shutdown_event = Event()
app = typer.Typer(help="Consume Bluesky posts from Kafka and ingest into PostgreSQL.")
@app.command()
def main(
    kafka_servers: Annotated[str, typer.Option(help="Kafka bootstrap servers")] = "",
    topic: Annotated[str, typer.Option(help="Kafka topic to consume from")] = "",
    group_id: Annotated[str, typer.Option(help="Kafka consumer group ID")] = "",
    batch_size: Annotated[int, typer.Option(help="Messages per DB insert batch")] = DEFAULT_BATCH_SIZE,
) -> None:
    """Consume Bluesky posts from Kafka and ingest into the partitioned posts table."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    # Precedence for each setting: CLI option, then environment variable, then module default.
    servers = kafka_servers if kafka_servers else getenv("KAFKA_BOOTSTRAP_SERVERS", DEFAULT_KAFKA_SERVERS)
    topic_name = topic if topic else getenv("BLUESKY_FIREHOSE_TOPIC", DEFAULT_TOPIC)
    group = group_id if group_id else getenv("KAFKA_GROUP_ID", DEFAULT_GROUP_ID)

    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, _handle_shutdown)

    consumer = _create_consumer(servers, group)
    consumer.subscribe([topic_name])
    engine = get_postgres_engine(name="DATA_SCIENCE_DEV")

    total_inserted = 0
    logger.info("Starting firehose consumer: topic=%s group=%s batch_size=%d", topic_name, group, batch_size)
    try:
        with Session(engine) as session:
            while True:
                if shutdown_event.is_set():
                    break
                batch_inserted = _consume_batch(consumer, session, batch_size)
                total_inserted += batch_inserted
                if batch_inserted > 0:
                    logger.info("Batch inserted %d rows (total: %d)", batch_inserted, total_inserted)
    except KafkaException:
        logger.exception("Fatal Kafka error")
    finally:
        # Runs on normal shutdown and on any error, so the consumer is always closed.
        logger.info("Closing consumer (total inserted: %d)", total_inserted)
        consumer.close()
def _consume_batch(consumer: Consumer, session: Session, batch_size: int) -> int:
    """Poll a batch of messages, transform, and insert into the database.

    Offsets are committed synchronously after the rows have been handed to
    ``ingest_batch``, and only when at least one real (non-error) message was
    consumed. A batch made up solely of error events has advanced no offsets,
    so no commit is attempted for it.

    Args:
        consumer: The Kafka consumer instance.
        session: SQLAlchemy database session.
        batch_size: Maximum number of messages to consume per batch.

    Returns:
        Number of rows successfully inserted.
    """
    messages = consumer.consume(num_messages=batch_size, timeout=POLL_TIMEOUT_SECONDS)
    if not messages:
        return 0

    rows: list[dict] = []
    consumed_any = False
    for message in messages:
        error = message.error()
        if error is not None:
            if error.code() != KafkaError._PARTITION_EOF:  # noqa: SLF001 — confluent-kafka exposes this as a pseudo-private constant; no public alternative exists
                logger.error("Consumer error: %s", error)
            continue
        consumed_any = True
        row = _safe_transform(message.value(), session)
        if row is not None:
            rows.append(row)

    inserted = ingest_batch(session, rows) if rows else 0
    if consumed_any:
        # Commit only after the insert so a crash re-delivers rather than loses messages.
        consumer.commit(asynchronous=False)
    return inserted
def _safe_transform(raw_value: bytes | None, session: Session) -> dict | None:
"""Transform a Kafka message value into a Posts row, logging failures.
Args:
raw_value: Raw message bytes from Kafka.
session: SQLAlchemy session for logging failures.
Returns:
A transformed row dict, or None if transformation failed.
"""
if raw_value is None:
return None
try:
message = json.loads(raw_value)
except (json.JSONDecodeError, UnicodeDecodeError):
logger.exception("Failed to decode Kafka message")
_log_failed_ingestion(session, raw_value, "JSON decode error")
return None
if not is_post_create(message):
return None
try:
return transform_jetstream_post(message)
except (KeyError, ValueError, TypeError):
logger.exception("Failed to transform Jetstream message")
_log_failed_ingestion(session, raw_value, "Transform error")
return None
def _log_failed_ingestion(session: Session, raw_value: bytes, error: str) -> None:
    """Persist a FailedIngestion row describing a message that could not be ingested.

    Best effort: if writing the row fails, the session is rolled back and the
    failure is logged rather than raised.

    Args:
        session: SQLAlchemy session.
        raw_value: The raw message bytes (decoded with replacement, truncated to 10000 characters).
        error: Description of the error.
    """
    try:
        failure = FailedIngestion(
            raw_line=raw_value.decode(errors="replace")[:10000],
            error=error,
        )
        session.add(failure)
        session.commit()
    except Exception:
        session.rollback()
        logger.exception("Failed to log ingestion failure")
def _create_consumer(servers: str, group: str) -> Consumer:
    """Build the Kafka consumer used by the ingestor.

    Args:
        servers: Kafka bootstrap servers string.
        group: Consumer group ID.

    Returns:
        A configured confluent_kafka.Consumer.
    """
    connection_settings = {
        "bootstrap.servers": servers,
        "group.id": group,
    }
    behaviour_settings = {
        "auto.offset.reset": "earliest",
        # Offsets are committed explicitly by the batch loop, never automatically.
        "enable.auto.commit": False,
        "max.poll.interval.ms": 300000,
        "fetch.min.bytes": 1024,
        "session.timeout.ms": 30000,
    }
    return Consumer({**connection_settings, **behaviour_settings})
def _handle_shutdown(_signum: int, _frame: object) -> None:
    """Handle SIGTERM/SIGINT by flagging the consume loop to stop; both arguments are ignored."""
    logger.info("Shutdown signal received")
    # The main loop checks this event between batches.
    shutdown_event.set()
if __name__ == "__main__":
app()
+230
View File
@@ -0,0 +1,230 @@
"""Bluesky Jetstream firehose to Kafka producer.
Connects to the Bluesky Jetstream WebSocket API with zstd compression,
filters for post creation events, and produces them to a Kafka topic.
Usage:
firehose-producer
firehose-producer --kafka-servers kafka:9092 --topic bluesky.firehose.posts
"""
from __future__ import annotations
import json
import logging
import signal
from os import getenv
from threading import Event
from typing import Annotated
import typer
from compression import zstd
from confluent_kafka import KafkaError, KafkaException, Producer
from websockets.exceptions import ConnectionClosed
from websockets.sync.client import connect
logger = logging.getLogger(__name__)
# Base WebSocket endpoint; query parameters are appended by _build_jetstream_url.
JETSTREAM_URL = "wss://jetstream2.us-east.bsky.network/subscribe"
# Fallbacks used by main() when neither the CLI option nor the environment variable is set.
DEFAULT_TOPIC = "bluesky.firehose.posts"
DEFAULT_KAFKA_SERVERS = "localhost:9092"
# Number of produced messages between producer.poll(0) calls in the stream loop.
POLL_INTERVAL = 100
# Collection name used both as the default subscription and as the post-create filter.
POST_COLLECTION = "app.bsky.feed.post"
# Set by the signal handler; the streaming and reconnect loops stop once it is set.
shutdown_event = Event()
app = typer.Typer(help="Stream Bluesky firehose posts into Kafka.")
@app.command()
def main(
    kafka_servers: Annotated[str, typer.Option(help="Kafka bootstrap servers")] = "",
    topic: Annotated[str, typer.Option(help="Kafka topic to produce to")] = "",
    collections: Annotated[str, typer.Option(help="Comma-separated collections to subscribe to")] = POST_COLLECTION,
) -> None:
    """Connect to Bluesky Jetstream and produce post events to Kafka."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    # Precedence for each setting: CLI option, then environment variable, then module default.
    servers = kafka_servers if kafka_servers else getenv("KAFKA_BOOTSTRAP_SERVERS", DEFAULT_KAFKA_SERVERS)
    topic_name = topic if topic else getenv("BLUESKY_FIREHOSE_TOPIC", DEFAULT_TOPIC)

    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, _handle_shutdown)

    producer = _create_producer(servers)
    cursor: int | None = None
    logger.info("Starting firehose producer → %s on %s", topic_name, servers)

    while True:
        if shutdown_event.is_set():
            break
        try:
            cursor = _stream_loop(producer, topic_name, collections, cursor)
        except (ConnectionClosed, OSError):
            logger.exception("WebSocket disconnected, reconnecting")
        except KafkaException:
            logger.exception("Kafka error, reconnecting")

        if shutdown_event.is_set():
            continue
        logger.info("Reconnecting in 5 seconds (cursor=%s)", cursor)
        # Waiting on the event (not sleeping) lets a shutdown signal cut the delay short.
        shutdown_event.wait(timeout=5)

    logger.info("Shutting down, flushing producer")
    producer.flush(timeout=30)
    logger.info("Producer shutdown complete")
def _stream_loop(
    producer: Producer,
    topic: str,
    collections: str,
    cursor: int | None,
) -> int | None:
    """Connect to Jetstream and stream messages to Kafka until disconnected.

    A disconnect while receiving ends the loop and *returns* the last cursor
    instead of raising, so the caller reconnects from where the stream stopped
    rather than from the cursor it passed in. Errors raised while connecting
    still propagate (no progress has been made at that point).

    Frames that are not valid JSON objects are logged and skipped.

    Args:
        producer: The Kafka producer instance.
        topic: Kafka topic name.
        collections: Comma-separated AT Protocol collections to subscribe to.
        cursor: Optional microsecond timestamp to resume from.

    Returns:
        The last processed time_us cursor value (``cursor`` unchanged if no
        message carrying time_us was received).
    """
    url = _build_jetstream_url(collections, cursor)
    logger.info("Connecting to %s", url)
    message_count = 0
    last_cursor = cursor

    with connect(url, additional_headers={"Accept-Encoding": "zstd"}) as websocket:
        logger.info("Connected to Jetstream")
        while not shutdown_event.is_set():
            try:
                raw_frame = websocket.recv(timeout=10)
            except TimeoutError:
                # Idle stream: still service delivery callbacks. Must precede the
                # OSError clause below because TimeoutError is an OSError subclass.
                producer.poll(0)
                continue
            except (ConnectionClosed, OSError):
                logger.exception("WebSocket disconnected (cursor=%s)", last_cursor)
                break

            try:
                text = _decode_frame(raw_frame)
                message = json.loads(text)
            except (json.JSONDecodeError, UnicodeDecodeError):
                logger.exception("Skipping undecodable Jetstream frame")
                continue
            if not isinstance(message, dict):
                logger.warning("Skipping non-object Jetstream frame")
                continue

            time_us = message.get("time_us")
            if time_us is not None:
                last_cursor = time_us

            if not _is_post_create(message):
                continue

            did = message.get("did", "")
            _produce_with_retry(producer, topic, did.encode(), text.encode())

            message_count += 1
            if message_count % POLL_INTERVAL == 0:
                producer.poll(0)
            if message_count % 10000 == 0:
                logger.info("Produced %d messages (cursor=%s)", message_count, last_cursor)

    return last_cursor


def _produce_with_retry(producer: Producer, topic: str, key: bytes, value: bytes) -> None:
    """Produce one message, flushing and retrying once if the local queue is full."""
    try:
        producer.produce(topic, key=key, value=value, callback=_delivery_callback)
    except BufferError:
        logger.warning("Producer buffer full, flushing")
        producer.flush(timeout=10)
        producer.produce(topic, key=key, value=value, callback=_delivery_callback)
def _build_jetstream_url(collections: str, cursor: int | None) -> str:
    """Assemble the Jetstream subscribe URL and its query string.

    Args:
        collections: Comma-separated collection names; blank entries are dropped.
        cursor: Optional microsecond timestamp for resumption.

    Returns:
        The full WebSocket URL: ``compress=true``, one ``wantedCollections``
        parameter per collection, and ``cursor`` last when given.
    """
    names = [entry.strip() for entry in collections.split(",")]
    query = ["compress=true", *(f"wantedCollections={name}" for name in names if name)]
    if cursor is not None:
        query.append(f"cursor={cursor}")
    return f"{JETSTREAM_URL}?{'&'.join(query)}"
def _decode_frame(frame: str | bytes) -> str:
"""Decode a WebSocket frame, decompressing zstd if binary.
Jetstream with compress=true sends zstd-compressed binary frames.
Args:
frame: Raw WebSocket frame data.
Returns:
The decoded JSON string.
"""
if isinstance(frame, bytes):
return zstd.decompress(frame).decode()
return frame
def _is_post_create(message: dict) -> bool:
"""Check if a Jetstream message is a post creation commit."""
if message.get("kind") != "commit":
return False
commit = message.get("commit", {})
return commit.get("operation") == "create" and commit.get("collection") == POST_COLLECTION
def _create_producer(servers: str) -> Producer:
    """Build the Kafka producer used for firehose events.

    Args:
        servers: Kafka bootstrap servers string.

    Returns:
        A configured confluent_kafka.Producer.
    """
    batching_settings = {
        "linger.ms": 50,
        "batch.size": 65536,
        "compression.type": "zstd",
    }
    delivery_settings = {
        "acks": "all",
        "retries": 5,
        "retry.backoff.ms": 500,
    }
    return Producer({"bootstrap.servers": servers, **batching_settings, **delivery_settings})
def _delivery_callback(error: KafkaError | None, _message: object) -> None:
"""Log delivery failures from the Kafka producer."""
if error is not None:
logger.error("Kafka delivery failed: %s", error)
def _handle_shutdown(_signum: int, _frame: object) -> None:
    """Handle SIGTERM/SIGINT by flagging the producer loops to stop; both arguments are ignored."""
    logger.info("Shutdown signal received")
    # Both the streaming loop and the reconnect wait observe this event.
    shutdown_event.set()
if __name__ == "__main__":
app()
-613
View File
@@ -1,613 +0,0 @@
"""Ingestion pipeline for loading congress data from unitedstates/congress JSON files.
Loads legislators, bills, votes, vote records, and bill text into the data_science_dev database.
Expects the parent directory to contain congress-tracker/ and congress-legislators/ as siblings.
Usage:
ingest-congress /path/to/parent/
ingest-congress /path/to/parent/ --congress 118
ingest-congress /path/to/parent/ --congress 118 --only bills
"""
from __future__ import annotations
import logging
from pathlib import Path # noqa: TC003 needed at runtime for typer CLI argument
from typing import TYPE_CHECKING, Annotated
import orjson
import typer
import yaml
from sqlalchemy import select
from sqlalchemy.orm import Session
from python.common import configure_logger
from python.orm.common import get_postgres_engine
from python.orm.data_science_dev.congress import Bill, BillText, Legislator, LegislatorSocialMedia, Vote, VoteRecord
if TYPE_CHECKING:
from collections.abc import Iterator
from sqlalchemy.engine import Engine
logger = logging.getLogger(__name__)
BATCH_SIZE = 10_000
app = typer.Typer(help="Ingest unitedstates/congress data into data_science_dev.")
@app.command()
def main(
    parent_dir: Annotated[
        Path,
        typer.Argument(help="Parent directory containing congress-tracker/ and congress-legislators/"),
    ],
    congress: Annotated[int | None, typer.Option(help="Only ingest a specific congress number")] = None,
    only: Annotated[
        str | None,
        typer.Option(
            help="Only run a specific step: legislators, legislators-social-media, bills, votes, bill-text",
        ),
    ] = None,
) -> None:
    """Ingest congress data from unitedstates/congress JSON files.

    Validates the expected directory layout under ``parent_dir``, resolves the
    congress directories to process, then runs every ingestion step in order,
    or just the one named by ``only``.

    Exits with code 1 when a required directory is missing, no congress
    directories are found, or ``only`` names an unknown step.
    """
    configure_logger(level="INFO")

    data_dir = parent_dir / "congress-tracker/congress/data/"
    legislators_dir = parent_dir / "congress-legislators"

    if not data_dir.is_dir():
        typer.echo(f"Expected congress-tracker/ directory: {data_dir}", err=True)
        raise typer.Exit(code=1)

    if not legislators_dir.is_dir():
        typer.echo(f"Expected congress-legislators/ directory: {legislators_dir}", err=True)
        raise typer.Exit(code=1)

    engine = get_postgres_engine(name="DATA_SCIENCE_DEV")

    congress_dirs = _resolve_congress_dirs(data_dir, congress)
    if not congress_dirs:
        typer.echo("No congress directories found.", err=True)
        raise typer.Exit(code=1)

    logger.info("Found %d congress directories to process", len(congress_dirs))

    steps: dict[str, tuple] = {
        "legislators": (ingest_legislators, (engine, legislators_dir)),
        "legislators-social-media": (ingest_social_media, (engine, legislators_dir)),
        "bills": (ingest_bills, (engine, congress_dirs)),
        "votes": (ingest_votes, (engine, congress_dirs)),
        "bill-text": (ingest_bill_text, (engine, congress_dirs)),
    }
    # The help text used to advertise "social-media"; keep accepting that spelling.
    step_aliases = {"social-media": "legislators-social-media"}

    if only:
        only = step_aliases.get(only, only)
        if only not in steps:
            typer.echo(f"Unknown step: {only}. Choose from: {', '.join(steps)}", err=True)
            raise typer.Exit(code=1)
        steps = {only: steps[only]}

    for step_name, (step_func, step_args) in steps.items():
        logger.info("=== Starting step: %s ===", step_name)
        step_func(*step_args)
        logger.info("=== Finished step: %s ===", step_name)

    logger.info("ingest-congress done")
def _resolve_congress_dirs(data_dir: Path, congress: int | None) -> list[Path]:
"""Find congress number directories under data_dir."""
if congress is not None:
target = data_dir / str(congress)
return [target] if target.is_dir() else []
return sorted(path for path in data_dir.iterdir() if path.is_dir() and path.name.isdigit())
def _flush_batch(session: Session, batch: list[object], label: str) -> int:
"""Add a batch of ORM objects to the session and commit. Returns count added."""
if not batch:
return 0
session.add_all(batch)
session.commit()
count = len(batch)
logger.info("Committed %d %s", count, label)
batch.clear()
return count
# ---------------------------------------------------------------------------
# Legislators — loaded from congress-legislators YAML files
# ---------------------------------------------------------------------------
def ingest_legislators(engine: Engine, legislators_dir: Path) -> None:
    """Load legislators from congress-legislators YAML files.

    Entries are matched to rows by bioguide ID. A known legislator has every
    non-None parsed field that differs written back; an unknown one is inserted.
    All changes are committed in a single transaction after the loop.

    Args:
        engine: Engine used to open the session.
        legislators_dir: Directory containing the legislators YAML files.
    """
    legislators_data = _load_legislators_yaml(legislators_dir)
    logger.info("Loaded %d legislators from YAML files", len(legislators_data))
    with Session(engine) as session:
        known_legislators = {
            legislator.bioguide_id: legislator for legislator in session.scalars(select(Legislator)).all()
        }
        logger.info("Found %d existing legislators in DB", len(known_legislators))
        total_inserted = 0
        total_updated = 0
        for entry in legislators_data:
            bioguide_id = entry.get("id", {}).get("bioguide")
            if not bioguide_id:
                continue
            fields = _parse_legislator(entry)
            existing = known_legislators.get(bioguide_id)
            if existing is None:
                legislator = Legislator(bioguide_id=bioguide_id, **fields)
                session.add(legislator)
                # Track the pending row: if the same bioguide ID shows up again in the
                # input it is updated in place instead of being inserted twice, which
                # would violate the unique bioguide_id column at commit time.
                known_legislators[bioguide_id] = legislator
                total_inserted += 1
                continue
            changed = False
            for field, value in fields.items():
                # None means "not present in the source"; never overwrite with it.
                if value is not None and getattr(existing, field) != value:
                    setattr(existing, field, value)
                    changed = True
            if changed:
                total_updated += 1
        session.commit()
        logger.info("Inserted %d new legislators, updated %d existing", total_inserted, total_updated)
def _load_legislators_yaml(legislators_dir: Path) -> list[dict]:
    """Read the current and historical legislators YAML files into one list.

    A missing file is logged as a warning and skipped. A file whose top-level
    YAML value is not a list contributes nothing.
    """
    combined: list[dict] = []
    yaml_paths = [legislators_dir / name for name in ("legislators-current.yaml", "legislators-historical.yaml")]
    for yaml_path in yaml_paths:
        if yaml_path.exists():
            with yaml_path.open() as handle:
                parsed = yaml.safe_load(handle)
            if isinstance(parsed, list):
                combined += parsed
        else:
            logger.warning("Legislators file not found: %s", yaml_path)
    return combined
def _parse_legislator(entry: dict) -> dict:
"""Extract Legislator fields from a congress-legislators YAML entry."""
ids = entry.get("id", {})
name = entry.get("name", {})
bio = entry.get("bio", {})
terms = entry.get("terms", [])
latest_term = terms[-1] if terms else {}
fec_ids = ids.get("fec")
fec_ids_joined = ",".join(fec_ids) if isinstance(fec_ids, list) else fec_ids
chamber = latest_term.get("type")
chamber_normalized = {"rep": "House", "sen": "Senate"}.get(chamber, chamber)
return {
"thomas_id": ids.get("thomas"),
"lis_id": ids.get("lis"),
"govtrack_id": ids.get("govtrack"),
"opensecrets_id": ids.get("opensecrets"),
"fec_ids": fec_ids_joined,
"first_name": name.get("first"),
"last_name": name.get("last"),
"official_full_name": name.get("official_full"),
"nickname": name.get("nickname"),
"birthday": bio.get("birthday"),
"gender": bio.get("gender"),
"current_party": latest_term.get("party"),
"current_state": latest_term.get("state"),
"current_district": latest_term.get("district"),
"current_chamber": chamber_normalized,
}
# ---------------------------------------------------------------------------
# Social Media — loaded from legislators-social-media.yaml
# ---------------------------------------------------------------------------
# Platform keys looked up in each entry's "social" mapping, paired with the template
# used to build a profile URL from the account name. A None template (mastodon) means
# no URL is derived and the account's url is stored as None.
SOCIAL_MEDIA_PLATFORMS = {
    "twitter": "https://twitter.com/{account}",
    "facebook": "https://facebook.com/{account}",
    "youtube": "https://youtube.com/{account}",
    "instagram": "https://instagram.com/{account}",
    "mastodon": None,
}
def ingest_social_media(engine: Engine, legislators_dir: Path) -> None:
    """Load social media accounts from legislators-social-media.yaml.

    Accounts are keyed by (legislator_id, platform). Unknown keys are inserted.
    Known keys have their account name (and derived URL, when one can be built)
    refreshed, and are counted as updated only when something actually changed.
    Entries whose bioguide ID is missing or not in the legislator table are skipped.

    Args:
        engine: Engine used to open the session.
        legislators_dir: Directory containing legislators-social-media.yaml.
    """
    social_media_path = legislators_dir / "legislators-social-media.yaml"
    if not social_media_path.exists():
        logger.warning("Social media file not found: %s", social_media_path)
        return
    with social_media_path.open() as file:
        social_media_data = yaml.safe_load(file)
    if not isinstance(social_media_data, list):
        logger.warning("Unexpected format in %s", social_media_path)
        return
    logger.info("Loaded %d entries from legislators-social-media.yaml", len(social_media_data))
    with Session(engine) as session:
        legislator_map = _build_legislator_map(session)
        # Keep the ORM objects (not just the keys) so existing rows can be updated.
        existing_accounts = {
            (account.legislator_id, account.platform): account
            for account in session.scalars(select(LegislatorSocialMedia)).all()
        }
        logger.info("Found %d existing social media accounts in DB", len(existing_accounts))
        total_inserted = 0
        total_updated = 0
        for entry in social_media_data:
            bioguide_id = entry.get("id", {}).get("bioguide")
            if not bioguide_id:
                continue
            legislator_id = legislator_map.get(bioguide_id)
            if legislator_id is None:
                continue
            social = entry.get("social") or {}
            for platform, url_template in SOCIAL_MEDIA_PLATFORMS.items():
                raw_account_name = social.get(platform)
                if not raw_account_name:
                    continue
                account_name = str(raw_account_name)
                url = url_template.format(account=account_name) if url_template else None
                key = (legislator_id, platform)
                existing = existing_accounts.get(key)
                if existing is None:
                    account = LegislatorSocialMedia(
                        legislator_id=legislator_id,
                        platform=platform,
                        account_name=account_name,
                        url=url,
                        source="https://github.com/unitedstates/congress-legislators",
                    )
                    session.add(account)
                    existing_accounts[key] = account
                    total_inserted += 1
                    continue
                changed = False
                if existing.account_name != account_name:
                    existing.account_name = account_name
                    changed = True
                # As with legislator fields, a None value never overwrites stored data.
                if url is not None and existing.url != url:
                    existing.url = url
                    changed = True
                if changed:
                    total_updated += 1
        session.commit()
        logger.info("Inserted %d new social media accounts, updated %d existing", total_inserted, total_updated)
def _iter_voters(position_group: object) -> Iterator[dict]:
"""Yield voter dicts from a vote position group (handles list, single dict, or string)."""
if isinstance(position_group, dict):
yield position_group
elif isinstance(position_group, list):
for voter in position_group:
if isinstance(voter, dict):
yield voter
# ---------------------------------------------------------------------------
# Bills
# ---------------------------------------------------------------------------
def ingest_bills(engine: Engine, congress_dirs: list[Path]) -> None:
    """Insert bills found in each congress directory's bills/**/data.json files.

    Bills whose (congress, bill_type, number) key is already in the database are
    skipped by the parser. New bills are committed in batches of BATCH_SIZE.
    """
    with Session(engine) as session:
        known_keys = set()
        for stored in session.scalars(select(Bill)).all():
            known_keys.add((stored.congress, stored.bill_type, stored.number))
        logger.info("Found %d existing bills in DB", len(known_keys))
        inserted = 0
        pending: list[Bill] = []
        for congress_dir in congress_dirs:
            bills_dir = congress_dir / "bills"
            if bills_dir.is_dir():
                logger.info("Scanning bills from %s", congress_dir.name)
                for json_path in bills_dir.rglob("data.json"):
                    payload = _read_json(json_path)
                    parsed = None if payload is None else _parse_bill(payload, known_keys)
                    if parsed is None:
                        continue
                    pending.append(parsed)
                    if len(pending) >= BATCH_SIZE:
                        inserted += _flush_batch(session, pending, "bills")
        # Flush whatever is left over after the last full batch.
        inserted += _flush_batch(session, pending, "bills")
        logger.info("Inserted %d new bills total", inserted)
def _parse_bill(data: dict, existing_bills: set[tuple[int, str, int]]) -> Bill | None:
    """Parse a bill data.json dict into a Bill ORM object, skipping existing.

    Args:
        data: Parsed bill JSON.
        existing_bills: Keys (congress, bill_type, number) that must not be re-created.

    Returns:
        A new Bill, or None when a key field is missing or the bill already exists.
    """
    raw_congress = data.get("congress")
    bill_type = data.get("bill_type")
    raw_number = data.get("number")
    if raw_congress is None or bill_type is None or raw_number is None:
        return None
    congress = int(raw_congress)
    number = int(raw_number)
    if (congress, bill_type, number) in existing_bills:
        return None
    sponsor_bioguide = None
    sponsor = data.get("sponsor")
    if sponsor:
        sponsor_bioguide = sponsor.get("bioguide_id")
    # Keep only the YYYY-MM-DD prefix of a string value, the same way _parse_vote
    # treats vote_date, so a timestamp-style string still fits the date column.
    status_at = data.get("status_at")
    if isinstance(status_at, str):
        status_at = status_at[:10]
    return Bill(
        congress=congress,
        bill_type=bill_type,
        number=number,
        title=data.get("short_title") or data.get("official_title"),
        title_short=data.get("short_title"),
        official_title=data.get("official_title"),
        status=data.get("status"),
        status_at=status_at,
        sponsor_bioguide_id=sponsor_bioguide,
        subjects_top_term=data.get("subjects_top_term"),
    )
# ---------------------------------------------------------------------------
# Votes (and vote records)
# ---------------------------------------------------------------------------
def ingest_votes(engine: Engine, congress_dirs: list[Path]) -> None:
    """Insert roll call votes (with their vote records) from votes/**/data.json.

    Votes whose (congress, chamber, session, number) key already exists are skipped
    by the parser. New votes are committed in batches of BATCH_SIZE.
    """
    with Session(engine) as session:
        legislator_map = _build_legislator_map(session)
        logger.info("Loaded %d legislators into lookup map", len(legislator_map))
        bill_map = _build_bill_map(session)
        logger.info("Loaded %d bills into lookup map", len(bill_map))
        known_keys = set()
        for stored in session.scalars(select(Vote)).all():
            known_keys.add((stored.congress, stored.chamber, stored.session, stored.number))
        logger.info("Found %d existing votes in DB", len(known_keys))
        inserted = 0
        pending: list[Vote] = []
        for congress_dir in congress_dirs:
            votes_dir = congress_dir / "votes"
            if votes_dir.is_dir():
                logger.info("Scanning votes from %s", congress_dir.name)
                for json_path in votes_dir.rglob("data.json"):
                    payload = _read_json(json_path)
                    if payload is None:
                        continue
                    parsed = _parse_vote(payload, legislator_map, bill_map, known_keys)
                    if parsed is None:
                        continue
                    pending.append(parsed)
                    if len(pending) >= BATCH_SIZE:
                        inserted += _flush_batch(session, pending, "votes")
        # Flush whatever is left over after the last full batch.
        inserted += _flush_batch(session, pending, "votes")
        logger.info("Inserted %d new votes total", inserted)
def _build_legislator_map(session: Session) -> dict[str, int]:
    """Return a lookup from each stored legislator's bioguide_id to its row id."""
    lookup: dict[str, int] = {}
    for legislator in session.scalars(select(Legislator)).all():
        lookup[legislator.bioguide_id] = legislator.id
    return lookup
def _build_bill_map(session: Session) -> dict[tuple[int, str, int], int]:
    """Return a lookup from (congress, bill_type, number) to the stored bill's row id."""
    lookup: dict[tuple[int, str, int], int] = {}
    for bill in session.scalars(select(Bill)).all():
        lookup[(bill.congress, bill.bill_type, bill.number)] = bill.id
    return lookup
def _parse_vote(
    data: dict,
    legislator_map: dict[str, int],
    bill_map: dict[tuple[int, str, int], int],
    existing_votes: set[tuple[int, str, int, int]],
) -> Vote | None:
    """Build a Vote (with its VoteRecords) from a vote data.json dict.

    Returns None when congress, chamber, number, date or session is missing, or
    when the (congress, chamber, session, number) key is already in existing_votes.
    The linked bill is resolved through bill_map and left as None when not found.
    """
    required = [data.get(key) for key in ("congress", "chamber", "number", "date", "session")]
    if any(value is None for value in required):
        return None
    raw_congress, chamber, raw_number, vote_date, raw_session = required

    congress = int(raw_congress)
    number = int(raw_number)
    session_number = int(raw_session)

    # Source files use "h"/"s"; any other value is stored unchanged.
    chamber_names = {"h": "House", "s": "Senate"}
    chamber_normalized = chamber_names.get(chamber, chamber)
    if (congress, chamber_normalized, session_number, number) in existing_votes:
        return None

    bill_ref = data.get("bill")
    if bill_ref:
        ref_congress = int(bill_ref.get("congress", congress))
        ref_type = bill_ref.get("type")
        ref_number = int(bill_ref.get("number", 0))
        bill_id = bill_map.get((ref_congress, ref_type, ref_number))
    else:
        bill_id = None

    raw_votes = data.get("votes", {})
    vote_counts = _count_votes(raw_votes)
    vote_records = _build_vote_records(raw_votes, legislator_map)

    # A string date keeps only its YYYY-MM-DD prefix; other types pass through.
    if isinstance(vote_date, str):
        vote_date = vote_date[:10]

    return Vote(
        congress=congress,
        chamber=chamber_normalized,
        session=session_number,
        number=number,
        vote_type=data.get("type"),
        question=data.get("question"),
        result=data.get("result"),
        result_text=data.get("result_text"),
        vote_date=vote_date,
        bill_id=bill_id,
        vote_records=vote_records,
        **vote_counts,
    )
def _count_votes(raw_votes: dict) -> dict[str, int]:
    """Tally voters per position category.

    "Yea"/"Aye" feed yea_count, "Nay"/"No" feed nay_count, "Not Voting" feeds
    not_voting_count and "Present" feeds present_count. Any other position label
    is ignored. Only dict voters (as yielded by _iter_voters) are counted.
    """
    bucket_for_position = {
        "Yea": "yea_count",
        "Aye": "yea_count",
        "Nay": "nay_count",
        "No": "nay_count",
        "Not Voting": "not_voting_count",
        "Present": "present_count",
    }
    totals = dict.fromkeys(("yea_count", "nay_count", "not_voting_count", "present_count"), 0)
    for position, position_group in raw_votes.items():
        voters = len(list(_iter_voters(position_group)))
        bucket = bucket_for_position.get(position)
        if bucket is not None:
            totals[bucket] += voters
    return totals
def _build_vote_records(raw_votes: dict, legislator_map: dict[str, int]) -> list[VoteRecord]:
    """Create one VoteRecord per voter that resolves to a known legislator.

    Voters without an "id", or whose id is not a key of legislator_map, are dropped.
    NOTE(review): the voter "id" is looked up in a bioguide-keyed map; confirm that
    every chamber's vote files actually use bioguide IDs here.
    """
    records: list[VoteRecord] = []
    for position, position_group in raw_votes.items():
        for voter in _iter_voters(position_group):
            voter_id = voter.get("id")
            legislator_id = legislator_map.get(voter_id) if voter_id else None
            if legislator_id is not None:
                records.append(VoteRecord(legislator_id=legislator_id, position=position))
    return records
# ---------------------------------------------------------------------------
# Bill Text
# ---------------------------------------------------------------------------
def ingest_bill_text(engine: Engine, congress_dirs: list[Path]) -> None:
    """Load bill text from text-versions directories.

    Text versions whose (bill_id, version_code) pair already exists are skipped.
    New versions are committed in batches of BATCH_SIZE.

    Args:
        engine: Engine used to open the session.
        congress_dirs: Congress directories to scan.
    """
    with Session(engine) as session:
        bill_map = _build_bill_map(session)
        logger.info("Loaded %d bills into lookup map", len(bill_map))
        # Select only the two key columns. Loading whole BillText entities would pull
        # every stored text_content into memory just to build this set.
        existing_bill_texts = {
            (bill_id, version_code)
            for bill_id, version_code in session.execute(select(BillText.bill_id, BillText.version_code))
        }
        logger.info("Found %d existing bill text versions in DB", len(existing_bill_texts))
        total_inserted = 0
        batch: list[BillText] = []
        for congress_dir in congress_dirs:
            logger.info("Scanning bill texts from %s", congress_dir.name)
            for bill_text in _iter_bill_texts(congress_dir, bill_map, existing_bill_texts):
                batch.append(bill_text)
                if len(batch) >= BATCH_SIZE:
                    total_inserted += _flush_batch(session, batch, "bill texts")
        # Flush whatever is left over after the last full batch.
        total_inserted += _flush_batch(session, batch, "bill texts")
        logger.info("Inserted %d new bill text versions total", total_inserted)
def _iter_bill_texts(
    congress_dir: Path,
    bill_map: dict[tuple[int, str, int], int],
    existing_bill_texts: set[tuple[int, str]],
) -> Iterator[BillText]:
    """Yield a BillText for every not-yet-stored text version under one congress.

    Walks every "text-versions" directory below <congress_dir>/bills, resolves the
    owning bill from the directory layout, and yields one object per version
    sub-directory (in sorted order) whose (bill_id, version_code) is not in
    existing_bill_texts. Bills that cannot be resolved through bill_map are skipped.
    """
    bills_dir = congress_dir / "bills"
    if not bills_dir.is_dir():
        return
    for versions_root in bills_dir.rglob("text-versions"):
        if not versions_root.is_dir():
            continue
        bill_key = _bill_key_from_dir(versions_root.parent, congress_dir)
        bill_id = None if bill_key is None else bill_map.get(bill_key)
        if bill_id is None:
            continue
        for version_dir in sorted(versions_root.iterdir()):
            version_code = version_dir.name
            if not version_dir.is_dir() or (bill_id, version_code) in existing_bill_texts:
                continue
            text_content = _read_bill_text(version_dir)
            metadata = _read_json(version_dir / "data.json")
            # Missing or empty metadata leaves both descriptive fields as None.
            if metadata:
                version_name = metadata.get("version_name")
                issued_on = metadata.get("issued_on")
            else:
                version_name = None
                issued_on = None
            yield BillText(
                bill_id=bill_id,
                version_code=version_code,
                version_name=version_name,
                date=issued_on,
                text_content=text_content,
            )
def _bill_key_from_dir(bill_dir: Path, congress_dir: Path) -> tuple[int, str, int] | None:
"""Extract (congress, bill_type, number) from directory structure."""
congress = int(congress_dir.name)
bill_type = bill_dir.parent.name
name = bill_dir.name
# Directory name is like "hr3590" — strip the type prefix to get the number
number_str = name[len(bill_type) :]
if not number_str.isdigit():
return None
return (congress, bill_type, int(number_str))
def _read_bill_text(version_dir: Path) -> str | None:
"""Read bill text from a version directory, preferring .txt over .xml."""
for extension in ("txt", "htm", "html", "xml"):
candidates = list(version_dir.glob(f"document.{extension}"))
if not candidates:
candidates = list(version_dir.glob(f"*.{extension}"))
if candidates:
try:
return candidates[0].read_text(encoding="utf-8")
except Exception:
logger.exception("Failed to read %s", candidates[0])
return None
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _read_json(path: Path) -> dict | None:
    """Read and parse a JSON file, returning None on failure.

    Returns:
        The parsed JSON object. None when the file does not exist (silently), when
        reading or parsing fails (logged with traceback), or when the document's
        top-level value is not an object (logged as a warning). Callers use
        ``.get`` on the result, so a list or scalar would crash them.
    """
    try:
        data = orjson.loads(path.read_bytes())
    except FileNotFoundError:
        return None
    except Exception:
        logger.exception("Failed to parse %s", path)
        return None
    if not isinstance(data, dict):
        logger.warning("Expected a JSON object in %s, got %s", path, type(data).__name__)
        return None
    return data
# Run the command-line app only when this file is executed as a script.
if __name__ == "__main__":
    app()
-347
View File
@@ -1,347 +0,0 @@
"""Small Gitea API client for repository automation."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Self
from urllib.parse import quote
import httpx
# Page size requested ("limit") from paginated Gitea list endpoints.
DEFAULT_PAGE_SIZE = 100
# HTTP status codes the client accepts as success for the various calls.
EXPECTED_NO_CONTENT = 204
EXPECTED_CREATED = 201
EXPECTED_OK = 200
@dataclass(frozen=True)
class CreatedIssue:
    """Issue data returned by Gitea.

    Immutable summary of the response to an issue-creation request.
    """

    # Issue number; None when the response carries no "number".
    number: int | None
    # Web URL of the issue; None when the response carries no "html_url".
    html_url: str | None
    # Issue title.
    title: str
@dataclass(frozen=True)
class PullRequest:
    """Pull request data returned by Gitea.

    Immutable summary of one pull request from the API payload.
    """

    # Pull request number.
    number: int
    # Pull request title.
    title: str
    # Web URL of the pull request, when provided.
    html_url: str | None
    # Names of the labels attached to the pull request.
    labels: tuple[str, ...]
    # Source branch ref, when provided.
    head_branch: str | None
    # Target branch ref, when provided.
    base_branch: str | None
@dataclass(frozen=True)
class WorkflowJob:
    """Workflow job data returned by Gitea Actions.

    Immutable summary of one job from the API payload.
    """

    # Job identifier.
    id: int
    # Job name.
    name: str
    # Identifier of the run the job belongs to, when provided.
    run_id: int | None
    # Status string as reported by the API, when provided.
    status: str | None
    # Conclusion string as reported by the API, when provided.
    conclusion: str | None
class GiteaError(RuntimeError):
    """Raised when Gitea rejects an API request.

    Also raised in this module when a response payload lacks a required
    identifier or when requested labels do not exist in the repository.
    """
def split_repo_name(repo: str) -> tuple[str, str]:
    """Split an owner/repo string into its parts.

    Args:
        repo: Repository in ``owner/repo`` form.

    Returns:
        The ``(owner, repo_name)`` pair.

    Raises:
        ValueError: If the string does not contain exactly one ``/`` separating
            a non-empty owner from a non-empty repository name.
    """
    owner, separator, repo_name = repo.partition("/")
    # A second "/" would end up inside repo_name and then inside request paths,
    # silently addressing a different API route.
    if not separator or not owner or not repo_name or "/" in repo_name:
        msg = f"Invalid repository name: {repo}"
        raise ValueError(msg)
    return owner, repo_name
class GiteaClient:
    """HTTP client for the subset of Gitea APIs used in this repository.

    The client owns an ``httpx.Client``; use it as a context manager (or call
    ``close``) to release it. Every call goes through ``_request``, which raises
    ``GiteaError`` when the response status is not one of the expected codes.
    """

    def __init__(
        self,
        *,
        base_url: str,
        token: str,
        timeout: int = 30,
        transport: httpx.BaseTransport | None = None,
    ) -> None:
        """Initialize the Gitea client.

        Args:
            base_url: Root URL of the Gitea instance; trailing slashes are stripped.
            token: API token sent as ``Authorization: token <token>``.
            timeout: Timeout handed to ``httpx.Client``.
            transport: Optional transport handed to ``httpx.Client``.
        """
        self._client = httpx.Client(
            base_url=base_url.rstrip("/"),
            timeout=timeout,
            headers={"Authorization": f"token {token}"},
            transport=transport,
        )

    def create_issue(
        self,
        *,
        owner: str,
        repo: str,
        title: str,
        body: str,
        labels: list[int] | None = None,
    ) -> CreatedIssue:
        """Create a Gitea issue.

        Args:
            owner: Repository owner.
            repo: Repository name.
            title: Issue title.
            body: Issue body text.
            labels: Label IDs to attach; None is sent as an empty list.

        Raises:
            GiteaError: If the server does not answer 201.
        """
        payload: dict[str, object] = {"title": title, "body": body, "labels": labels or []}
        response = self._request(
            "POST",
            f"/api/v1/repos/{owner}/{repo}/issues",
            expected_statuses={EXPECTED_CREATED},
            json=payload,
        )
        data = response.json()
        return CreatedIssue(
            number=_optional_int(data.get("number")),
            html_url=_optional_str(data.get("html_url")),
            title=str(data.get("title", title)),
        )

    def resolve_label_ids(self, *, owner: str, repo: str, labels: list[str]) -> list[int]:
        """Resolve label names to Gitea label IDs.

        Returns:
            IDs in the same order as ``labels``; an empty list (without any request)
            when ``labels`` is empty.

        Raises:
            GiteaError: If any requested label does not exist in the repository.
        """
        if not labels:
            return []
        available_labels: dict[str, int] = {}
        for label in self._paginate(f"/api/v1/repos/{owner}/{repo}/labels"):
            label_name = str(label.get("name", ""))
            label_id = _optional_int(label.get("id"))
            if label_name and label_id is not None:
                available_labels[label_name] = label_id
        missing = [label for label in labels if label not in available_labels]
        if missing:
            missing_names = ", ".join(sorted(missing))
            msg = f"Missing Gitea labels: {missing_names}"
            raise GiteaError(msg)
        return [available_labels[label] for label in labels]

    def list_open_pull_requests(
        self,
        *,
        owner: str,
        repo: str,
        labels: list[str] | None = None,
        head: str | None = None,
    ) -> list[PullRequest]:
        """List open pull requests for a repository.

        Args:
            owner: Repository owner.
            repo: Repository name.
            labels: When given, only pull requests carrying all of these label
                names are returned (filtered client-side).
            head: When given, only pull requests whose head branch equals this
                value are returned (filtered client-side).
        """
        expected_labels = set(labels or [])
        pull_requests: list[PullRequest] = []
        for item in self._paginate(f"/api/v1/repos/{owner}/{repo}/pulls", params={"state": "open"}):
            pull_request = _pull_request_from_api(item)
            if head and pull_request.head_branch != head:
                continue
            if expected_labels and not expected_labels.issubset(set(pull_request.labels)):
                continue
            pull_requests.append(pull_request)
        return pull_requests

    def create_pull_request(
        self,
        *,
        owner: str,
        repo: str,
        title: str,
        body: str,
        head: str,
        base: str,
        labels: list[str] | None = None,
    ) -> PullRequest:
        """Create a pull request.

        Label names, when given, are first resolved to IDs via ``resolve_label_ids``.

        Raises:
            GiteaError: If a label is missing or the server does not answer 201.
        """
        payload: dict[str, object] = {
            "title": title,
            "body": body,
            "head": head,
            "base": base,
        }
        if labels:
            payload["labels"] = self.resolve_label_ids(owner=owner, repo=repo, labels=labels)
        response = self._request(
            "POST",
            f"/api/v1/repos/{owner}/{repo}/pulls",
            expected_statuses={EXPECTED_CREATED},
            json=payload,
        )
        return _pull_request_from_api(response.json())

    def merge_pull_request(
        self,
        *,
        owner: str,
        repo: str,
        number: int,
        merge_method: str = "rebase",
        head_commit_id: str | None = None,
        delete_branch_after_merge: bool = False,
    ) -> None:
        """Merge a pull request.

        Args:
            owner: Repository owner.
            repo: Repository name.
            number: Pull request number.
            merge_method: Value sent as the request's "Do" field.
            head_commit_id: When given, sent as "head_commit_id".
            delete_branch_after_merge: Sent as "delete_branch_after_merge".

        Raises:
            GiteaError: If the server does not answer 200.
        """
        payload: dict[str, object] = {
            "Do": merge_method,
            "delete_branch_after_merge": delete_branch_after_merge,
        }
        if head_commit_id:
            payload["head_commit_id"] = head_commit_id
        self._request(
            "POST",
            f"/api/v1/repos/{owner}/{repo}/pulls/{number}/merge",
            json=payload,
        )

    def dispatch_workflow(self, *, owner: str, repo: str, workflow_id: str, ref: str) -> None:
        """Trigger a workflow_dispatch run.

        The workflow ID is percent-encoded before being placed in the path. Both
        200 and 204 are accepted as success.
        """
        workflow_path = quote(workflow_id, safe="")
        self._request(
            "POST",
            f"/api/v1/repos/{owner}/{repo}/actions/workflows/{workflow_path}/dispatches",
            expected_statuses={EXPECTED_OK, EXPECTED_NO_CONTENT},
            json={"ref": ref},
        )

    def list_run_jobs(self, *, owner: str, repo: str, run_id: str | int) -> list[WorkflowJob]:
        """List workflow jobs for a specific run.

        All jobs of the repository are fetched and filtered client-side by
        comparing each job's "run_id" (as a string) with ``run_id``.
        """
        wanted_run = str(run_id)
        all_jobs = self._paginate(f"/api/v1/repos/{owner}/{repo}/actions/jobs", items_key="jobs")
        return [_workflow_job_from_api(item) for item in all_jobs if str(item.get("run_id")) == wanted_run]

    def download_job_logs(self, *, owner: str, repo: str, job_id: int) -> str:
        """Download logs for a workflow job and return them as text."""
        response = self._request(
            "GET",
            f"/api/v1/repos/{owner}/{repo}/actions/jobs/{job_id}/logs",
        )
        return response.text

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> Self:
        """Enter the context manager."""
        return self

    def __exit__(self, *args: object) -> None:
        """Close the HTTP client."""
        self.close()

    def _paginate(
        self,
        path: str,
        *,
        params: dict[str, object] | None = None,
        items_key: str | None = None,
    ) -> list[dict[str, object]]:
        """Collect every item from a paginated GET endpoint.

        Pages are requested until the server returns an empty page. A page shorter
        than DEFAULT_PAGE_SIZE is deliberately NOT treated as the last one: a server
        that caps the page size below the requested limit returns short pages while
        more data remains, and stopping there would silently truncate the result.

        Args:
            path: Endpoint path.
            params: Extra query parameters sent with every page request.
            items_key: When set, the response is a JSON object and the items are
                read from this key; otherwise the response itself is the item list.
        """
        items: list[dict[str, object]] = []
        previous_batch = None
        page = 1
        while True:
            query = {**(params or {}), "page": page, "limit": DEFAULT_PAGE_SIZE}
            payload = self._request("GET", path, params=query).json()
            batch = payload.get(items_key) if items_key else payload
            # An identical consecutive page means the endpoint ignored "page";
            # stop instead of looping forever.
            if not batch or batch == previous_batch:
                return items
            items.extend(batch)
            previous_batch = batch
            page += 1

    def _request(
        self,
        method: str,
        path: str,
        *,
        expected_statuses: set[int] | None = None,
        **kwargs: object,
    ) -> httpx.Response:
        """Send an HTTP request and validate the response status.

        Args:
            method: HTTP method.
            path: Path relative to the client's base URL.
            expected_statuses: Status codes treated as success; defaults to {200}.
            **kwargs: Passed through to ``httpx.Client.request``.

        Raises:
            GiteaError: If the status code is not expected; the message includes
                the status code and response body.
        """
        response = self._client.request(method, path, **kwargs)
        statuses = expected_statuses or {EXPECTED_OK}
        if response.status_code not in statuses:
            msg = f"Gitea request failed ({response.status_code}): {response.text}"
            raise GiteaError(msg)
        return response
def _pull_request_from_api(data: dict[str, object]) -> PullRequest:
    """Convert Gitea API pull-request data into a dataclass.

    The number is read from "number", falling back to "index". Branch names are
    read from the nested "head"/"base" objects' "ref", falling back to the flat
    "head_branch"/"base_branch" keys. Labels with an empty name are dropped.

    Raises:
        GiteaError: If the payload has neither "number" nor "index".
    """
    number = _optional_int(data.get("number")) or _optional_int(data.get("index"))
    if number is None:
        msg = "Gitea pull request payload is missing a number"
        raise GiteaError(msg)
    # `or` instead of a .get() default: a key that is present with a JSON null
    # would otherwise come back as None and break the iteration/.get() below.
    raw_labels = data.get("labels") or []
    head = data.get("head") or {}
    base = data.get("base") or {}
    label_names = (str(label.get("name", "")) for label in raw_labels)
    return PullRequest(
        number=number,
        title=str(data.get("title", "")),
        html_url=_optional_str(data.get("html_url")),
        labels=tuple(name for name in label_names if name),
        head_branch=_optional_str(head.get("ref")) or _optional_str(data.get("head_branch")),
        base_branch=_optional_str(base.get("ref")) or _optional_str(data.get("base_branch")),
    )
def _workflow_job_from_api(data: dict[str, object]) -> WorkflowJob:
    """Build a WorkflowJob from one job object of a Gitea Actions response.

    Raises:
        GiteaError: If the payload has no "id".
    """
    job_id = _optional_int(data.get("id"))
    if job_id is not None:
        job_name = str(data.get("name", ""))
        return WorkflowJob(
            id=job_id,
            name=job_name,
            run_id=_optional_int(data.get("run_id")),
            status=_optional_str(data.get("status")),
            conclusion=_optional_str(data.get("conclusion")),
        )
    msg = "Gitea workflow job payload is missing an ID"
    raise GiteaError(msg)
def _optional_int(value: object) -> int | None:
"""Convert an API value to an integer when present."""
if value is None:
return None
return int(value)
def _optional_str(value: object) -> str | None:
"""Convert an API value to a string when present."""
if value is None:
return None
return str(value)
-148
View File
@@ -1,148 +0,0 @@
"""Automation helpers for flake.lock pull requests on Gitea."""
from __future__ import annotations
import subprocess
from os import getenv
from typing import Annotated
import typer
from python.gitea import GiteaClient, PullRequest, split_repo_name
# Default for the --base option of the `update` command.
DEFAULT_BASE_BRANCH = "main"
# Default for the --branch option: the branch the lock-file commit is pushed to.
DEFAULT_BRANCH = "automation/update-flake-lock"
# Gitea instance used when the GITEA_URL environment variable is not set.
DEFAULT_GITEA_URL = "https://gitea.tmmworkshop.com"
# Label names attached to a newly created flake.lock pull request.
PR_LABELS = ["dependencies", "automated", "flake_lock_update"]
# Workflow files dispatched manually against the automation branch.
PR_CHECK_WORKFLOWS = ["build_systems.yml", "treefmt.yml", "pytest.yml"]
PR_TITLE = "Update flake.lock"
PR_BODY = "Automated flake.lock update."
# Typer application exposing the `update` and `merge` commands.
app = typer.Typer(add_completion=False)
def run_cmd(cmd: list[str], *, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Execute a command, capturing stdout and stderr as text.

    Args:
        cmd: Program and arguments.
        check: When True, a non-zero exit raises ``subprocess.CalledProcessError``.

    Returns:
        The completed process, with text output captured.
    """
    completed = subprocess.run(cmd, check=check, text=True, capture_output=True)
    return completed
def ensure_flake_lock_pull_request(
    client: GiteaClient,
    *,
    owner: str,
    repo: str,
    branch: str,
    base: str,
) -> PullRequest:
    """Return the open pull request for ``branch``, creating one if none exists.

    When several open pull requests share the head branch, the first one listed
    is returned. A newly created pull request uses PR_TITLE, PR_BODY and PR_LABELS.
    """
    open_for_branch = client.list_open_pull_requests(owner=owner, repo=repo, head=branch)
    if not open_for_branch:
        return client.create_pull_request(
            owner=owner,
            repo=repo,
            title=PR_TITLE,
            body=PR_BODY,
            head=branch,
            base=base,
            labels=PR_LABELS,
        )
    return open_for_branch[0]
def find_flake_lock_pull_request(client: GiteaClient, *, owner: str, repo: str) -> PullRequest | None:
    """Return the first open pull request labelled ``flake_lock_update``, or None."""
    labelled = client.list_open_pull_requests(owner=owner, repo=repo, labels=["flake_lock_update"])
    return labelled[0] if labelled else None
def dispatch_pull_request_checks(client: GiteaClient, *, owner: str, repo: str, branch: str) -> None:
    """Trigger each workflow in PR_CHECK_WORKFLOWS against ``branch``, in order."""
    target = {"owner": owner, "repo": repo, "ref": branch}
    for workflow_file in PR_CHECK_WORKFLOWS:
        client.dispatch_workflow(workflow_id=workflow_file, **target)
def has_worktree_changes() -> bool:
    """Report whether ``git diff --quiet -- flake.lock`` exits non-zero.

    With ``--quiet`` git signals a difference through its exit status, so the
    command is run with ``check=False`` and the return code is inspected.
    """
    diff_command = ["git", "diff", "--quiet", "--", "flake.lock"]
    return run_cmd(diff_command, check=False).returncode != 0
def commit_flake_lock_update(*, branch: str) -> None:
    """Commit flake.lock on ``branch`` as the gitea-actions bot.

    Sets the committer identity, (re)creates the branch at the current HEAD with
    ``checkout -B``, stages flake.lock and commits it. Any failing git command
    raises ``subprocess.CalledProcessError``.
    """
    git_steps = (
        ["config", "user.name", "gitea-actions[bot]"],
        ["config", "user.email", "gitea-actions@tmmworkshop.com"],
        ["checkout", "-B", branch],
        ["add", "flake.lock"],
        ["commit", "-m", "chore: update flake.lock"],
    )
    for git_args in git_steps:
        run_cmd(["git", *git_args])
def push_branch(*, branch: str) -> None:
    """Force-push the current HEAD to ``branch`` on the ``origin`` remote."""
    refspec = f"HEAD:{branch}"
    run_cmd(["git", "push", "origin", refspec, "--force"])
def _required_gitea_token() -> str:
"""Read the required Gitea token from the environment."""
token = getenv("GITEA_TOKEN")
if token:
return token
msg = "GITEA_TOKEN environment variable is required"
raise RuntimeError(msg)
@app.command()
def update(
    repo: Annotated[str, typer.Option("--repo", help="Gitea repository in owner/repo form")],
    base: Annotated[str, typer.Option("--base", help="Base branch")] = DEFAULT_BASE_BRANCH,
    branch: Annotated[str, typer.Option("--branch", help="Automation branch")] = DEFAULT_BRANCH,
) -> None:
    """Commit flake.lock changes and ensure a pull request exists.

    Does nothing (beyond printing a message) when flake.lock has no worktree
    changes. Otherwise commits the file to the automation branch, force-pushes
    it, finds or creates the pull request, dispatches the check workflows and
    prints the pull request URL (or its number when no URL is available).

    Raises:
        ValueError: If ``repo`` is not in owner/repo form.
        RuntimeError: If GITEA_TOKEN is not set.
    """
    if not has_worktree_changes():
        typer.echo("No flake.lock changes detected")
        return

    # Validate the inputs before touching git, so a malformed --repo or a missing
    # token fails before the branch is committed and force-pushed, not after.
    owner, repo_name = split_repo_name(repo)
    token = _required_gitea_token()

    commit_flake_lock_update(branch=branch)
    push_branch(branch=branch)

    with GiteaClient(
        base_url=getenv("GITEA_URL", DEFAULT_GITEA_URL),
        token=token,
    ) as client:
        pull_request = ensure_flake_lock_pull_request(
            client,
            owner=owner,
            repo=repo_name,
            branch=branch,
            base=base,
        )
        # We can remove this if Gitea fixes the following issue:
        # https://github.com/go-gitea/gitea/issues/33963
        dispatch_pull_request_checks(client, owner=owner, repo=repo_name, branch=branch)
    typer.echo(pull_request.html_url or f"Pull request #{pull_request.number}")
@app.command()
def merge(
    repo: Annotated[str, typer.Option("--repo", help="Gitea repository in owner/repo form")],
) -> None:
    """Rebase-merge the first open pull request labelled flake_lock_update.

    Prints a message and returns without merging when no such pull request is open.
    """
    owner, repo_name = split_repo_name(repo)
    gitea_url = getenv("GITEA_URL", DEFAULT_GITEA_URL)
    with GiteaClient(base_url=gitea_url, token=_required_gitea_token()) as client:
        pull_request = find_flake_lock_pull_request(client, owner=owner, repo=repo_name)
        if pull_request:
            client.merge_pull_request(
                owner=owner,
                repo=repo_name,
                number=pull_request.number,
                merge_method="rebase",
            )
            typer.echo(f"Merged PR #{pull_request.number}")
        else:
            typer.echo("No open PR found with label flake_lock_update")
# Run the Typer app only when this file is executed as a script.
if __name__ == "__main__":
    app()
@@ -1,14 +0,0 @@
"""init."""
from python.orm.data_science_dev.congress.bill import Bill, BillText
from python.orm.data_science_dev.congress.legislator import Legislator, LegislatorSocialMedia
from python.orm.data_science_dev.congress.vote import Vote, VoteRecord
__all__ = [
"Bill",
"BillText",
"Legislator",
"LegislatorSocialMedia",
"Vote",
"VoteRecord",
]
@@ -1,66 +0,0 @@
"""Bill model - legislation introduced in Congress."""
from __future__ import annotations
from datetime import date
from typing import TYPE_CHECKING
from sqlalchemy import ForeignKey, Index, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.data_science_dev.base import DataScienceDevTableBase
if TYPE_CHECKING:
from python.orm.data_science_dev.congress.vote import Vote
class Bill(DataScienceDevTableBase):
    """Legislation with congress number, type, titles, status, and sponsor.

    A bill is uniquely identified by (congress, bill_type, number).
    """

    __tablename__ = "bill"

    # Natural key columns; uniqueness is enforced in __table_args__ below.
    congress: Mapped[int]
    bill_type: Mapped[str]
    number: Mapped[int]

    title: Mapped[str | None]
    title_short: Mapped[str | None]
    official_title: Mapped[str | None]

    status: Mapped[str | None]
    status_at: Mapped[date | None]

    # Stored as a plain string; there is no foreign key to the legislator table.
    sponsor_bioguide_id: Mapped[str | None]

    subjects_top_term: Mapped[str | None]

    # Votes that reference this bill (no delete cascade configured here).
    votes: Mapped[list[Vote]] = relationship(
        "Vote",
        back_populates="bill",
    )
    # Text versions owned by this bill; removed together with it.
    bill_texts: Mapped[list[BillText]] = relationship(
        "BillText",
        back_populates="bill",
        cascade="all, delete-orphan",
    )

    __table_args__ = (
        UniqueConstraint("congress", "bill_type", "number", name="uq_bill_congress_type_number"),
        Index("ix_bill_congress", "congress"),
    )
class BillText(DataScienceDevTableBase):
    """Stores different text versions of a bill (introduced, enrolled, etc.).

    Each bill has at most one row per version_code.
    """

    __tablename__ = "bill_text"

    # Owning bill; the database deletes the text rows when the bill row is deleted.
    bill_id: Mapped[int] = mapped_column(ForeignKey("main.bill.id", ondelete="CASCADE"))
    version_code: Mapped[str]
    version_name: Mapped[str | None]
    text_content: Mapped[str | None]
    date: Mapped[date | None]

    bill: Mapped[Bill] = relationship("Bill", back_populates="bill_texts")

    __table_args__ = (UniqueConstraint("bill_id", "version_code", name="uq_bill_text_bill_id_version_code"),)
@@ -1,66 +0,0 @@
"""Legislator model - members of Congress."""
from __future__ import annotations
from datetime import date
from typing import TYPE_CHECKING
from sqlalchemy import ForeignKey, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.data_science_dev.base import DataScienceDevTableBase
if TYPE_CHECKING:
from python.orm.data_science_dev.congress.vote import VoteRecord
class Legislator(DataScienceDevTableBase):
    """Members of Congress with identification and current term info.

    Rows are identified externally by bioguide_id, which is unique and indexed.
    """

    __tablename__ = "legislator"

    # External identifiers.
    bioguide_id: Mapped[str] = mapped_column(Text, unique=True, index=True)
    thomas_id: Mapped[str | None]
    lis_id: Mapped[str | None]
    govtrack_id: Mapped[int | None]
    opensecrets_id: Mapped[str | None]
    fec_ids: Mapped[str | None]

    # Name; first and last name are required, the rest optional.
    first_name: Mapped[str]
    last_name: Mapped[str]
    official_full_name: Mapped[str | None]
    nickname: Mapped[str | None]

    birthday: Mapped[date | None]
    gender: Mapped[str | None]

    # Current term details.
    current_party: Mapped[str | None]
    current_state: Mapped[str | None]
    current_district: Mapped[int | None]
    current_chamber: Mapped[str | None]

    # Both child collections are deleted together with the legislator.
    social_media_accounts: Mapped[list[LegislatorSocialMedia]] = relationship(
        "LegislatorSocialMedia",
        back_populates="legislator",
        cascade="all, delete-orphan",
    )
    vote_records: Mapped[list[VoteRecord]] = relationship(
        "VoteRecord",
        back_populates="legislator",
        cascade="all, delete-orphan",
    )
class LegislatorSocialMedia(DataScienceDevTableBase):
    """Social media account linked to a legislator.

    NOTE(review): no unique constraint on (legislator_id, platform) is declared
    here; confirm whether uniqueness is meant to be enforced by the database.
    """

    __tablename__ = "legislator_social_media"

    legislator_id: Mapped[int] = mapped_column(ForeignKey("main.legislator.id"))
    platform: Mapped[str]
    account_name: Mapped[str]
    # Profile URL; optional.
    url: Mapped[str | None]
    # Where the account information came from.
    source: Mapped[str]

    legislator: Mapped[Legislator] = relationship(back_populates="social_media_accounts")
@@ -1,79 +0,0 @@
"""Vote model - roll call votes in Congress."""
from __future__ import annotations
from datetime import date
from typing import TYPE_CHECKING
from sqlalchemy import ForeignKey, Index, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.data_science_dev.base import DataScienceDevBase, DataScienceDevTableBase
if TYPE_CHECKING:
from python.orm.data_science_dev.congress.bill import Bill
from python.orm.data_science_dev.congress.legislator import Legislator
from python.orm.data_science_dev.congress.vote import Vote
class VoteRecord(DataScienceDevBase):
    """Links a vote to a legislator with their position (Yea, Nay, etc.).

    The primary key is the composite ``(vote_id, legislator_id)``, so a
    legislator can have at most one record per vote.
    """

    __tablename__ = "vote_record"

    # Both foreign keys use ON DELETE CASCADE, so the database removes the
    # record when either the vote or the legislator row is deleted.
    vote_id: Mapped[int] = mapped_column(
        ForeignKey("main.vote.id", ondelete="CASCADE"),
        primary_key=True,
    )
    legislator_id: Mapped[int] = mapped_column(
        ForeignKey("main.legislator.id", ondelete="CASCADE"),
        primary_key=True,
    )

    # Position taken by the legislator, stored as free text.
    position: Mapped[str]

    vote: Mapped[Vote] = relationship("Vote", back_populates="vote_records")
    legislator: Mapped[Legislator] = relationship("Legislator", back_populates="vote_records")
class Vote(DataScienceDevTableBase):
    """Roll call votes with counts and optional bill linkage.

    A vote is unique per ``(congress, chamber, session, number)``; see
    ``__table_args__``.
    """

    __tablename__ = "vote"

    # Columns covered by the unique constraint below.
    congress: Mapped[int]
    chamber: Mapped[str]
    session: Mapped[int]
    number: Mapped[int]

    # Optional descriptive fields.
    vote_type: Mapped[str | None]
    question: Mapped[str | None]
    result: Mapped[str | None]
    result_text: Mapped[str | None]

    # Required; indexed by ``ix_vote_date``.
    vote_date: Mapped[date]

    # Optional tallies stored on the vote row itself.
    yea_count: Mapped[int | None]
    nay_count: Mapped[int | None]
    not_voting_count: Mapped[int | None]
    present_count: Mapped[int | None]

    # Nullable link to a bill; a vote may exist without one.
    bill_id: Mapped[int | None] = mapped_column(ForeignKey("main.bill.id"))
    bill: Mapped[Bill | None] = relationship("Bill", back_populates="votes")

    # Per-legislator positions; deleted by the ORM along with the vote.
    vote_records: Mapped[list[VoteRecord]] = relationship(
        "VoteRecord",
        back_populates="vote",
        cascade="all, delete-orphan",
    )

    __table_args__ = (
        UniqueConstraint(
            "congress",
            "chamber",
            "session",
            "number",
            name="uq_vote_congress_chamber_session_number",
        ),
        Index("ix_vote_date", "vote_date"),
        Index("ix_vote_congress_chamber", "congress", "chamber"),
    )
-6
View File
@@ -2,15 +2,9 @@
from __future__ import annotations from __future__ import annotations
from python.orm.data_science_dev.congress import Bill, BillText, Legislator, Vote, VoteRecord
from python.orm.data_science_dev.posts import partitions # noqa: F401 — registers partition classes in metadata from python.orm.data_science_dev.posts import partitions # noqa: F401 — registers partition classes in metadata
from python.orm.data_science_dev.posts.tables import Posts from python.orm.data_science_dev.posts.tables import Posts
__all__ = [ __all__ = [
"Bill",
"BillText",
"Legislator",
"Posts", "Posts",
"Vote",
"VoteRecord",
] ]
+5 -4
View File
@@ -2,8 +2,8 @@
from __future__ import annotations from __future__ import annotations
from python.orm.richie.audiobook import Audiobook, AudiobookAuthor, AudiobookSeries
from python.orm.richie.base import RichieBase, TableBase, TableBaseBig, TableBaseSmall from python.orm.richie.base import RichieBase, TableBase, TableBaseBig, TableBaseSmall
from python.orm.richie.congress import Bill, Legislator, Vote, VoteRecord
from python.orm.richie.contact import ( from python.orm.richie.contact import (
Contact, Contact,
ContactNeed, ContactNeed,
@@ -13,16 +13,17 @@ from python.orm.richie.contact import (
) )
__all__ = [ __all__ = [
"Audiobook", "Bill",
"AudiobookAuthor",
"AudiobookSeries",
"Contact", "Contact",
"ContactNeed", "ContactNeed",
"ContactRelationship", "ContactRelationship",
"Legislator",
"Need", "Need",
"RelationshipType", "RelationshipType",
"RichieBase", "RichieBase",
"TableBase", "TableBase",
"TableBaseBig", "TableBaseBig",
"TableBaseSmall", "TableBaseSmall",
"Vote",
"VoteRecord",
] ]
-46
View File
@@ -1,46 +0,0 @@
"""Audiobook catalog models."""
from __future__ import annotations
from sqlalchemy import ForeignKey, String, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.richie.base import TableBase
class AudiobookAuthor(TableBase):
    """Canonical audiobook author.

    Parent of both ``Audiobook`` rows (``books``) and ``AudiobookSeries`` rows
    (``series``).
    """

    __tablename__ = "audiobook_author"

    # Author names are unique across the table.
    name: Mapped[str] = mapped_column(String, unique=True)

    books: Mapped[list[Audiobook]] = relationship("Audiobook", back_populates="author")
    series: Mapped[list[AudiobookSeries]] = relationship("AudiobookSeries", back_populates="author")
class AudiobookSeries(TableBase):
    """Canonical audiobook series.

    A series belongs to exactly one author, and its name is unique per author.
    """

    __tablename__ = "audiobook_series"
    # Two authors may each have a series with the same name; one author may not.
    __table_args__ = (UniqueConstraint("author_id", "name"),)

    name: Mapped[str] = mapped_column(String)
    # ON DELETE CASCADE: deleting the author row removes its series rows.
    author_id: Mapped[int] = mapped_column(ForeignKey("main.audiobook_author.id", ondelete="CASCADE"))

    author: Mapped[AudiobookAuthor] = relationship("AudiobookAuthor", back_populates="series")
    books: Mapped[list[Audiobook]] = relationship("Audiobook", back_populates="series")
class Audiobook(TableBase):
    """Canonical audiobook title.

    Every book has an author; membership in a series is optional.
    """

    __tablename__ = "audiobook"

    title: Mapped[str] = mapped_column(String)
    # ON DELETE CASCADE: deleting the author row removes its books.
    author_id: Mapped[int] = mapped_column(ForeignKey("main.audiobook_author.id", ondelete="CASCADE"))
    # ON DELETE SET NULL: deleting a series keeps the book and clears the link.
    series_id: Mapped[int | None] = mapped_column(ForeignKey("main.audiobook_series.id", ondelete="SET NULL"))
    # Defaults to 0 when not supplied.
    # NOTE(review): presumably 0 marks a book outside any series - confirm.
    series_index: Mapped[int] = mapped_column(default=0)

    author: Mapped[AudiobookAuthor] = relationship("AudiobookAuthor", back_populates="books")
    series: Mapped[AudiobookSeries | None] = relationship("AudiobookSeries", back_populates="books")
+150
View File
@@ -0,0 +1,150 @@
"""Congress Tracker database models."""
from __future__ import annotations
from datetime import date
from sqlalchemy import ForeignKey, Index, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.richie.base import RichieBase, TableBase
class Legislator(TableBase):
    """Legislator model - members of Congress.

    Parent of ``VoteRecord`` rows through ``vote_records``. That relationship
    uses ``cascade="all, delete-orphan"``, so the ORM deletes records removed
    from the collection or belonging to a deleted legislator.
    """

    __tablename__ = "legislator"

    # Natural key - bioguide ID is the authoritative identifier
    bioguide_id: Mapped[str] = mapped_column(Text, unique=True, index=True)

    # Other IDs for cross-referencing
    thomas_id: Mapped[str | None]
    lis_id: Mapped[str | None]
    govtrack_id: Mapped[int | None]
    opensecrets_id: Mapped[str | None]
    fec_ids: Mapped[str | None]  # JSON array stored as string

    # Name info
    first_name: Mapped[str]
    last_name: Mapped[str]
    official_full_name: Mapped[str | None]
    nickname: Mapped[str | None]

    # Bio
    birthday: Mapped[date | None]
    gender: Mapped[str | None]  # M/F

    # Current term info (denormalized for query efficiency)
    current_party: Mapped[str | None]
    current_state: Mapped[str | None]
    current_district: Mapped[int | None]  # House only
    current_chamber: Mapped[str | None]  # rep/sen

    # Relationships
    vote_records: Mapped[list[VoteRecord]] = relationship(
        "VoteRecord",
        back_populates="legislator",
        cascade="all, delete-orphan",
    )
class Bill(TableBase):
    """Bill model - legislation introduced in Congress.

    Uniqueness is enforced on ``(congress, bill_type, number)``. The ``votes``
    relationship declares no cascade, so deleting a bill through the ORM does
    not delete its votes.
    """

    __tablename__ = "bill"

    # Composite natural key: congress + bill_type + number
    congress: Mapped[int]
    bill_type: Mapped[str]  # hr, s, hres, sres, hjres, sjres
    number: Mapped[int]

    # Bill info
    title: Mapped[str | None]
    title_short: Mapped[str | None]
    official_title: Mapped[str | None]

    # Status
    status: Mapped[str | None]
    status_at: Mapped[date | None]

    # Sponsor
    # Stored as a plain string; no foreign key to ``legislator`` is declared.
    sponsor_bioguide_id: Mapped[str | None]

    # Subjects
    subjects_top_term: Mapped[str | None]

    # Relationships
    votes: Mapped[list[Vote]] = relationship(
        "Vote",
        back_populates="bill",
    )

    __table_args__ = (
        UniqueConstraint("congress", "bill_type", "number", name="uq_bill_congress_type_number"),
        Index("ix_bill_congress", "congress"),
    )
class Vote(TableBase):
    """Vote model - roll call votes in Congress.

    Uniqueness is enforced on ``(congress, chamber, session, number)``.
    Per-legislator positions live in ``vote_records`` and are deleted by the
    ORM together with the vote.
    """

    __tablename__ = "vote"

    # Composite natural key: congress + chamber + session + number
    congress: Mapped[int]
    chamber: Mapped[str]  # house/senate
    session: Mapped[int]
    number: Mapped[int]

    # Vote details
    vote_type: Mapped[str | None]
    question: Mapped[str | None]
    result: Mapped[str | None]
    result_text: Mapped[str | None]

    # Timing
    vote_date: Mapped[date]

    # Vote counts (denormalized for efficiency)
    yea_count: Mapped[int | None]
    nay_count: Mapped[int | None]
    not_voting_count: Mapped[int | None]
    present_count: Mapped[int | None]

    # Related bill (optional - not all votes are on bills)
    bill_id: Mapped[int | None] = mapped_column(ForeignKey("main.bill.id"))

    # Relationships
    bill: Mapped[Bill | None] = relationship("Bill", back_populates="votes")
    vote_records: Mapped[list[VoteRecord]] = relationship(
        "VoteRecord",
        back_populates="vote",
        cascade="all, delete-orphan",
    )

    __table_args__ = (
        UniqueConstraint("congress", "chamber", "session", "number", name="uq_vote_congress_chamber_session_number"),
        Index("ix_vote_date", "vote_date"),
        Index("ix_vote_congress_chamber", "congress", "chamber"),
    )
class VoteRecord(RichieBase):
    """Association table: Vote <-> Legislator with position.

    The primary key is the composite ``(vote_id, legislator_id)``, so each
    legislator has at most one record per vote.
    """

    __tablename__ = "vote_record"

    # Both foreign keys use ON DELETE CASCADE, so the database removes the
    # record when either parent row is deleted.
    vote_id: Mapped[int] = mapped_column(
        ForeignKey("main.vote.id", ondelete="CASCADE"),
        primary_key=True,
    )
    legislator_id: Mapped[int] = mapped_column(
        ForeignKey("main.legislator.id", ondelete="CASCADE"),
        primary_key=True,
    )

    position: Mapped[str]  # Yea, Nay, Not Voting, Present

    # Relationships
    vote: Mapped[Vote] = relationship("Vote", back_populates="vote_records")
    legislator: Mapped[Legislator] = relationship("Legislator", back_populates="vote_records")
+19 -17
View File
@@ -63,9 +63,9 @@ class DeviceRegistry:
return return
with Session(self.engine) as session: with Session(self.engine) as session:
device = session.scalars( device = session.execute(
select(SignalDevice).where(SignalDevice.phone_number == phone_number) select(SignalDevice).where(SignalDevice.phone_number == phone_number)
).one_or_none() ).scalar_one_or_none()
if device: if device:
if device.safety_number != safety_number and device.trust_level != TrustLevel.BLOCKED: if device.safety_number != safety_number and device.trust_level != TrustLevel.BLOCKED:
@@ -99,9 +99,9 @@ class DeviceRegistry:
Returns True if the device was found and verified. Returns True if the device was found and verified.
""" """
with Session(self.engine) as session: with Session(self.engine) as session:
device = session.scalars( device = session.execute(
select(SignalDevice).where(SignalDevice.phone_number == phone_number) select(SignalDevice).where(SignalDevice.phone_number == phone_number)
).one_or_none() ).scalar_one_or_none()
if not device: if not device:
logger.warning(f"Cannot verify unknown device: {phone_number}") logger.warning(f"Cannot verify unknown device: {phone_number}")
@@ -139,9 +139,9 @@ class DeviceRegistry:
def grant_role(self, phone_number: str, role: Role) -> bool: def grant_role(self, phone_number: str, role: Role) -> bool:
"""Add a role to a device. Called by admin over SSH.""" """Add a role to a device. Called by admin over SSH."""
with Session(self.engine) as session: with Session(self.engine) as session:
device = session.scalars( device = session.execute(
select(SignalDevice).where(SignalDevice.phone_number == phone_number) select(SignalDevice).where(SignalDevice.phone_number == phone_number)
).one_or_none() ).scalar_one_or_none()
if not device: if not device:
logger.warning(f"Cannot grant role for unknown device: {phone_number}") logger.warning(f"Cannot grant role for unknown device: {phone_number}")
@@ -150,7 +150,7 @@ class DeviceRegistry:
if any(record.name == role for record in device.roles): if any(record.name == role for record in device.roles):
return True return True
role_record = session.scalars(select(RoleRecord).where(RoleRecord.name == role)).one_or_none() role_record = session.execute(select(RoleRecord).where(RoleRecord.name == role)).scalar_one_or_none()
if not role_record: if not role_record:
logger.warning(f"Unknown role: {role}") logger.warning(f"Unknown role: {role}")
@@ -165,9 +165,9 @@ class DeviceRegistry:
def revoke_role(self, phone_number: str, role: Role) -> bool: def revoke_role(self, phone_number: str, role: Role) -> bool:
"""Remove a role from a device. Called by admin over SSH.""" """Remove a role from a device. Called by admin over SSH."""
with Session(self.engine) as session: with Session(self.engine) as session:
device = session.scalars( device = session.execute(
select(SignalDevice).where(SignalDevice.phone_number == phone_number) select(SignalDevice).where(SignalDevice.phone_number == phone_number)
).one_or_none() ).scalar_one_or_none()
if not device: if not device:
logger.warning(f"Cannot revoke role for unknown device: {phone_number}") logger.warning(f"Cannot revoke role for unknown device: {phone_number}")
@@ -182,16 +182,16 @@ class DeviceRegistry:
def set_roles(self, phone_number: str, roles: list[Role]) -> bool: def set_roles(self, phone_number: str, roles: list[Role]) -> bool:
"""Replace all roles for a device. Called by admin over SSH.""" """Replace all roles for a device. Called by admin over SSH."""
with Session(self.engine) as session: with Session(self.engine) as session:
device = session.scalars( device = session.execute(
select(SignalDevice).where(SignalDevice.phone_number == phone_number) select(SignalDevice).where(SignalDevice.phone_number == phone_number)
).one_or_none() ).scalar_one_or_none()
if not device: if not device:
logger.warning(f"Cannot set roles for unknown device: {phone_number}") logger.warning(f"Cannot set roles for unknown device: {phone_number}")
return False return False
role_names = [str(role) for role in roles] role_names = [str(role) for role in roles]
records = session.scalars(select(RoleRecord).where(RoleRecord.name.in_(role_names))).all() records = list(session.execute(select(RoleRecord).where(RoleRecord.name.in_(role_names))).scalars().all())
device.roles = records device.roles = records
session.commit() session.commit()
self._update_cache(phone_number, device) self._update_cache(phone_number, device)
@@ -203,7 +203,7 @@ class DeviceRegistry:
def list_devices(self) -> list[SignalDevice]: def list_devices(self) -> list[SignalDevice]:
"""Return all known devices.""" """Return all known devices."""
with Session(self.engine) as session: with Session(self.engine) as session:
return list(session.scalars(select(SignalDevice)).all()) return list(session.execute(select(SignalDevice)).scalars().all())
def sync_identities(self) -> None: def sync_identities(self) -> None:
"""Pull identity list from signal-cli and record any new ones.""" """Pull identity list from signal-cli and record any new ones."""
@@ -226,7 +226,9 @@ class DeviceRegistry:
def _load_device(self, phone_number: str) -> SignalDevice | None: def _load_device(self, phone_number: str) -> SignalDevice | None:
"""Fetch a device by phone number (with joined roles).""" """Fetch a device by phone number (with joined roles)."""
with Session(self.engine) as session: with Session(self.engine) as session:
return session.scalars(select(SignalDevice).where(SignalDevice.phone_number == phone_number)).one_or_none() return session.execute(
select(SignalDevice).where(SignalDevice.phone_number == phone_number)
).scalar_one_or_none()
def _update_cache(self, phone_number: str, device: SignalDevice) -> None: def _update_cache(self, phone_number: str, device: SignalDevice) -> None:
"""Refresh the cache entry for a device.""" """Refresh the cache entry for a device."""
@@ -242,9 +244,9 @@ class DeviceRegistry:
def _set_trust(self, phone_number: str, level: str, log_msg: str | None = None) -> bool: def _set_trust(self, phone_number: str, level: str, log_msg: str | None = None) -> bool:
"""Update the trust level for a device.""" """Update the trust level for a device."""
with Session(self.engine) as session: with Session(self.engine) as session:
device = session.scalars( device = session.execute(
select(SignalDevice).where(SignalDevice.phone_number == phone_number) select(SignalDevice).where(SignalDevice.phone_number == phone_number)
).one_or_none() ).scalar_one_or_none()
if not device: if not device:
return False return False
@@ -267,7 +269,7 @@ def sync_roles(engine: Engine) -> None:
expected = {role.value for role in Role} expected = {role.value for role in Role}
with Session(engine) as session: with Session(engine) as session:
existing = set(session.scalars(select(RoleRecord.name)).all()) existing = {record.name for record in session.execute(select(RoleRecord)).scalars().all()}
to_add = expected - existing to_add = expected - existing
to_remove = existing - expected to_remove = existing - expected
-1
View File
@@ -1 +0,0 @@
"""Audiobook tools."""
-444
View File
@@ -1,444 +0,0 @@
"""Convert Audible AAX downloads into Audiobookshelf-friendly M4B files."""
from __future__ import annotations
import json
import logging
import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor
from dataclasses import asdict, dataclass
from os import getenv
from pathlib import Path # noqa: TC003 This is required for the typer CLI
from typing import TYPE_CHECKING, Annotated, Any
from uuid import uuid7
import typer
from python.common import configure_logger
from python.orm.common import get_postgres_engine
from python.tools.audiobook.metadata_agent import (
AgentConfig,
StandardBookMetadata,
standard_book_metadata,
write_agent_log,
)
if TYPE_CHECKING:
from sqlalchemy.engine import Engine
logger = logging.getLogger(__name__)
SENSITIVE_COMMAND_ARGUMENTS = {"-activation_bytes"}
@dataclass(frozen=True)
class ConversionConfig:
    """Runtime settings for one conversion command.

    Built once in ``main`` and passed unchanged to every per-file conversion.
    The ``*_directory_name`` fields are path components that the conversion
    functions join beneath ``resolved_output``.
    """

    # Resolved output root; final M4B files and the work directory live under it.
    resolved_output: Path
    # API key passed to the metadata agent.
    ollama_api_key: str
    # Settings passed to the metadata agent.
    agent_config: AgentConfig
    # Database engine passed to the metadata agent.
    engine: Engine
    # Optional value for ffmpeg's ``-activation_bytes``; omitted from the command when falsy.
    activation_bytes: str | None
    # When true, only the planned destination is resolved and recorded.
    dry_run: bool
    # When true, an existing destination file is replaced.
    overwrite: bool
    # Name of the working directory created under ``resolved_output``.
    work_directory_name: str = ".audible_convert"
    # Subdirectories of the working directory.
    dry_run_directory_name: str = "dry-run"
    temp_directory_name: str = "tmp"
    log_directory_name: str = "logs"
    review_directory_name: str = "review"
@dataclass(frozen=True)
class ConcurrentConversionResult:
    """Result from running ffmpeg and metadata resolution together.

    Produced by ``convert_temp_file_and_resolve_metadata``. The two error
    fields are independent: either, both, or neither may be set.
    """

    # Resolved metadata, or None when metadata resolution raised.
    metadata: StandardBookMetadata | None
    # Exception raised by the ffmpeg conversion, or None on success.
    conversion_error: Exception | None
    # Exception raised by metadata resolution, or None on success.
    metadata_error: Exception | None
class CommandExecutionError(RuntimeError):
    """Raised when an external command exits non-zero; its message is redacted."""

    def __init__(self, arguments: list[str], returncode: int) -> None:
        """Build the error with a message that hides sensitive argument values.

        Args:
            arguments: Full command line that was executed.
            returncode: Exit status reported by the process.
        """
        redacted_command = " ".join(redact_command_arguments(arguments))
        super().__init__(f"Command failed with exit code {returncode}: {redacted_command}")
        # Only the message is redacted; ``arguments`` keeps the original values.
        self.arguments = tuple(arguments)
        self.returncode = returncode
def main(
    input_directory: Annotated[Path, typer.Argument(help="Directory audible-cli downloads AAX files into.")],
    output_directory: Annotated[Path, typer.Argument(help="Audiobook output directory.")],
    *,
    dry_run: Annotated[
        bool,
        typer.Option("--dry-run", help="Print planned output files and write marker files without converting."),
    ] = False,
    overwrite: Annotated[bool, typer.Option("--overwrite", help="Overwrite existing M4B files.")] = False,
) -> None:
    """Convert every ``*.aax`` file in a download directory into an M4B file.

    Args:
        input_directory: Directory to scan (non-recursively) for AAX files; must exist.
        output_directory: Root directory for converted books and work files.
        dry_run: Resolve and report destinations without running ffmpeg.
        overwrite: Replace destination files that already exist.

    Raises:
        RuntimeError: If the ``OLLAMA_API_KEY`` environment variable is unset or empty.
    """
    configure_logger()

    source_root = input_directory.resolve(strict=True)
    target_root = output_directory.resolve()
    # A dry run does not create the output root here.
    if not dry_run:
        target_root.mkdir(parents=True, exist_ok=True)

    api_key = getenv("OLLAMA_API_KEY")
    if not api_key:
        msg = "OLLAMA_API_KEY is required for audiobook metadata resolution"
        raise RuntimeError(msg)

    config = ConversionConfig(
        resolved_output=target_root,
        ollama_api_key=api_key,
        agent_config=AgentConfig(),
        engine=get_postgres_engine(name="RICHIE"),
        activation_bytes=getenv("AUDIBLE_ACTIVATION_BYTES"),
        dry_run=dry_run,
        overwrite=overwrite,
    )

    pending_files = sorted(source_root.glob("*.aax"))
    if not pending_files:
        logger.info("No AAX files found in %s", source_root)
        return

    # Files are handled one at a time, in sorted path order.
    for aax_file in pending_files:
        logger.info("Converting %s", aax_file)
        convert_aax_file_with_agent(aax_file, config)
def run_command(arguments: list[str], *, capture: bool = False) -> subprocess.CompletedProcess[str]:
    """Run a command and return the completed process.

    Args:
        arguments: Command and arguments to run.
        capture: Whether to capture stdout and stderr.

    Returns:
        The completed process.

    Raises:
        CommandExecutionError: If the command exits with a non-zero status.
            The message contains the redacted command line only.
    """
    logger.debug("%s", " ".join(redact_command_arguments(arguments)))
    try:
        return subprocess.run(arguments, check=True, capture_output=capture, text=True)
    except subprocess.CalledProcessError as error:
        # ``CalledProcessError`` renders the full, unredacted command in its
        # message. Chaining it with ``from error`` would print sensitive
        # values (e.g. activation bytes) in any traceback, so the cause is
        # suppressed and only the redacted error is surfaced.
        raise CommandExecutionError(arguments, error.returncode) from None
def redact_command_arguments(arguments: list[str]) -> list[str]:
    """Return a copy of the arguments with sensitive values masked.

    The value immediately following any flag listed in
    ``SENSITIVE_COMMAND_ARGUMENTS`` is replaced by ``"<redacted>"``; the flag
    itself is kept.
    """
    sanitized: list[str] = []
    mask_current = False
    for argument in arguments:
        sanitized.append("<redacted>" if mask_current else argument)
        # A masked value never triggers masking of the following argument,
        # even if it happens to equal a sensitive flag.
        mask_current = (not mask_current) and argument in SENSITIVE_COMMAND_ARGUMENTS
    return sanitized
def read_metadata(aax_file: Path) -> dict[str, str]:
    """Return the ffprobe format tags of an AAX file.

    Args:
        aax_file: AAX file to inspect.

    Returns:
        Tag names lower-cased and mapped to their values as strings. Empty
        when ffprobe reports no ``format`` or ``tags`` section.
    """
    probe_command = [
        "ffprobe",
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_format",
        str(aax_file),
    ]
    probe = run_command(probe_command, capture=True)
    parsed: dict[str, Any] = json.loads(probe.stdout)
    raw_tags = parsed.get("format", {}).get("tags", {})

    normalized: dict[str, str] = {}
    for tag_name, tag_value in raw_tags.items():
        normalized[str(tag_name).lower()] = str(tag_value)
    return normalized
def output_stem(metadata: StandardBookMetadata) -> str:
    """Compose the file and directory stem for a book.

    Args:
        metadata: Book metadata providing author, series, series index and title.

    Returns:
        ``<author>-<series>_<index>-<title>``, with the index zero-padded to
        at least two digits.
    """
    series_part = f"{metadata.series}_{metadata.series_index:02}"
    return f"{metadata.author}-{series_part}-{metadata.title}"
def metadata_output_path(output_directory: Path, metadata: StandardBookMetadata) -> Path:
    """Return the final M4B path: ``<output>/<stem>/<stem>.m4b``."""
    book_stem = output_stem(metadata)
    book_directory = output_directory / book_stem
    return book_directory / f"{book_stem}.m4b"
def convert_aax_file(
    aax_file: Path,
    destination: Path,
    activation_bytes: str | None,
    *,
    overwrite: bool,
) -> None:
    """Remux an AAX file into an M4B container with ffmpeg.

    Streams are copied (``-c copy``) and the source metadata is mapped onto
    the output. Nothing is done when the destination already exists and
    ``overwrite`` is false.

    Args:
        aax_file: Source AAX file.
        destination: Destination M4B file; parent directories are created.
        activation_bytes: Optional Audible activation bytes for ffmpeg.
        overwrite: Whether to overwrite an existing M4B.
    """
    if destination.exists() and not overwrite:
        logger.info("Skipping existing file %s", destination)
        return

    destination.parent.mkdir(parents=True, exist_ok=True)

    overwrite_flag = "-y" if overwrite else "-n"
    # The decryption flag is only passed when activation bytes were supplied.
    decryption_arguments = ["-activation_bytes", activation_bytes] if activation_bytes else []
    command = [
        "ffmpeg",
        "-hide_banner",
        overwrite_flag,
        *decryption_arguments,
        "-i",
        str(aax_file),
        "-map_metadata",
        "0",
        "-c",
        "copy",
        str(destination),
    ]
    run_command(command)
def write_review_file(
    *,
    destination: Path | None,
    ffprobe_metadata: dict[str, str],
    log_file: Path,
    metadata: StandardBookMetadata | None,
    reason: str,
    review_file: Path,
    source: Path,
    temp_file: Path | None,
) -> None:
    """Record an unresolved conversion as a JSON file for manual review.

    The JSON document is written with sorted keys and two-space indentation,
    and a ``review_written`` event is appended to the agent log.

    Args:
        destination: Planned final path, if one was determined.
        ffprobe_metadata: Tags read from the source file.
        log_file: Agent log that receives the ``review_written`` event.
        metadata: Resolved metadata, if any.
        reason: Why the conversion needs review.
        review_file: Path of the JSON file to write; parents are created.
        source: Source AAX file.
        temp_file: Converted temporary file, if one is available.
    """
    review_file.parent.mkdir(parents=True, exist_ok=True)

    destination_text = str(destination) if destination else None
    metadata_fields = asdict(metadata) if metadata else None
    temp_file_text = str(temp_file) if temp_file else None

    document = json.dumps(
        {
            "destination": destination_text,
            "ffprobe_metadata": ffprobe_metadata,
            "metadata": metadata_fields,
            "reason": reason,
            "source": str(source),
            "temp_file": temp_file_text,
        },
        indent=2,
        sort_keys=True,
    )
    review_file.write_text(document, encoding="utf-8")
    write_agent_log(log_file, "review_written", path=str(review_file), reason=reason)
def cleanup_temp_output(temp_file: Path) -> None:
    """Delete the directory containing ``temp_file``, ignoring any errors.

    The whole parent directory is removed, not just the file, so the path
    must live in a directory dedicated to a single run.
    """
    run_directory = temp_file.parent
    shutil.rmtree(run_directory, ignore_errors=True)
def dry_run_aax_file_with_agent(
    aax_file: Path,
    ffprobe_metadata: dict[str, str],
    engine: Engine,
    config: ConversionConfig,
    log_file: Path,
    review_file: Path,
) -> None:
    """Resolve metadata and report the planned destination without converting.

    When the metadata needs review, a review file is written and reported.
    Otherwise a marker file holding the planned destination path is written
    under the dry-run work directory and the mapping is echoed.

    Args:
        aax_file: Source AAX file.
        ffprobe_metadata: Tags already read from the source file.
        engine: Database engine passed to the metadata agent.
        config: Conversion settings.
        log_file: Agent log for this run.
        review_file: Review file to write when metadata needs review.
    """
    metadata = standard_book_metadata(
        aax_file.name,
        ffprobe_metadata,
        engine,
        log_file,
        config.ollama_api_key,
        config.agent_config,
    )

    if metadata.needs_review:
        write_review_file(
            destination=None,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=metadata,
            reason="metadata_needs_review",
            review_file=review_file,
            source=aax_file,
            temp_file=None,
        )
        typer.echo(f"{aax_file} -> REVIEW {review_file}")
        return

    destination = metadata_output_path(config.resolved_output, metadata)
    stem = output_stem(metadata)
    dry_run_root = config.resolved_output / config.work_directory_name / config.dry_run_directory_name
    marker_file = dry_run_root / stem / f"{stem}.m4b"
    marker_file.parent.mkdir(parents=True, exist_ok=True)
    # The marker is a text file containing the planned path, not audio.
    marker_file.write_text(f"{destination}\n", encoding="utf-8")
    write_agent_log(
        log_file,
        "dry_run_file_written",
        destination=str(destination),
        path=str(marker_file),
    )
    typer.echo(f"{aax_file} -> {destination}")
def convert_temp_file_and_resolve_metadata(
    aax_file: Path,
    temp_file: Path,
    ffprobe_metadata: dict[str, str],
    config: ConversionConfig,
    log_file: Path,
) -> ConcurrentConversionResult:
    """Run the ffmpeg conversion and metadata resolution on two worker threads.

    Exceptions raised by either task are captured and returned rather than
    propagated, so one task failing does not hide the other's outcome.

    Args:
        aax_file: Source AAX file.
        temp_file: Temporary M4B path that ffmpeg writes to (always overwritten).
        ffprobe_metadata: Tags already read from the source file.
        config: Conversion settings.
        log_file: Agent log for this run.

    Returns:
        The resolved metadata (None if resolution failed) plus any captured errors.
    """
    with ThreadPoolExecutor(max_workers=2) as pool:
        ffmpeg_job = pool.submit(
            convert_aax_file,
            aax_file,
            temp_file,
            config.activation_bytes,
            overwrite=True,
        )
        agent_job = pool.submit(
            standard_book_metadata,
            aax_file.name,
            ffprobe_metadata,
            config.engine,
            log_file,
            config.ollama_api_key,
            config.agent_config,
        )

        # ``exception()`` blocks until the corresponding task has finished.
        ffmpeg_failure = ffmpeg_job.exception()
        agent_failure = agent_job.exception()
        resolved = agent_job.result() if agent_failure is None else None

    return ConcurrentConversionResult(
        metadata=resolved,
        conversion_error=ffmpeg_failure,
        metadata_error=agent_failure,
    )
def convert_aax_file_with_agent(aax_file: Path, config: ConversionConfig) -> None:
    """Convert one AAX file using the metadata agent for the final path.

    Every failure path writes a review file and returns instead of raising,
    so one bad file does not stop the surrounding batch loop.

    Args:
        aax_file: Source AAX file.
        config: Conversion settings for this command.
    """
    # One id per run ties together the log, review file and temp directory.
    run_id = uuid7().hex
    log_file = config.resolved_output / config.work_directory_name / config.log_directory_name / f"{run_id}.jsonl"
    review_file = config.resolved_output / config.work_directory_name / config.review_directory_name / f"{run_id}.json"
    write_agent_log(log_file, "conversion_start", source=str(aax_file), dry_run=config.dry_run)

    try:
        ffprobe_metadata = read_metadata(aax_file)
    except Exception as error:
        logger.exception("ffprobe failed")
        write_review_file(
            destination=None,
            ffprobe_metadata={},
            log_file=log_file,
            metadata=None,
            reason=f"ffprobe_failed: {error}",
            review_file=review_file,
            source=aax_file,
            temp_file=None,
        )
        return

    if config.dry_run:
        dry_run_aax_file_with_agent(
            aax_file,
            ffprobe_metadata,
            config.engine,
            config,
            log_file,
            review_file,
        )
        return

    # Convert into a per-run temp directory; the final name is only known
    # once metadata resolution has finished.
    temp_file = (
        config.resolved_output / config.work_directory_name / config.temp_directory_name / run_id / "converted.m4b"
    )
    temp_file.parent.mkdir(parents=True, exist_ok=True)
    result = convert_temp_file_and_resolve_metadata(aax_file, temp_file, ffprobe_metadata, config, log_file)

    # In the failure branches below the temp output is left on disk and its
    # path is recorded in the review file.
    if result.conversion_error:
        reason = f"ffmpeg_failed: {result.conversion_error}"
        write_review_file(
            destination=None,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=result.metadata,
            reason=reason,
            review_file=review_file,
            source=aax_file,
            # ffmpeg may have failed before creating any output.
            temp_file=temp_file if temp_file.exists() else None,
        )
        return

    if result.metadata_error:
        write_review_file(
            destination=None,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=None,
            reason=f"metadata_failed: {result.metadata_error}",
            review_file=review_file,
            source=aax_file,
            temp_file=temp_file,
        )
        return

    if result.metadata is None or result.metadata.needs_review:
        write_review_file(
            destination=None,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=result.metadata,
            reason="metadata_needs_review",
            review_file=review_file,
            source=aax_file,
            temp_file=temp_file,
        )
        return

    destination = metadata_output_path(config.resolved_output, result.metadata)
    if destination.exists() and not config.overwrite:
        # The existing file is kept and the fresh conversion is discarded.
        write_agent_log(log_file, "destination_exists", destination=str(destination))
        cleanup_temp_output(temp_file)
        return

    destination.parent.mkdir(parents=True, exist_ok=True)
    try:
        temp_file.replace(destination)
    except Exception as error:  # noqa: BLE001
        write_review_file(
            destination=destination,
            ffprobe_metadata=ffprobe_metadata,
            log_file=log_file,
            metadata=result.metadata,
            reason=f"rename_failed: {error}",
            review_file=review_file,
            source=aax_file,
            temp_file=temp_file if temp_file.exists() else None,
        )
    else:
        # Only reached when the move succeeded; removes the emptied run directory.
        cleanup_temp_output(temp_file)
        write_agent_log(log_file, "conversion_complete", destination=str(destination))
if __name__ == "__main__":
typer.run(main)
-176
View File
@@ -1,176 +0,0 @@
"""Import audiobook catalog authors and series from CSV files."""
from __future__ import annotations
import csv
import logging
from pathlib import Path # noqa: TC003 This is required for the typer CLI
from typing import Annotated
import typer
from sqlalchemy import select
from sqlalchemy.orm import Session
from python.common import configure_logger
from python.orm.common import get_postgres_engine
from python.orm.richie import AudiobookAuthor, AudiobookSeries
logger = logging.getLogger(__name__)
AUTHOR_NAME_COLUMN = "author_name"
ID_COLUMN = "id"
NAME_COLUMN = "name"
class CatalogImportError(ValueError):
    """CSV catalog import failed validation.

    Raised by the CSV helpers in this module with a message prefixed by the
    CSV path (and row number where one applies). ``main`` catches it, prints
    the message to stderr and exits with status 1.
    """
def main(
    authors_csv: Annotated[Path, typer.Argument(help="CSV with name and optional id.")],
    series_csv: Annotated[Path, typer.Argument(help="CSV with name, author_name, and optional id.")],
) -> None:
    """Upsert audiobook authors, then series, from two CSV files.

    Both files are imported inside a single session and committed together,
    so a validation failure in either file leaves nothing committed.

    Raises:
        typer.Exit: With code 1 when a CSV fails validation; the message is
            printed to stderr first.
    """
    configure_logger()

    try:
        with Session(get_postgres_engine(name="RICHIE")) as session:
            # Authors go first so series rows can look up their author by name.
            authors_upserted = upsert_authors_from_csv(session, authors_csv)
            series_upserted = upsert_series_from_csv(session, series_csv)
            session.commit()
    except CatalogImportError as error:
        typer.echo(str(error), err=True)
        raise typer.Exit(code=1) from error

    logger.info("Upserted %s authors and %s series", authors_upserted, series_upserted)
def upsert_authors_from_csv(session: Session, authors_csv: Path) -> int:
    """Upsert one author per CSV row and return the number of rows processed.

    Raises:
        CatalogImportError: If a row lacks a name or has a non-integer id.
    """
    rows = csv_rows(authors_csv)
    for row_number, row in rows:
        author_name = required_csv_value(row, authors_csv, row_number, NAME_COLUMN)
        upsert_author(session, author_name, csv_id(row, authors_csv, row_number))
    return len(rows)
def upsert_series_from_csv(session: Session, series_csv: Path) -> int:
    """Upsert one series per CSV row and return the number of rows processed.

    Each row names its author; that author must already exist in the session
    or database.

    Raises:
        CatalogImportError: If a required value is missing, an id is not an
            integer, or the named author cannot be found.
    """
    rows = csv_rows(series_csv)
    for row_number, row in rows:
        series_name = required_csv_value(row, series_csv, row_number, NAME_COLUMN)
        author_name = required_csv_value(row, series_csv, row_number, AUTHOR_NAME_COLUMN)

        owner = find_author_by_name(session, author_name)
        if owner is None:
            msg = f"{series_csv}:{row_number}: author not found: {author_name}"
            raise CatalogImportError(msg)

        upsert_series(session, series_name, owner, csv_id(row, series_csv, row_number))
    return len(rows)
def upsert_author(session: Session, name: str, author_id: int | None) -> AudiobookAuthor:
    """Upsert one author by id or exact name.

    Args:
        session: Open session; the caller is responsible for committing.
        name: Author name to store.
        author_id: Explicit primary key, or None to match on ``name``.

    Returns:
        The existing or newly added author.
    """
    if author_id is not None:
        # An explicit id wins: create the row with that id, or rename the existing one.
        author = session.get(AudiobookAuthor, author_id)
        if author is None:
            author = AudiobookAuthor(id=author_id, name=name)
            session.add(author)
        else:
            author.name = name
        session.flush()
        return author

    # No id supplied: reuse an author with exactly this name, else insert one.
    author = find_author_by_name(session, name)
    if author is None:
        author = AudiobookAuthor(name=name)
        session.add(author)
        session.flush()
    return author
def upsert_series(
    session: Session,
    name: str,
    author: AudiobookAuthor,
    series_id: int | None,
) -> AudiobookSeries:
    """Upsert one series by id or exact author/name match.

    Args:
        session: Open session; the caller is responsible for committing.
        name: Series name to store.
        author: Author that owns the series.
        series_id: Explicit primary key, or None to match on author and name.

    Returns:
        The existing or newly added series.
    """
    if series_id is not None:
        # An explicit id wins: create the row with that id, or update name and author.
        series = session.get(AudiobookSeries, series_id)
        if series is None:
            series = AudiobookSeries(id=series_id, name=name, author=author)
            session.add(series)
        else:
            series.name = name
            series.author = author
        session.flush()
        return series

    # No id supplied: reuse this author's series with exactly this name, else insert one.
    series = find_series_by_name_and_author(session, name, author.id)
    if series is None:
        series = AudiobookSeries(name=name, author=author)
        session.add(series)
        session.flush()
    return series
def find_author_by_name(session: Session, name: str) -> AudiobookAuthor | None:
    """Return the author whose name equals ``name`` exactly, or None."""
    by_name = select(AudiobookAuthor).where(AudiobookAuthor.name == name)
    return session.scalar(by_name)
def find_series_by_name_and_author(
    session: Session,
    name: str,
    author_id: int,
) -> AudiobookSeries | None:
    """Look up a series by exact name within one author's catalog.

    Returns:
        The matching series, or ``None`` when the query yields no row.
    """
    same_name = AudiobookSeries.name == name
    same_author = AudiobookSeries.author_id == author_id
    lookup = select(AudiobookSeries).where(same_name, same_author)
    return session.scalar(lookup)
def csv_rows(csv_path: Path) -> list[tuple[int, dict[str, str | None]]]:
    """Read a CSV file as numbered rows.

    Args:
        csv_path: File to read; the first row is used as the header.

    Returns:
        ``(row_number, row)`` pairs where ``row`` maps header names to cell
        values. Numbering starts at 2, i.e. the first row after the header.

    Raises:
        CatalogImportError: If the file has no header row.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark; with plain "utf-8" the BOM would be glued onto the first header
    # name and that column would silently stop matching.
    with csv_path.open(newline="", encoding="utf-8-sig") as file:
        reader = csv.DictReader(file)
        if reader.fieldnames is None:
            msg = f"{csv_path}: missing CSV header"
            raise CatalogImportError(msg)
        return list(enumerate(reader, start=2))
def required_csv_value(
    row: dict[str, str | None],
    csv_path: Path,
    row_number: int,
    column: str,
) -> str:
    """Return the whitespace-trimmed value of a mandatory CSV column.

    Args:
        row: Parsed CSV row.
        csv_path: Source file, used only in the error message.
        row_number: Row position, used only in the error message.
        column: Name of the column to read.

    Raises:
        CatalogImportError: If the column is absent, empty, or only whitespace.
    """
    raw = row.get(column)
    trimmed = raw.strip() if raw else ""
    if not trimmed:
        msg = f"{csv_path}:{row_number}: missing required column value: {column}"
        raise CatalogImportError(msg)
    return trimmed
def csv_id(row: dict[str, str | None], csv_path: Path, row_number: int) -> int | None:
    """Read the optional id field from a CSV row.

    Args:
        row: Parsed CSV row.
        csv_path: Source file, used only in the error message.
        row_number: Row position, used only in the error message.

    Returns:
        The id as an integer, or ``None`` when the field is absent or blank.

    Raises:
        CatalogImportError: If the field holds something that is not an integer.
    """
    value = row.get(ID_COLUMN)
    if value is None or not value.strip():
        return None
    try:
        # int() tolerates surrounding whitespace, so no explicit strip is needed.
        return int(value)
    except ValueError as error:
        msg = f"{csv_path}:{row_number}: id must be an integer: {value}"
        raise CatalogImportError(msg) from error
# Script entry point: let Typer build the command line around main().
if __name__ == "__main__":
    typer.run(main)
-565
View File
@@ -1,565 +0,0 @@
"""LLM tool calling support for audiobook metadata resolution."""
from __future__ import annotations
import json
import re
import time
from collections.abc import Callable
from dataclasses import dataclass
from typing import TYPE_CHECKING
from sqlalchemy import or_, select
from python.orm.richie import Audiobook, AudiobookAuthor, AudiobookSeries
if TYPE_CHECKING:
from pathlib import Path
from sqlalchemy.orm import Session
from python.tools.audiobook.metadata_agent import AgentConfig
# Author/series slug: runs of lower-case ASCII letters or digits joined by single underscores.
CATALOG_SLUG_PATTERN = re.compile(r"^[a-z0-9]+(?:_[a-z0-9]+)*$")
# Book-title slug: runs of lower-case ASCII letters or digits joined by single hyphens.
TITLE_SLUG_PATTERN = re.compile(r"^[a-z0-9]+(?:-[a-z0-9]+)*$")
# Audit-log writer; in this module it is called as write_log(log_path, event, **fields).
LogWriter = Callable[..., None]
class MetadataResolutionError(ValueError):
    """Raised when tool arguments or model-supplied metadata fail validation.

    Subclasses ``ValueError`` so generic value-error handlers still catch it.
    """
@dataclass(frozen=True)
class EnsuredBook:
    """Result of an ensure-book call: the row and how it was obtained."""

    # Catalog row that was found or inserted.
    book: Audiobook
    # "created" when the row was inserted by the call, "existing" otherwise.
    action: str
class CatalogToolRegistry:
    """Catalog search/create tools that the metadata model is allowed to call.

    The registry is bound to one database session. It remembers every author,
    series, and book id it has handed back ("seen") and every row it inserted
    itself ("created"), so callers can validate model output against the seen
    ids and prune rows that ended up unused.
    """

    def __init__(
        self,
        session: Session,
        log_path: Path,
        config: AgentConfig,
        write_log: LogWriter,
    ) -> None:
        """Bind the registry to a session, an audit log, and agent settings.

        Args:
            session: Database session used for all lookups and inserts.
            log_path: Destination passed to ``write_log`` on every call.
            config: Agent settings (enabled tools, result limit, standalone name).
            write_log: Callable that records one audit event.
        """
        self.session = session
        self.log_path = log_path
        self.config = config
        self.write_log = write_log
        # Ids returned to the model by any tool during this run.
        self.seen_author_ids: set[int] = set()
        self.seen_series_ids: set[int] = set()
        self.seen_book_ids: set[int] = set()
        # Ids of rows inserted by the ensure_* tools during this run.
        self.created_author_ids: set[int] = set()
        self.created_series_ids: set[int] = set()
        self.created_book_ids: set[int] = set()

    @staticmethod
    def _function_schema(
        name: str,
        description: str,
        properties: dict[str, object],
        required: list[str],
    ) -> dict[str, object]:
        """Wrap one tool definition in the function-tool envelope."""
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            },
        }

    def tool_schemas(self) -> list[dict[str, object]]:
        """Return the Ollama tool schemas for the tools enabled in the config."""
        define = self._function_schema
        catalog = [
            define(
                "search_authors",
                "Search canonical audiobook authors by slug or noisy source text.",
                {"query": {"type": "string"}},
                ["query"],
            ),
            define(
                "search_series",
                "Search canonical audiobook series by slug or noisy source text.",
                {
                    "query": {"type": "string"},
                    "author_id": {"type": ["integer", "null"]},
                },
                ["query"],
            ),
            define(
                "search_books",
                "Search canonical audiobook titles with optional author and series filters.",
                {
                    "query": {"type": "string"},
                    "author_id": {"type": ["integer", "null"]},
                    "series_id": {"type": ["integer", "null"]},
                },
                ["query"],
            ),
            define(
                "ensure_author",
                "Normalize an author name to a catalog slug, then return or create that author.",
                {"name": {"type": "string"}},
                ["name"],
            ),
            define(
                "ensure_series",
                "Normalize a series name to a catalog slug, then return or create it for an author.",
                {
                    "name": {"type": "string"},
                    "author_id": {"type": "integer"},
                },
                ["name", "author_id"],
            ),
            define(
                "ensure_book",
                "Normalize a title to a book slug, then return or create it for an author/series.",
                {
                    "title": {"type": "string"},
                    "author_id": {"type": "integer"},
                    "series_id": {"type": ["integer", "null"]},
                    "series_index": {"type": "integer"},
                },
                ["title", "author_id", "series_id", "series_index"],
            ),
        ]
        allowed = set(self.config.tool_names)
        return [entry for entry in catalog if entry["function"]["name"] in allowed]

    def run(self, name: str, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Dispatch one tool call by name and audit it.

        Args:
            name: Tool name requested by the model.
            arguments: Already-parsed tool arguments.

        Returns:
            The tool's result rows.

        Raises:
            MetadataResolutionError: If the tool is unknown, not enabled, or
                the handler rejects its arguments.
        """
        dispatch = {
            "search_authors": self.run_search_authors,
            "search_series": self.run_search_series,
            "search_books": self.run_search_books,
            "ensure_author": self.run_ensure_author,
            "ensure_series": self.run_ensure_series,
            "ensure_book": self.run_ensure_book,
        }
        rejection: tuple[str, str] | None = None
        if name not in dispatch:
            rejection = ("unknown_tool", f"Unknown audiobook metadata tool: {name}")
        elif name not in self.config.tool_names:
            rejection = ("tool_not_enabled", f"Audiobook metadata tool is not enabled: {name}")
        if rejection is not None:
            code, msg = rejection
            self.write_log(self.log_path, "tool_error", tool=name, arguments=arguments, error=code)
            raise MetadataResolutionError(msg)

        # The timer starts before the "tool_call" entry is written, so the
        # reported duration includes that log write.
        clock_start = time.perf_counter()
        self.write_log(self.log_path, "tool_call", tool=name, arguments=arguments)
        rows = dispatch[name](arguments)
        elapsed_ms = round((time.perf_counter() - clock_start) * 1000, 3)
        self.write_log(
            self.log_path,
            "tool_result",
            tool=name,
            duration_ms=elapsed_ms,
            result_count=len(rows),
            preview=rows[:3],
        )
        return rows

    def get_author(self, author_id: int) -> AudiobookAuthor | None:
        """Fetch an author by primary key, or ``None`` when absent."""
        return self.session.get(AudiobookAuthor, author_id)

    def get_book(self, book_id: int) -> Audiobook | None:
        """Fetch a book by primary key, or ``None`` when absent."""
        return self.session.get(Audiobook, book_id)

    def get_series(self, series_id: int) -> AudiobookSeries | None:
        """Fetch a series by primary key, or ``None`` when absent."""
        return self.session.get(AudiobookSeries, series_id)

    def prune_unused_created_rows(self, *, author_id: int, book_id: int | None, series_id: int | None) -> None:
        """Delete rows inserted during this run that the final metadata does not use.

        Books go first, then series without books, then authors without books
        or series. The ids passed in are the ones to keep.
        """
        # NOTE(review): the emptiness checks read ORM relationship collections
        # right after deletes were flushed; confirm those collections are
        # refreshed at that point, otherwise a just-emptied parent may survive.
        db = self.session

        kept_books = set() if book_id is None else {book_id}
        for stale_book_id in self.created_book_ids - kept_books:
            stale_book = self.get_book(stale_book_id)
            if stale_book:
                db.delete(stale_book)
        db.flush()

        kept_series = set() if series_id is None else {series_id}
        for stale_series_id in self.created_series_ids - kept_series:
            stale_series = self.get_series(stale_series_id)
            if stale_series and not stale_series.books:
                db.delete(stale_series)
        db.flush()

        for stale_author_id in self.created_author_ids - {author_id}:
            stale_author = self.get_author(stale_author_id)
            if stale_author and not stale_author.books and not stale_author.series:
                db.delete(stale_author)

    def run_search_authors(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Search authors by name and remember the ids that were returned."""
        query = required_string(arguments, "query")
        statement = select(AudiobookAuthor).order_by(AudiobookAuthor.name).limit(self.config.max_tool_results)
        terms = query_terms(query)
        if terms:
            name_matches = [AudiobookAuthor.name.ilike(f"%{term}%") for term in terms]
            statement = statement.where(or_(*name_matches))
        found = self.session.scalars(statement).all()
        self.seen_author_ids.update(row.id for row in found)
        return [{"id": row.id, "name": row.name} for row in found]

    def run_search_series(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Search series by name, optionally per author, and remember returned ids."""
        query = required_string(arguments, "query")
        author_id = optional_int(arguments.get("author_id"), "author_id")
        statement = select(AudiobookSeries).order_by(AudiobookSeries.name).limit(self.config.max_tool_results)
        terms = query_terms(query)
        if terms:
            name_matches = [AudiobookSeries.name.ilike(f"%{term}%") for term in terms]
            statement = statement.where(or_(*name_matches))
        if author_id is not None:
            statement = statement.where(AudiobookSeries.author_id == author_id)
        found = self.session.scalars(statement).all()
        self.seen_series_ids.update(row.id for row in found)
        self.seen_author_ids.update(row.author_id for row in found)
        return [self._series_fields(row) for row in found]

    def run_search_books(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Search books by title with optional filters and remember returned ids."""
        query = required_string(arguments, "query")
        author_id = optional_int(arguments.get("author_id"), "author_id")
        series_id = optional_int(arguments.get("series_id"), "series_id")
        statement = select(Audiobook).order_by(Audiobook.title).limit(self.config.max_tool_results)
        terms = query_terms(query)
        if terms:
            title_matches = [Audiobook.title.ilike(f"%{term}%") for term in terms]
            statement = statement.where(or_(*title_matches))
        if author_id is not None:
            statement = statement.where(Audiobook.author_id == author_id)
        if series_id is not None:
            statement = statement.where(Audiobook.series_id == series_id)
        found = self.session.scalars(statement).all()
        self.seen_book_ids.update(row.id for row in found)
        self.seen_author_ids.update(row.author_id for row in found)
        self.seen_series_ids.update(row.series_id for row in found if row.series_id is not None)
        return [self._book_fields(row) for row in found]

    def run_ensure_author(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Return the author with the normalized name, inserting it if needed."""
        slug = normalize_catalog_slug(required_string(arguments, "name"))
        validate_catalog_slug(slug, "author")
        author = self.session.scalar(select(AudiobookAuthor).where(AudiobookAuthor.name == slug))
        if author is None:
            author = AudiobookAuthor(name=slug)
            self.session.add(author)
            # Flush so the new row has an id to record and return.
            self.session.flush()
            self.created_author_ids.add(author.id)
            outcome = "created"
        else:
            outcome = "existing"
        self.seen_author_ids.add(author.id)
        return [{"id": author.id, "name": author.name, "action": outcome}]

    def run_ensure_series(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Return the author's series with the normalized name, inserting it if needed."""
        slug = normalize_catalog_slug(required_string(arguments, "name"))
        author_id = required_int(arguments, "author_id")
        validate_catalog_slug(slug, "series")
        author = self.required_author(author_id)
        lookup = select(AudiobookSeries).where(
            AudiobookSeries.name == slug,
            AudiobookSeries.author_id == author.id,
        )
        series = self.session.scalar(lookup)
        if series is None:
            series = AudiobookSeries(name=slug, author=author)
            self.session.add(series)
            self.session.flush()
            self.created_series_ids.add(series.id)
            outcome = "created"
        else:
            outcome = "existing"
        self.seen_author_ids.add(author.id)
        self.seen_series_ids.add(series.id)
        return [self.series_result(series, outcome)]

    def run_ensure_book(self, arguments: dict[str, object]) -> list[dict[str, object]]:
        """Validate ensure_book arguments and return the ensured book as a tool result."""
        title = required_string(arguments, "title")
        author_id = required_int(arguments, "author_id")
        series_id = optional_int(arguments.get("series_id"), "series_id")
        series_index = required_int(arguments, "series_index")
        ensured = self.ensure_book(title, author_id, series_id, series_index)
        return [self.book_result(ensured.book, ensured.action)]

    def ensure_book(
        self,
        title: str,
        author_id: int,
        series_id: int | None,
        series_index: int,
    ) -> EnsuredBook:
        """Return the matching book row, inserting it after validating ownership.

        Standalone books (``series_id`` is ``None``) must use index 0; series
        books need a series owned by the author and a positive index. A book
        matches on title slug, author, and series.

        Raises:
            MetadataResolutionError: If the title slug, author, series, or
                index fails validation.
        """
        slug = normalize_title_slug(title)
        validate_title_slug(slug)
        author = self.required_author(author_id)

        series = None
        if series_id is not None:
            series = self.required_series(series_id)
            if series.author_id != author.id:
                msg = f"series_id {series_id} does not belong to author_id {author_id}"
                raise MetadataResolutionError(msg)
            if series_index <= 0:
                msg = "series books must use a positive series_index"
                raise MetadataResolutionError(msg)
        elif series_index != 0:
            msg = "standalone books must use series_index 0"
            raise MetadataResolutionError(msg)

        lookup = select(Audiobook).where(
            Audiobook.title == slug,
            Audiobook.author_id == author.id,
        )
        series_filter = Audiobook.series_id.is_(None) if series is None else Audiobook.series_id == series.id
        lookup = lookup.where(series_filter)

        book = self.session.scalar(lookup)
        outcome = "existing"
        if book is None:
            book = Audiobook(title=slug, author=author, series=series, series_index=series_index)
            self.session.add(book)
            self.session.flush()
            self.created_book_ids.add(book.id)
            outcome = "created"

        self.seen_book_ids.add(book.id)
        self.seen_author_ids.add(author.id)
        if book.series_id is not None:
            self.seen_series_ids.add(book.series_id)
        return EnsuredBook(book=book, action=outcome)

    def required_author(self, author_id: int) -> AudiobookAuthor:
        """Fetch an author by id, raising MetadataResolutionError when missing."""
        author = self.get_author(author_id)
        if author is not None:
            return author
        msg = f"author_id {author_id} does not exist"
        raise MetadataResolutionError(msg)

    def required_series(self, series_id: int) -> AudiobookSeries:
        """Fetch a series by id, raising MetadataResolutionError when missing."""
        series = self.get_series(series_id)
        if series is not None:
            return series
        msg = f"series_id {series_id} does not exist"
        raise MetadataResolutionError(msg)

    def _series_fields(self, series: AudiobookSeries) -> dict[str, object]:
        """Describe a series row with the fields shared by search and ensure results."""
        return {
            "id": series.id,
            "name": series.name,
            "author_id": series.author_id,
            "author": series.author.name,
        }

    def _book_fields(self, book: Audiobook) -> dict[str, object]:
        """Describe a book row with the fields shared by search and ensure results."""
        # Books without a series are reported under the configured standalone name.
        series_name = book.series.name if book.series else self.config.standalone_series
        return {
            "id": book.id,
            "title": book.title,
            "author_id": book.author_id,
            "author": book.author.name,
            "series_id": book.series_id,
            "series": series_name,
            "series_index": book.series_index,
        }

    def series_result(self, series: AudiobookSeries, action: str) -> dict[str, object]:
        """Build the tool result for an ensured series, tagged with ``action``."""
        return {**self._series_fields(series), "action": action}

    def book_result(self, book: Audiobook, action: str) -> dict[str, object]:
        """Build the tool result for an ensured book, tagged with ``action``."""
        return {**self._book_fields(book), "action": action}
def run_tool_calls(
    messages: list[dict[str, object]],
    message: dict[str, object],
    tool_calls: list[tuple[str, dict[str, object]]],
    registry: CatalogToolRegistry,
    log_path: Path,
    write_log: LogWriter,
) -> str | None:
    """Execute the requested tools and append their replies to the transcript.

    The assistant ``message`` is appended first, followed by one ``tool``
    message per call. A recoverable tool error is logged and reported to the
    model as ``{"error": ...}``; a fatal one stops processing immediately.

    Args:
        messages: Conversation so far; mutated in place.
        message: Assistant message that requested the tool calls.
        tool_calls: ``(tool_name, arguments)`` pairs in call order.
        registry: Registry that executes the tools.
        log_path: Audit log destination for recoverable errors.
        write_log: Callable that records one audit event.

    Returns:
        The fatal error text when processing stopped early, otherwise ``None``.
    """
    messages.append(message)
    for tool_name, arguments in tool_calls:
        try:
            payload: object = registry.run(tool_name, arguments)
        except MetadataResolutionError as error:
            if is_fatal_tool_error(error):
                return str(error)
            write_log(log_path, "tool_error", tool=tool_name, arguments=arguments, error=str(error))
            payload = {"error": str(error)}
        reply = {
            "role": "tool",
            "tool_name": tool_name,
            "content": json.dumps(payload, sort_keys=True),
        }
        messages.append(reply)
    return None
def parse_tool_calls(message: dict[str, object]) -> list[tuple[str, dict[str, object]]]:
    """Extract ``(tool_name, arguments)`` pairs from an Ollama response message.

    A missing or falsy ``tool_calls`` entry yields an empty list.

    Raises:
        MetadataResolutionError: If ``tool_calls`` or one of its entries does
            not have the expected shape.
    """
    raw_calls = message.get("tool_calls") or []
    if not isinstance(raw_calls, list):
        msg = "tool_calls must be a list"
        raise MetadataResolutionError(msg)

    parsed: list[tuple[str, dict[str, object]]] = []
    for entry in raw_calls:
        if not isinstance(entry, dict):
            msg = "tool call must be an object"
            raise MetadataResolutionError(msg)
        spec = entry.get("function")
        if not isinstance(spec, dict):
            msg = "tool call is missing function"
            raise MetadataResolutionError(msg)
        tool_name = spec.get("name")
        if not (isinstance(tool_name, str) and tool_name):
            msg = "tool call is missing function name"
            raise MetadataResolutionError(msg)
        tool_arguments = parse_tool_arguments(spec.get("arguments", {}))
        parsed.append((tool_name, tool_arguments))
    return parsed
def parse_tool_arguments(raw_arguments: object) -> dict[str, object]:
    """Normalize tool-call arguments into a dict with string keys.

    Accepts either a dict or a JSON string; an empty string counts as ``{}``.

    Raises:
        MetadataResolutionError: If the arguments are not a JSON object.
        json.JSONDecodeError: If a string argument is not valid JSON.
    """
    candidate = raw_arguments
    if isinstance(candidate, str):
        candidate = json.loads(candidate) if candidate else {}
    if isinstance(candidate, dict):
        return {str(key): value for key, value in candidate.items()}
    msg = "tool arguments must be an object"
    raise MetadataResolutionError(msg)
def validate_title_slug(title: str) -> None:
    """Require ``title`` to be a canonical book-title slug.

    Raises:
        MetadataResolutionError: If ``title`` does not fully match
            ``TITLE_SLUG_PATTERN``.
    """
    if TITLE_SLUG_PATTERN.fullmatch(title) is None:
        msg = f"title slug is invalid: {title}"
        raise MetadataResolutionError(msg)
def validate_catalog_slug(value: str, label: str) -> None:
    """Require ``value`` to be a canonical catalog slug.

    Args:
        value: Slug to check.
        label: Kind of slug, used as the prefix of the error message.

    Raises:
        MetadataResolutionError: If ``value`` does not fully match
            ``CATALOG_SLUG_PATTERN``.
    """
    if CATALOG_SLUG_PATTERN.fullmatch(value) is None:
        msg = f"{label} slug is invalid: {value}"
        raise MetadataResolutionError(msg)
def normalize_catalog_slug(value: str) -> str:
    """Turn a noisy catalog name into a lower snake-case slug.

    The text is trimmed and case-folded, every run of characters outside
    ``a-z0-9`` becomes one underscore, and edge underscores are removed.
    """
    folded = value.strip().casefold()
    joined = re.sub(r"[^a-z0-9]+", "_", folded)
    return joined.strip("_")
def normalize_title_slug(value: str) -> str:
    """Turn a noisy book title into a lower kebab-case slug.

    The text is trimmed and case-folded, every run of characters outside
    ``a-z0-9`` becomes one hyphen, and edge hyphens are removed.
    """
    folded = value.strip().casefold()
    joined = re.sub(r"[^a-z0-9]+", "-", folded)
    return joined.strip("-")
def is_fatal_tool_error(error: MetadataResolutionError) -> bool:
    """Tell whether a tool error must stop the agent immediately.

    An error is fatal when its message starts with the text used for an
    unknown tool or for a tool that is not enabled.
    """
    fatal_prefixes = (
        "Unknown audiobook metadata tool",
        "Audiobook metadata tool is not enabled",
    )
    text = str(error)
    return any(text.startswith(prefix) for prefix in fatal_prefixes)
def query_terms(query: str) -> tuple[str, ...]:
    """Return the distinct search variants of a query, in a fixed order.

    The variants are the trimmed, case-folded text, its underscore slug, and
    its hyphen slug; empty strings and repeats are dropped.
    """
    folded = query.strip().casefold()
    variants = (folded, normalize_catalog_slug(folded), normalize_title_slug(folded))
    distinct: list[str] = []
    for variant in variants:
        if variant and variant not in distinct:
            distinct.append(variant)
    return tuple(distinct)
def required_string(data: dict[str, object], key: str) -> str:
    """Return the trimmed string stored under ``key``.

    Raises:
        MetadataResolutionError: If the value is missing, not a string, or
            blank after trimming.
    """
    candidate = data.get(key)
    trimmed = candidate.strip() if isinstance(candidate, str) else ""
    if trimmed:
        return trimmed
    msg = f"{key} must be a non-empty string"
    raise MetadataResolutionError(msg)
def required_int(data: dict[str, object], key: str) -> int:
    """Return the integer stored under ``key``.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.

    Raises:
        MetadataResolutionError: If the value is missing or not an integer.
    """
    candidate = data.get(key)
    if isinstance(candidate, int) and not isinstance(candidate, bool):
        return candidate
    msg = f"{key} must be an integer"
    raise MetadataResolutionError(msg)
def optional_int(value: object, key: str) -> int | None:
    """Return ``value`` when it is an integer or ``None``.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.

    Args:
        value: Value to check.
        key: Field name, used only in the error message.

    Raises:
        MetadataResolutionError: If ``value`` is neither ``None`` nor an integer.
    """
    if value is None:
        return None
    if isinstance(value, int) and not isinstance(value, bool):
        return value
    msg = f"{key} must be an integer or null"
    raise MetadataResolutionError(msg)
-572
View File
@@ -1,572 +0,0 @@
"""Resolve audiobook metadata with a controlled Ollama tool loop."""
from __future__ import annotations
import json
import re
from dataclasses import asdict, dataclass, is_dataclass, replace
from os import PathLike
from typing import TYPE_CHECKING
import httpx
from sqlalchemy.orm import Session
from python.common import utcnow
from python.tools.audiobook.llm_tool_calling import (
CatalogToolRegistry,
MetadataResolutionError,
normalize_title_slug,
optional_int,
parse_tool_calls,
required_int,
required_string,
run_tool_calls,
validate_catalog_slug,
validate_title_slug,
)
if TYPE_CHECKING:
from pathlib import Path
from sqlalchemy.engine import Engine
from python.orm.richie import AudiobookAuthor
# Matches a whole string wrapped in a Markdown code fence (optionally tagged "json",
# case-insensitive) and captures the text between the fences in the "json" group.
FENCED_JSON_PATTERN = re.compile(r"^```(?:json)?\s*(?P<json>.*?)\s*```$", re.IGNORECASE | re.DOTALL)
@dataclass(frozen=True)
class AgentConfig:
    """Immutable runtime settings for the audiobook metadata agent."""

    # Model name sent in every chat request.
    model: str = "deepseek-v4-flash:cloud"
    # Chat endpoint the requests are posted to.
    ollama_chat_url: str = "https://ollama.com/api/chat"
    # Timeout, in seconds, for one chat request.
    http_timeout_seconds: int = 300
    # Number of normal model turns before a forced final answer is requested.
    max_agent_turns: int = 8
    # Row limit applied to every catalog search.
    max_tool_results: int = 10
    # Final answers with confidence below this value are marked for review.
    min_confidence: float = 0.85
    # How many invalid final answers may be retried before giving up.
    invalid_final_retries: int = 1
    # Series name reported for books that have no series.
    standalone_series: str = "standalone"
    # Names of the catalog tools the model may call.
    tool_names: tuple[str, ...] = (
        "search_authors",
        "search_series",
        "search_books",
        "ensure_author",
        "ensure_series",
        "ensure_book",
    )
@dataclass(frozen=True)
class StandardBookMetadata:
    """Canonical, catalog-backed metadata for the final audiobook path."""

    # Catalog id and name of the author.
    author_id: int
    author: str
    # Catalog id of the book, when one is known.
    book_id: int | None
    # Canonical title slug.
    title: str
    # Catalog id of the series (None for standalone books) and its name.
    series_id: int | None
    series: str
    # Position within the series.
    series_index: int
    # Confidence reported with the answer, and whether a human should check it.
    confidence: float
    needs_review: bool
    # Supporting evidence strings attached to the answer.
    evidence: list[str]
@dataclass(frozen=True)
class FinalMetadataFields:
    """Fields taken from the model's final answer after schema validation."""

    # Author id named by the model.
    author_id: int
    # Book id named by the model, or None when it did not pick one.
    book_id: int | None
    # Title as given by the model.
    title: str
    # Series id named by the model, or None for a standalone book.
    series_id: int | None
    # Position within the series.
    series_index: int
    # Confidence the model reported.
    confidence: float
    # Evidence strings the model reported.
    evidence: list[str]
@dataclass(frozen=True)
class ResolvedBookFields:
    """Book fields once the final answer has been matched to a catalog book."""

    # Catalog id of the book, when one is known.
    book_id: int | None
    # Title of the book.
    title: str
    # Series the book belongs to, or None for a standalone book.
    series_id: int | None
    # Position within the series.
    series_index: int
@dataclass(frozen=True)
class AgentStepResult:
    """What the agent loop should do after one model response."""

    # Final metadata for this run, or None when the loop should keep going.
    metadata: StandardBookMetadata | None
    # Running total of invalid final answers seen so far.
    invalid_final_count: int
    # True when another model turn is needed.
    should_continue: bool
def standard_book_metadata(
    aax_file_name: str,
    aax_metadata_from_ffprobe: dict[str, str],
    engine: Engine,
    log_path: Path,
    ollama_api_key: str,
    config: AgentConfig,
) -> StandardBookMetadata:
    """Resolve canonical metadata for one AAX file inside a single session.

    When the result needs review, every catalog change made during the run
    is rolled back. Otherwise rows created but not used by the result are
    pruned and the session is committed.

    Args:
        aax_file_name: Name of the AAX file being resolved.
        aax_metadata_from_ffprobe: Tags read from the file.
        engine: Database engine the session is opened on.
        log_path: Audit log destination.
        ollama_api_key: Bearer token for the chat endpoint.
        config: Agent settings.

    Returns:
        The resolved metadata, possibly flagged ``needs_review``.
    """
    with Session(engine) as session:
        registry = CatalogToolRegistry(session, log_path, config, write_agent_log)
        resolver = AudiobookMetadataAgent(
            registry=registry,
            log_path=log_path,
            ollama_api_key=ollama_api_key,
            config=config,
        )
        outcome = resolver.run(aax_file_name, aax_metadata_from_ffprobe)
        if outcome.needs_review:
            session.rollback()
            return outcome
        registry.prune_unused_created_rows(
            author_id=outcome.author_id,
            book_id=outcome.book_id,
            series_id=outcome.series_id,
        )
        session.commit()
        return outcome
class AudiobookMetadataAgent:
    """Ollama-backed metadata resolver with a fixed local tool registry.

    The agent alternates between model turns and local tool execution until
    the model returns a final JSON answer that validates against the catalog,
    the retry budget for invalid answers is spent, or the turn limit is hit.
    """

    def __init__(
        self,
        *,
        registry: CatalogToolRegistry,
        log_path: Path,
        ollama_api_key: str,
        config: AgentConfig,
    ) -> None:
        """Create an Ollama metadata agent.

        Args:
            registry: Catalog tools the model may call.
            log_path: Audit log destination.
            ollama_api_key: Bearer token for the chat endpoint.
            config: Agent settings.
        """
        self._registry = registry
        self._log_path = log_path
        self._ollama_api_key = ollama_api_key
        self._config = config

    def run(self, aax_file_name: str, aax_metadata_from_ffprobe: dict[str, str]) -> StandardBookMetadata:
        """Resolve metadata for one AAX file.

        Runs up to ``max_agent_turns`` model turns; when none of them ends the
        loop, one extra tool-less request asks for a final answer.
        """
        messages = [
            {"role": "system", "content": system_prompt()},
            {"role": "user", "content": user_prompt(aax_file_name, aax_metadata_from_ffprobe)},
        ]
        invalid_final_count = 0
        result: StandardBookMetadata | None = None
        for turn in range(1, self._config.max_agent_turns + 1):
            step = self.run_step(messages, turn, invalid_final_count)
            invalid_final_count = step.invalid_final_count
            if step.should_continue:
                continue
            result = step.metadata
            break
        if result is None:
            return self.force_final_response(messages)
        return result

    def run_step(
        self,
        messages: list[dict[str, object]],
        turn: int,
        invalid_final_count: int,
    ) -> AgentStepResult:
        """Run one model turn and return the next agent-loop action.

        A response with tool calls is executed and the loop continues; a
        response without them is treated as the final answer. Malformed
        responses and fatal tool errors end the loop with review metadata.
        """
        data = self.chat(messages, turn)
        message = data.get("message")
        if not isinstance(message, dict):
            return AgentStepResult(
                metadata=review_metadata("Ollama response did not include a message", self._config),
                invalid_final_count=invalid_final_count,
                should_continue=False,
            )
        try:
            tool_calls = parse_tool_calls(message)
        except (json.JSONDecodeError, MetadataResolutionError) as error:
            return AgentStepResult(
                metadata=review_metadata(str(error), self._config),
                invalid_final_count=invalid_final_count,
                should_continue=False,
            )
        if tool_calls:
            fatal_error = run_tool_calls(messages, message, tool_calls, self._registry, self._log_path, write_agent_log)
            if fatal_error is not None:
                return AgentStepResult(
                    metadata=review_metadata(fatal_error, self._config),
                    invalid_final_count=invalid_final_count,
                    should_continue=False,
                )
            return AgentStepResult(metadata=None, invalid_final_count=invalid_final_count, should_continue=True)
        return self.handle_final_message(messages, message, invalid_final_count)

    def handle_final_message(
        self,
        messages: list[dict[str, object]],
        message: dict[str, object],
        invalid_final_count: int,
    ) -> AgentStepResult:
        """Validate a final model message or request one retry.

        Args:
            messages: Conversation so far; extended in place when a retry is
                requested.
            message: Assistant message that carried no tool calls.
            invalid_final_count: Invalid final answers seen before this one.

        Returns:
            A step result carrying validated metadata, review metadata, or a
            request to continue with another turn.
        """
        content = message.get("content")
        if not isinstance(content, str):
            return AgentStepResult(
                metadata=review_metadata("Ollama final response did not include string content", self._config),
                invalid_final_count=invalid_final_count,
                should_continue=False,
            )
        try:
            resolved = self.validate_final(parse_final_json_content(content))
        except (json.JSONDecodeError, MetadataResolutionError) as error:
            # Keep the rejected answer in the transcript: the retry prompt
            # refers to "your previous final answer", and each chat request
            # resends the full history, so the model only sees that answer
            # if it is recorded here.
            messages.append(message)
            return self.handle_invalid_final(messages, error, invalid_final_count)
        write_agent_log(self._log_path, "final_metadata", metadata=resolved)
        return AgentStepResult(metadata=resolved, invalid_final_count=invalid_final_count, should_continue=False)

    def handle_invalid_final(
        self,
        messages: list[dict[str, object]],
        error: json.JSONDecodeError | MetadataResolutionError,
        invalid_final_count: int,
    ) -> AgentStepResult:
        """Log an invalid final answer and either retry or return review metadata.

        The counter is incremented first; once it exceeds
        ``invalid_final_retries`` the run ends with review metadata, otherwise
        a corrective user message is appended and the loop continues.
        """
        invalid_final_count += 1
        write_agent_log(
            self._log_path,
            "final_validation_error",
            error=str(error),
            invalid_final_count=invalid_final_count,
        )
        if invalid_final_count > self._config.invalid_final_retries:
            return AgentStepResult(
                metadata=review_metadata(str(error), self._config),
                invalid_final_count=invalid_final_count,
                should_continue=False,
            )
        messages.append(
            {
                "role": "user",
                "content": (
                    "Your previous final answer was invalid. Return only valid JSON matching the required "
                    f"schema. Validation error: {error}"
                ),
            },
        )
        return AgentStepResult(metadata=None, invalid_final_count=invalid_final_count, should_continue=True)

    def force_final_response(self, messages: list[dict[str, object]]) -> StandardBookMetadata:
        """Request a no-tool final answer after the normal turn limit.

        Any malformed or invalid reply is turned into review metadata; there
        is no retry at this point.
        """
        messages.append({"role": "user", "content": forced_final_prompt()})
        write_agent_log(self._log_path, "forced_final_request", reason="max_turns")
        data = self.chat(messages, self._config.max_agent_turns + 1, tools_enabled=False)
        message = data.get("message")
        if not isinstance(message, dict):
            return review_metadata("Ollama forced final response did not include a message", self._config)
        content = message.get("content")
        if not isinstance(content, str):
            return review_metadata("Ollama forced final response did not include string content", self._config)
        try:
            resolved = self.validate_final(parse_final_json_content(content))
        except (json.JSONDecodeError, MetadataResolutionError) as error:
            return review_metadata(f"Ollama forced final response was invalid: {error}", self._config)
        write_agent_log(self._log_path, "final_metadata", metadata=resolved)
        return resolved

    def chat(self, messages: list[dict[str, object]], turn: int, *, tools_enabled: bool = True) -> dict[str, object]:
        """Send one chat request to Ollama and log the request and response.

        Args:
            messages: Full conversation to send.
            turn: Turn number, recorded in the audit log.
            tools_enabled: Whether tool schemas are attached to the request.

        Returns:
            The decoded response with string keys, or ``{}`` when the body is
            not a JSON object.

        Raises:
            httpx.HTTPStatusError: If the endpoint answers with an error status.
        """
        payload = {
            "model": self._config.model,
            "messages": messages,
            "stream": False,
            "options": {"temperature": 0},
        }
        # One consistent type for the logged tool list; the audit log
        # serializes tuples and lists the same way.
        tool_names: tuple[str, ...] = ()
        if tools_enabled:
            payload["tools"] = self._registry.tool_schemas()
            tool_names = self._config.tool_names
        write_agent_log(
            self._log_path,
            "model_request",
            model=self._config.model,
            turn=turn,
            message_count=len(messages),
            tool_names=tool_names,
            tools_enabled=tools_enabled,
        )
        write_agent_log(
            self._log_path,
            "llm_messages_sent",
            model=self._config.model,
            turn=turn,
            messages=messages,
            tools_enabled=tools_enabled,
        )
        response = httpx.post(
            self._config.ollama_chat_url,
            headers={"Authorization": f"Bearer {self._ollama_api_key}"},
            json=payload,
            timeout=self._config.http_timeout_seconds,
        )
        response.raise_for_status()
        raw_data = response.json()
        if not isinstance(raw_data, dict):
            return {}
        data = {str(key): value for key, value in raw_data.items()}
        message = data.get("message", {})
        content = message.get("content") if isinstance(message, dict) else ""
        write_agent_log(
            self._log_path,
            "llm_message_received",
            model=self._config.model,
            turn=turn,
            message=message,
        )
        write_agent_log(
            self._log_path,
            "model_response",
            model=self._config.model,
            turn=turn,
            has_tool_calls=bool(isinstance(message, dict) and message.get("tool_calls")),
            content_chars=len(content) if isinstance(content, str) else 0,
        )
        return data

    def validate_final(self, raw_metadata: object) -> StandardBookMetadata:
        """Validate final model metadata against catalog rows.

        The title is normalized to a slug before validation, and the result
        is flagged ``needs_review`` when confidence is below ``min_confidence``.

        Raises:
            MetadataResolutionError: If any field fails validation.
        """
        fields = parse_final_metadata_fields(raw_metadata)
        fields = replace(fields, title=normalize_title_slug(fields.title))
        author = self.validate_author(fields.author_id)
        validate_title_slug(fields.title)
        book_fields = self.resolve_book_fields(fields)
        series = self.validate_series(fields.author_id, book_fields.series_id, book_fields.series_index)
        return StandardBookMetadata(
            author_id=fields.author_id,
            author=author.name,
            book_id=book_fields.book_id,
            title=book_fields.title,
            series_id=book_fields.series_id,
            series=series,
            series_index=book_fields.series_index,
            confidence=fields.confidence,
            needs_review=fields.confidence < self._config.min_confidence,
            evidence=fields.evidence,
        )

    def validate_author(self, author_id: int) -> AudiobookAuthor:
        """Validate that an author id was seen during this run and exists.

        Raises:
            MetadataResolutionError: If the id was never returned by a tool,
                the row is gone, or its name is not a valid catalog slug.
        """
        if author_id not in self._registry.seen_author_ids:
            msg = f"author_id {author_id} was not returned by search_authors"
            raise MetadataResolutionError(msg)
        author = self._registry.get_author(author_id)
        if author is None:
            msg = f"author_id {author_id} does not exist"
            raise MetadataResolutionError(msg)
        validate_catalog_slug(author.name, "author")
        return author

    def resolve_book_fields(self, fields: FinalMetadataFields) -> ResolvedBookFields:
        """Resolve final book fields from a seen book id or an ensured book.

        Without a ``book_id`` the book is looked up or created through the
        registry. With one, the id must have been seen during this run and
        belong to the stated author; the catalog row then supplies the title
        and series fields.

        Raises:
            MetadataResolutionError: If the book id is unknown, unseen, or
                owned by a different author.
        """
        if fields.book_id is None:
            ensured = self._registry.ensure_book(
                fields.title,
                fields.author_id,
                fields.series_id,
                fields.series_index,
            )
            return ResolvedBookFields(
                book_id=ensured.book.id,
                title=ensured.book.title,
                series_id=ensured.book.series_id,
                series_index=ensured.book.series_index,
            )
        if fields.book_id not in self._registry.seen_book_ids:
            msg = f"book_id {fields.book_id} was not returned by search_books"
            raise MetadataResolutionError(msg)
        book = self._registry.get_book(fields.book_id)
        if book is None:
            msg = f"book_id {fields.book_id} does not exist"
            raise MetadataResolutionError(msg)
        if book.author_id != fields.author_id:
            msg = f"book_id {fields.book_id} does not belong to author_id {fields.author_id}"
            raise MetadataResolutionError(msg)
        return ResolvedBookFields(
            book_id=fields.book_id,
            title=book.title,
            series_id=book.series_id,
            series_index=book.series_index,
        )

    def validate_series(self, author_id: int, series_id: int | None, series_index: int) -> str:
        """Validate final series fields and return the canonical series slug.

        Returns:
            The configured standalone name when ``series_id`` is ``None``,
            otherwise the name of the series row.

        Raises:
            MetadataResolutionError: If the index does not fit the series
                kind, or the series is unseen, missing, or owned by another
                author.
        """
        if series_id is None:
            if series_index != 0:
                msg = "standalone books must use series_index 0"
                raise MetadataResolutionError(msg)
            return self._config.standalone_series
        if series_id not in self._registry.seen_series_ids:
            msg = f"series_id {series_id} was not returned by search_series"
            raise MetadataResolutionError(msg)
        series = self._registry.get_series(series_id)
        if series is None:
            msg = f"series_id {series_id} does not exist"
            raise MetadataResolutionError(msg)
        if series.author_id != author_id:
            msg = f"series_id {series_id} does not belong to author_id {author_id}"
            raise MetadataResolutionError(msg)
        if series_index <= 0:
            msg = "series books must use a positive series_index"
            raise MetadataResolutionError(msg)
        validate_catalog_slug(series.name, "series")
        return series.name
def write_agent_log(log_path: Path, event: str, **fields: object) -> None:
    """Append a single audit event to the JSONL log, creating its directory if needed."""
    log_path.parent.mkdir(parents=True, exist_ok=True)

    entry: dict[str, object] = {"created": utcnow().isoformat(), "event": event}
    # Caller-supplied fields are applied last, so they win on key collisions.
    for name, raw in fields.items():
        entry[name] = json_log_value(raw)

    line = json.dumps(entry, sort_keys=True)
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(line)
        handle.write("\n")
def json_log_value(value: object) -> object:
"""Return a JSON-serializable value for audit logs."""
if is_dataclass(value) and not isinstance(value, type):
return json_log_value(asdict(value))
if isinstance(value, dict):
return {str(key): json_log_value(item) for key, item in value.items()}
if isinstance(value, list | tuple):
return [json_log_value(item) for item in value]
if isinstance(value, set):
return [json_log_value(item) for item in sorted(value, key=str)]
if isinstance(value, PathLike):
return str(value)
return value
def system_prompt() -> str:
    """Return the stable system prompt.

    The prompt is a fixed string literal with no interpolation. It instructs the
    model to use the catalog tools before answering, to reference only ids the
    tools returned, and to reply with a bare JSON object containing author_id,
    book_id, title, series_id, series_index, confidence, and evidence.
    """
    return """You standardize Audible audiobook metadata against a private catalog.
Rules:
- You must use the provided tools before returning final metadata.
- Only use author_id, series_id, or book_id values returned by tools.
- Return final metadata as JSON only. Do not wrap it in Markdown.
- The final JSON object must contain author_id, book_id, title, series_id, series_index, confidence, and evidence.
- title must be a canonical title slug using lower-case words separated by hyphens.
- Use series_id null and series_index 0 for standalone books.
- If you use a series_id, series_index must be an integer greater than or equal to 1.
- Detect omnibus or box-set editions that contain multiple numbered novels, books, or novellas.
- For an omnibus, make a best-effort range from the filename, tags, and catalog rows. Keep series_index as the
first covered book number and include the range in the title when the source title includes it, for example
books-1-3.
- Be careful with omnibuses of novels or novellas later published as one book: keep the omnibus as the audiobook's
book record unless catalog rows clearly identify a better match.
- Do not create publisher collections or author collections as series unless the book metadata clearly gives a
numbered series.
- Series belong to authors. Use a series_id only when it belongs to the selected author_id.
- Always search for the author before creating one. If no exact author slug exists, call ensure_author.
- Always search for a series with author_id before creating one. If no exact series slug exists, call ensure_series.
- Always search for a book before creating one. If no exact title slug exists, call ensure_book.
- If a tool returns an error, correct your tool arguments or final metadata before continuing.
- confidence must be a number from 0 to 1.
- evidence must be a short list of strings explaining which filename, tags, and catalog rows support the answer."""
def forced_final_prompt() -> str:
    """Return the prompt that ends tool use and demands the final JSON answer."""
    sentences = [
        "Stop calling tools.",
        "Return final metadata as JSON only using the tool results already provided.",
        (
            "If search_books returned no matching rows but author and series are known, "
            "use book_id null and resolve the title slug from the AAX filename and ffprobe tags."
        ),
        "The validator will create the missing book.",
        "Use only author_id and series_id values returned by earlier tool results.",
    ]
    return " ".join(sentences)
def user_prompt(aax_file_name: str, metadata: dict[str, str]) -> str:
    """Compose the user message from the AAX file name and its ffprobe tags."""
    tags_json = json.dumps(metadata, indent=2, sort_keys=True)
    sections = [
        "Resolve this Audible audiobook.",
        f"AAX file name: {aax_file_name}",
        f"ffprobe format tags:\n{tags_json}",
    ]
    # Sections are separated by one blank line.
    return "\n\n".join(sections)
def parse_final_json_content(content: str) -> object:
    """Decode the model's final reply, whether it is bare JSON or a fenced JSON block."""
    payload = content.strip()
    fenced = FENCED_JSON_PATTERN.fullmatch(payload)
    if fenced is not None:
        # Unwrap the fence and parse only its captured "json" group.
        payload = fenced.group("json").strip()
    return json.loads(payload)
def parse_final_metadata_fields(raw_metadata: object) -> FinalMetadataFields:
    """Convert the decoded final JSON value into a typed ``FinalMetadataFields``.

    Raises ``MetadataResolutionError`` when the value is not a JSON object; the
    per-field helpers validate each individual entry.
    """
    if not isinstance(raw_metadata, dict):
        msg = "Final metadata must be a JSON object"
        raise MetadataResolutionError(msg)

    # Normalise keys to strings before looking fields up.
    data: dict[str, object] = {}
    for key, value in raw_metadata.items():
        data[str(key)] = value

    # Fields are read in declaration order so the first invalid one is reported.
    author_id = required_int(data, "author_id")
    book_id = optional_int(data.get("book_id"), "book_id")
    title = required_string(data, "title")
    series_id = optional_int(data.get("series_id"), "series_id")
    series_index = required_int(data, "series_index")
    confidence = required_float(data, "confidence")
    evidence = required_string_list(data, "evidence")

    return FinalMetadataFields(
        author_id=author_id,
        book_id=book_id,
        title=title,
        series_id=series_id,
        series_index=series_index,
        confidence=confidence,
        evidence=evidence,
    )
def review_metadata(reason: str, config: AgentConfig) -> StandardBookMetadata:
    """Build a placeholder result flagged for manual review, with ``reason`` as its only evidence."""
    placeholder = {
        "author_id": 0,
        "author": "unknown_author",
        "book_id": None,
        "title": "unknown-title",
        "series_id": None,
        "series": config.standalone_series,
        "series_index": 0,
        "confidence": 0,
        "needs_review": True,
        "evidence": [reason],
    }
    return StandardBookMetadata(**placeholder)
def required_float(data: dict[str, object], key: str) -> float:
"""Read a required float field."""
value = data.get(key)
if isinstance(value, bool) or not isinstance(value, int | float):
msg = f"{key} must be a number"
raise MetadataResolutionError(msg)
confidence = float(value)
if confidence < 0 or confidence > 1:
msg = f"{key} must be between 0 and 1"
raise MetadataResolutionError(msg)
return confidence
def required_string_list(data: dict[str, object], key: str) -> list[str]:
    """Return the trimmed, non-blank strings stored under ``key``.

    The field must be a non-empty list containing only strings, and at least
    one entry must remain after surrounding whitespace is stripped.
    """
    value = data.get(key)
    well_formed = isinstance(value, list) and bool(value) and all(isinstance(item, str) for item in value)
    if not well_formed:
        msg = f"{key} must be a non-empty list of strings"
        raise MetadataResolutionError(msg)

    cleaned: list[str] = []
    for item in value:
        text = item.strip()
        if text:
            cleaned.append(text)

    if not cleaned:
        msg = f"{key} must include at least one non-empty string"
        raise MetadataResolutionError(msg)
    return cleaned
+2 -4
View File
@@ -34,9 +34,8 @@ def main(config_file: Path) -> None:
logger.error(msg) logger.error(msg)
signal_alert(msg) signal_alert(msg)
continue continue
count_lookup = get_count_lookup(config_file, dataset.name)
logger.info(f"using {count_lookup} for {dataset.name}") get_snapshots_to_delete(dataset, get_count_lookup(config_file, dataset.name))
get_snapshots_to_delete(dataset, count_lookup)
except Exception: except Exception:
logger.exception("snapshot_manager failed") logger.exception("snapshot_manager failed")
signal_alert("snapshot_manager failed") signal_alert("snapshot_manager failed")
@@ -100,7 +99,6 @@ def get_snapshots_to_delete(
""" """
snapshots = dataset.get_snapshots() snapshots = dataset.get_snapshots()
logger.info(f"calculating snapshots for {dataset.name} to be deleted")
if not snapshots: if not snapshots:
logger.info(f"{dataset.name} has no snapshots") logger.info(f"{dataset.name} has no snapshots")
return return
-17
View File
@@ -1,17 +0,0 @@
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1
RUN apt-get update \
&& apt-get install -y --no-install-recommends python3 python3-pip ffmpeg \
&& rm -rf /var/lib/apt/lists/*
RUN pip3 install --no-cache-dir --upgrade pip \
&& pip3 install --no-cache-dir faster-whisper requests
WORKDIR /app
COPY python/tools/whisper/inference.py /app/inference.py
ENTRYPOINT ["python3", "/app/inference.py"]
@@ -1,2 +0,0 @@
*
!python/tools/whisper/inference.py
-1
View File
@@ -1 +0,0 @@
"""Whisper transcription tools (host orchestrator and container entrypoint)."""
-136
View File
@@ -1,136 +0,0 @@
"""Container entrypoint that transcribes a directory of audio files with faster-whisper.
Run inside the whisper-transcribe docker image; segment timestamps are grouped
into one-minute buckets so the output reads as ``[HH:MM:00] text``.
"""
from __future__ import annotations
import argparse
import logging
from pathlib import Path
from faster_whisper import WhisperModel
logger = logging.getLogger(__name__)
AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus", ".mp4", ".mkv", ".webm", ".aac"}
BUCKET_SECONDS = 60
BEAM_SIZE = 5
SECONDS_PER_HOUR = 3600
SECONDS_PER_MINUTE = 60
def format_timestamp(total_seconds: float) -> str:
    """Format an offset as ``HH:MM:00``, discarding the seconds component.

    Args:
        total_seconds: Offset in seconds from the start of the audio.

    Returns:
        A zero-padded ``HH:MM:00`` string.
    """
    whole_hours, leftover_seconds = divmod(total_seconds, SECONDS_PER_HOUR)
    whole_minutes = leftover_seconds // SECONDS_PER_MINUTE
    return f"{int(whole_hours):02d}:{int(whole_minutes):02d}:00"
def transcribe_file(model: WhisperModel, audio_path: Path, output_path: Path) -> None:
    """Transcribe a single audio file and save the minute-bucketed transcript.

    Args:
        model: Loaded faster-whisper model.
        audio_path: Source audio file.
        output_path: Destination ``.txt`` path.
    """
    logger.info("Transcribing %s", audio_path)
    segments, info = model.transcribe(str(audio_path), language="en", beam_size=BEAM_SIZE, vad_filter=True)
    logger.info("Duration %.1fs", info.duration)

    # Group segment text by the bucket its start time falls into.
    grouped: dict[int, list[str]] = {}
    for segment in segments:
        bucket_index = int(segment.start // BUCKET_SECONDS)
        if bucket_index not in grouped:
            grouped[bucket_index] = []
        grouped[bucket_index].append(segment.text.strip())

    rendered: list[str] = []
    for bucket_index in sorted(grouped):
        stamp = format_timestamp(bucket_index * BUCKET_SECONDS)
        rendered.append(f"[{stamp}] {' '.join(grouped[bucket_index])}")

    output_path.write_text("\n\n".join(rendered) + "\n", encoding="utf-8")
    logger.info("Wrote %s", output_path)
def find_audio_files(input_directory: Path) -> list[Path]:
    """Recursively gather the audio files below ``input_directory``.

    Args:
        input_directory: Directory to walk recursively.

    Returns:
        Sorted list of audio file paths.
    """
    matches: list[Path] = []
    for candidate in input_directory.rglob("*"):
        if not candidate.is_file():
            continue
        # Extension matching is case-insensitive.
        if candidate.suffix.lower() in AUDIO_EXTENSIONS:
            matches.append(candidate)
    matches.sort()
    return matches
def configure_container_logger() -> None:
    """Configure root logging for the container at INFO level.

    No ``stream`` or ``handlers`` argument is passed, so ``logging.basicConfig``
    installs its default stream handler, which writes to standard error (not
    standard output). Each record is formatted as timestamp, level, message.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
def parse_arguments() -> argparse.Namespace:
    """Read the container entrypoint's command-line options.

    Returns:
        Parsed argparse namespace.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # Directory options share the same shape; only the flag and default differ.
    for flag, default_directory in (("--input", Path("/audio")), ("--output", Path("/output"))):
        parser.add_argument(flag, type=Path, default=default_directory)
    parser.add_argument("--model", default="large-v3")
    parser.add_argument(
        "--download-only",
        action="store_true",
        help="Download the model into the cache volume and exit without transcribing.",
    )
    parsed = parser.parse_args()
    return parsed
def main() -> None:
    """Load the model, then stop (download-only) or transcribe every audio file found."""
    configure_container_logger()
    arguments = parse_arguments()

    logger.info("Loading model %s on CUDA", arguments.model)
    model = WhisperModel(arguments.model, device="cuda", compute_type="float16")
    if arguments.download_only:
        logger.info("Model ready; exiting (download-only mode)")
        return

    input_root = arguments.input
    output_root = arguments.output
    output_root.mkdir(parents=True, exist_ok=True)

    audio_files = find_audio_files(input_root)
    if not audio_files:
        logger.warning("No audio files found in %s", input_root)
        return
    logger.info("Found %d audio file(s)", len(audio_files))

    for audio_path in audio_files:
        relative = audio_path.relative_to(input_root)
        # Mirror the input tree under the output directory, swapping the suffix.
        output_path = output_root / relative.with_suffix(".txt")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        if not output_path.exists():
            transcribe_file(model, audio_path, output_path)
        else:
            logger.info("Skip %s (already transcribed)", relative)
if __name__ == "__main__":
main()
-167
View File
@@ -1,167 +0,0 @@
"""Build and run the whisper transcription docker container on demand.
The container is started fresh for each invocation and removed on exit
(``docker run --rm``). The model is cached in a named docker volume so
only the first run pays the download cost.
"""
from __future__ import annotations
import logging
import subprocess
from pathlib import Path
from typing import Annotated
import typer
from python.common import configure_logger
logger = logging.getLogger(__name__)
class Config:
    """Paths and names for the whisper-transcribe Docker workflow."""

    # Tag given to the built image and used for every `docker run`.
    image_tag = "whisper-transcribe:latest"
    # Named docker volume that holds the downloaded model between runs.
    model_volume = "whisper-models"
    # Fourth ancestor of this file's resolved path; used as the docker build
    # context (presumably the repository root -- confirm against the layout).
    repo_root = Path(__file__).resolve().parents[3]
    # Dockerfile located in the same directory as this module.
    dockerfile = Path(__file__).resolve().parent / "Dockerfile"
    # Path inside the container where the model volume is mounted.
    huggingface_cache = "/root/.cache/huggingface"
def run_docker(arguments: list[str]) -> None:
    """Invoke ``docker`` with the given arguments, failing loudly on a non-zero exit.

    Args:
        arguments: Arguments to pass to the ``docker`` binary.

    Raises:
        subprocess.CalledProcessError: If docker exits non-zero.
    """
    printable = " ".join(arguments)
    logger.info("docker %s", printable)
    command = ["docker", *arguments]
    subprocess.run(command, check=True)
def build_image() -> None:
    """Build the whisper-transcribe image with ``Config.repo_root`` as the build context."""
    logger.info("Building image %s", Config.image_tag)
    build_arguments = ["build", "--tag", Config.image_tag, "--file", str(Config.dockerfile)]
    # The build context is the final positional argument.
    build_arguments.append(str(Config.repo_root))
    run_docker(build_arguments)
def model_cache_present(model: str) -> bool:
    """Report whether the model's cache directory already exists in the model volume.

    Args:
        model: faster-whisper model name (e.g. ``large-v3``).

    Returns:
        True if the HuggingFace cache directory for the model exists in the volume.
    """
    probe_path = f"/cache/hub/models--Systran--faster-whisper-{model}"
    # A throwaway alpine container mounts the volume and runs `test -d` on the path.
    probe_command = [
        "docker",
        "run",
        "--rm",
        "--volume",
        f"{Config.model_volume}:/cache",
        "alpine",
        "test",
        "-d",
        probe_path,
    ]
    outcome = subprocess.run(probe_command, check=False)
    return outcome.returncode == 0
def download_model(model: str) -> None:
    """Run the image in download-only mode so the model lands in the cache volume.

    Args:
        model: faster-whisper model name.
    """
    logger.info("Downloading model %s into volume %s", model, Config.model_volume)
    cache_mount = f"{Config.model_volume}:{Config.huggingface_cache}"
    container_options = ["run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host", "--volume", cache_mount]
    # Everything after the image tag is passed to the container entrypoint.
    entrypoint_options = ["--model", model, "--download-only"]
    run_docker([*container_options, Config.image_tag, *entrypoint_options])
def transcribe(input_directory: Path, output_directory: Path, model: str) -> None:
    """Start the container to transcribe all audio found under ``input_directory``.

    Args:
        input_directory: Host path containing audio files (mounted read-only).
        output_directory: Host path for ``.txt`` transcripts.
        model: faster-whisper model name.
    """
    logger.info("Transcribing %s -> %s (model=%s)", input_directory, output_directory, model)
    mounts = (
        f"{input_directory}:/audio:ro",
        f"{output_directory}:/output",
        f"{Config.model_volume}:{Config.huggingface_cache}",
    )
    docker_arguments = ["run", "--rm", "--device=nvidia.com/gpu=all", "--ipc=host"]
    for mount in mounts:
        docker_arguments += ["--volume", mount]
    docker_arguments += [Config.image_tag, "--model", model]
    run_docker(docker_arguments)
def main(
    input_directory: Annotated[Path, typer.Argument(help="Directory of audio files to transcribe.")],
    output_directory: Annotated[Path, typer.Argument(help="Directory to write .txt transcripts to.")],
    model: Annotated[str, typer.Option(help="faster-whisper model name.")] = "large-v3",
    *,
    force_download: Annotated[
        bool,
        typer.Option("--force-download", help="Re-download the model even if already cached."),
    ] = False,
) -> None:
    """Build the image, make sure the model is cached, run the transcription, then finish."""
    configure_logger()

    # The input directory must already exist; the output directory is created on demand.
    resolved_input = input_directory.resolve(strict=True)
    output_directory.mkdir(parents=True, exist_ok=True)
    resolved_output = output_directory.resolve()

    build_image()

    # The cache probe is skipped entirely when a download is forced.
    if not force_download and model_cache_present(model):
        logger.info("Model %s already cached in volume %s", model, Config.model_volume)
    else:
        download_model(model)

    transcribe(resolved_input, resolved_output, model)
    logger.info("Done. Container stopped.")
if __name__ == "__main__":
typer.run(main)
+3 -9
View File
@@ -1,30 +1,24 @@
{ inputs, pkgs, ... }: { inputs, ... }:
{ {
imports = [ imports = [
"${inputs.self}/users/math"
"${inputs.self}/users/richie" "${inputs.self}/users/richie"
"${inputs.self}/users/steve"
"${inputs.self}/common/global" "${inputs.self}/common/global"
"${inputs.self}/common/optional/desktop.nix"
"${inputs.self}/common/optional/docker.nix" "${inputs.self}/common/optional/docker.nix"
"${inputs.self}/common/optional/scanner.nix" "${inputs.self}/common/optional/scanner.nix"
"${inputs.self}/common/optional/monitoring-agent.nix"
"${inputs.self}/common/optional/steam.nix" "${inputs.self}/common/optional/steam.nix"
"${inputs.self}/common/optional/syncthing_base.nix" "${inputs.self}/common/optional/syncthing_base.nix"
"${inputs.self}/common/optional/systemd-boot.nix" "${inputs.self}/common/optional/systemd-boot.nix"
"${inputs.self}/common/optional/update.nix" "${inputs.self}/common/optional/update.nix"
"${inputs.self}/common/optional/yubikey.nix" "${inputs.self}/common/optional/yubikey.nix"
"${inputs.self}/common/optional/zerotier.nix" "${inputs.self}/common/optional/zerotier.nix"
"${inputs.self}/common/optional/brain_substituter.nix"
"${inputs.self}/common/optional/nvidia.nix" "${inputs.self}/common/optional/nvidia.nix"
./hardware.nix ./hardware.nix
./syncthing.nix ./syncthing.nix
./llms.nix ./llms.nix
]; ];
boot = {
kernelPackages = pkgs.linuxPackages_6_18;
zfs.package = pkgs.zfs_2_4;
};
networking = { networking = {
hostName = "bob"; hostName = "bob";
hostId = "7c678a41"; hostId = "7c678a41";
+1
View File
@@ -28,6 +28,7 @@
allowDiscards = true; allowDiscards = true;
keyFileSize = 4096; keyFileSize = 4096;
keyFile = "/dev/disk/by-id/usb-Samsung_Flash_Drive_FIT_0374620080067131-0:0"; keyFile = "/dev/disk/by-id/usb-Samsung_Flash_Drive_FIT_0374620080067131-0:0";
fallbackToPassword = true;
}; };
}; };
kernelModules = [ "kvm-amd" ]; kernelModules = [ "kvm-amd" ];
+1 -5
View File
@@ -23,7 +23,6 @@
"magistral:24b" "magistral:24b"
"ministral-3:14b" "ministral-3:14b"
"nemotron-3-nano:30b" "nemotron-3-nano:30b"
"nemotron-3-nano:4b"
"nemotron-cascade-2:30b" "nemotron-cascade-2:30b"
"qwen3-coder:30b" "qwen3-coder:30b"
"qwen3-embedding:0.6b" "qwen3-embedding:0.6b"
@@ -42,14 +41,11 @@
"qwen3:8b" "qwen3:8b"
"qwen3.5:27b" "qwen3.5:27b"
"qwen3.5:35b" "qwen3.5:35b"
"qwen3.6:27b"
"qwen3.6:35b"
"rinex20/translategemma3:12b"
"translategemma:12b" "translategemma:12b"
"translategemma:27b" "translategemma:27b"
"translategemma:4b" "translategemma:4b"
]; ];
models = "/zfs/storage/models"; models = "/zfs/models";
openFirewall = true; openFirewall = true;
}; };
} }
-10
View File
@@ -31,15 +31,5 @@
]; ];
fsWatcherEnabled = true; fsWatcherEnabled = true;
}; };
"recordings" = {
path = "/home/richie/recordings";
devices = [
"jeeves"
"phone"
"rhapsody-in-green"
];
fsWatcherEnabled = true;
};
}; };
} }
+1
View File
@@ -26,6 +26,7 @@
allowDiscards = true; allowDiscards = true;
keyFileSize = 4096; keyFileSize = 4096;
keyFile = "/dev/disk/by-id/usb-USB_SanDisk_3.2Gen1_03021630090925173333-0:0"; keyFile = "/dev/disk/by-id/usb-USB_SanDisk_3.2Gen1_03021630090925173333-0:0";
fallbackToPassword = true;
}; };
}; };
kernelModules = [ "kvm-intel" ]; kernelModules = [ "kvm-intel" ];
+2 -11
View File
@@ -4,21 +4,17 @@ let
in in
{ {
imports = [ imports = [
"${inputs.self}/users/dov"
"${inputs.self}/users/math"
"${inputs.self}/users/richie" "${inputs.self}/users/richie"
"${inputs.self}/users/steve" "${inputs.self}/users/math"
"${inputs.self}/users/dov"
"${inputs.self}/common/global" "${inputs.self}/common/global"
"${inputs.self}/common/optional/docker.nix" "${inputs.self}/common/optional/docker.nix"
"${inputs.self}/common/optional/monitoring-agent.nix"
"${inputs.self}/common/optional/ssh_decrypt.nix" "${inputs.self}/common/optional/ssh_decrypt.nix"
"${inputs.self}/common/optional/syncthing_base.nix" "${inputs.self}/common/optional/syncthing_base.nix"
"${inputs.self}/common/optional/update.nix" "${inputs.self}/common/optional/update.nix"
"${inputs.self}/common/optional/zerotier.nix" "${inputs.self}/common/optional/zerotier.nix"
./monitoring
./docker ./docker
./services ./services
./web_services
./hardware.nix ./hardware.nix
./networking.nix ./networking.nix
./programs.nix ./programs.nix
@@ -39,10 +35,5 @@ in
zerotierone.joinNetworks = [ "a09acf02330d37b9" ]; zerotierone.joinNetworks = [ "a09acf02330d37b9" ];
}; };
users.groups = {
nornsight = { };
nornsight-admin = { };
};
system.stateVersion = "24.05"; system.stateVersion = "24.05";
} }
+1
View File
@@ -9,6 +9,7 @@ let
inherit device; inherit device;
keyFileSize = 4096; keyFileSize = 4096;
keyFile = "/dev/disk/by-id/usb-XIAO_USB_Drive_24587CE29074-0:0"; keyFile = "/dev/disk/by-id/usb-XIAO_USB_Drive_24587CE29074-0:0";
fallbackToPassword = true;
}; };
makeLuksSSD = makeLuksSSD =
device: device:
@@ -1,426 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "CPU Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 6,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "RAM Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 0
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))",
"legendFormat": "{{instance}}",
"range": true,
"refId": "A"
}
],
"title": "Swap Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 0
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load1",
"legendFormat": "{{instance}} load1",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load5",
"legendFormat": "{{instance}} load5",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "node_load15",
"legendFormat": "{{instance}} load15",
"range": true,
"refId": "C"
}
],
"title": "Load",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 8
},
"id": 5,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "sum by (instance) (rate(node_disk_read_bytes_total[5m]))",
"legendFormat": "{{instance}} read",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "sum by (instance) (rate(node_disk_written_bytes_total[5m]))",
"legendFormat": "{{instance}} write",
"range": true,
"refId": "B"
}
],
"title": "Disk Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 8
},
"id": 6,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (1 - (node_filesystem_avail_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"} / node_filesystem_size_bytes{mountpoint=~\"(/|/home|/var|/zfs.*)\",fstype!=\"\"}))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{mountpoint}}",
"refId": "A"
}
],
"title": "Filesystem Usage",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 17
},
"id": 7,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top Grouped CPU",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 17
},
"id": 8,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top Grouped Memory",
"type": "table"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring"
],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Overview",
"uid": "monitor-overview",
"version": 1,
"weekStart": ""
}
@@ -1,216 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_cpu_seconds_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped CPU",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Resident Memory",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 10
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_read_bytes_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Read I/O",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 10
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(10, rate(namedprocess_namegroup_write_bytes_total[5m]))",
"legendFormat": "{{instance}} {{groupname}}",
"range": true,
"refId": "A"
}
],
"title": "Grouped Write I/O",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"process"
],
"templating": {
"list": []
},
"time": {
"from": "now-7d",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Process History Grouped",
"uid": "monitor-process-history",
"version": 1,
"weekStart": ""
}
@@ -1,224 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_cpu_seconds_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID CPU",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, namedprocess_namegroup_memory_bytes{memtype=\"resident\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID RSS",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 10
},
"id": 3,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_read_bytes_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID Read I/O",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"fieldConfig": {
"defaults": {
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 10
},
"id": 4,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-pid-short"
},
"editorMode": "code",
"expr": "topk(20, rate(namedprocess_namegroup_write_bytes_total[2m]))",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{groupname}}",
"refId": "A"
}
],
"title": "Top PID Write I/O",
"type": "table"
}
],
"refresh": "15s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"process"
],
"templating": {
"list": []
},
"time": {
"from": "now-10m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Process Live PID",
"uid": "monitor-process-pid",
"version": 1,
"weekStart": ""
}
@@ -1,351 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "100 * (zfs_pool_allocated_bytes / zfs_pool_size_bytes)",
"legendFormat": "{{instance}} {{pool}}",
"range": true,
"refId": "A"
}
],
"title": "Pool Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 0
},
"id": 2,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "zfs_pool_free_bytes",
"legendFormat": "{{instance}} {{pool}}",
"range": true,
"refId": "A"
}
],
"title": "Pool Free Bytes",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 0
},
"id": 3,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zfs_dataset_used_bytes{type=\"filesystem\"})",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{name}}",
"refId": "A"
}
],
"title": "Top Filesystems by Used Bytes",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "ns"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 8
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zpool_iostat_total_wait_read_ns{vdev!=\"_pool\"})",
"legendFormat": "{{host}} {{pool}} {{vdev}}",
"range": true,
"refId": "A"
}
],
"title": "ZFS Read Wait",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "ns"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 8
},
"id": 5,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "topk(20, zpool_iostat_total_wait_write_ns{vdev!=\"_pool\"})",
"legendFormat": "{{host}} {{pool}} {{vdev}}",
"range": true,
"refId": "A"
}
],
"title": "ZFS Write Wait",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 17
},
"id": 6,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "smartctl_device_temperature{temperature_type=\"current\"}",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{device}}",
"refId": "A"
}
],
"title": "Disk Temperature",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 17
},
"id": 7,
"options": {
"cellHeight": "sm",
"showHeader": true,
"sortBy": [
{
"desc": false,
"displayName": "Value"
}
]
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prom-main"
},
"editorMode": "code",
"expr": "smartctl_device_smart_status",
"format": "table",
"instant": true,
"legendFormat": "{{instance}} {{device}}",
"refId": "A"
}
],
"title": "SMART Health",
"type": "table"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"monitoring",
"zfs"
],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Storage and ZFS",
"uid": "monitor-storage",
"version": 1,
"weekStart": ""
}
-186
View File
@@ -1,186 +0,0 @@
{
lib,
pkgs,
...
}:
let
# Shared host paths; only `vars.database` is used in this module.
vars = import ../vars.nix;
# Parent directory for the Prometheus instances; each instance gets its own
# TSDB subdirectory below it (created by the tmpfiles rules in the module body).
prometheusDataRoot = "${vars.database}/prometheus";
mainPrometheusDataDir = "${prometheusDataRoot}/main";
pidPrometheusDataDir = "${prometheusDataRoot}/pid-short";
# YAML generator used by mkPrometheusConfig to render Nix attrsets to a file.
prometheusYaml = pkgs.formats.yaml { };
# Render `cfg` to "<name>.yaml" and run `promtool check config` on it at build
# time, so an invalid Prometheus configuration fails the build. The resulting
# store path is a copy of the validated file.
mkPrometheusConfig =
  name: cfg:
  let
    rendered = prometheusYaml.generate "${name}.yaml" cfg;
    # promtool is shipped in the `cli` output of the prometheus package.
    buildAttrs.nativeBuildInputs = [ pkgs.prometheus.cli ];
  in
  pkgs.runCommand "${name}-checked.yaml" buildAttrs ''
    promtool check config ${rendered}
    cp ${rendered} $out
  '';
# Build one `static_configs` entry: a single scrape `address` whose `instance`
# label is overridden with the friendly `host` name.
mkTarget = host: address: {
  labels = {
    instance = host;
  };
  targets = [ address ];
};
# Validated config for the "prometheus-main" instance: four jobs, each scraping
# the same two hosts on a job-specific port, every 30s.
mainPrometheusConfig =
  let
    # Scraped hosts, in the order they appear in every job's target list.
    hosts = [
      {
        name = "jeeves";
        ip = "192.168.90.40";
      }
      {
        name = "bob";
        ip = "192.168.90.25";
      }
    ];
    # One scrape job hitting `port` on every host above.
    mkJob = job: port: {
      job_name = job;
      static_configs = map (h: mkTarget h.name "${h.ip}:${toString port}") hosts;
    };
  in
  mkPrometheusConfig "prometheus-main" {
    global = {
      scrape_interval = "30s";
      scrape_timeout = "10s";
      evaluation_interval = "30s";
    };
    scrape_configs = [
      (mkJob "node" 9100)
      (mkJob "process_grouped" 9256)
      (mkJob "smartctl" 9633)
      (mkJob "zfs" 9134)
    ];
  };
# Validated config for the "prometheus-pid-short" instance: a single
# `process_pid` job scraping port 9257 on both hosts every 15s.
pidPrometheusConfig =
  let
    pidExporter = host: ip: mkTarget host "${ip}:9257";
  in
  mkPrometheusConfig "prometheus-pid-short" {
    scrape_configs = [
      {
        job_name = "process_pid";
        static_configs = [
          (pidExporter "jeeves" "192.168.90.40")
          (pidExporter "bob" "192.168.90.25")
        ];
      }
    ];
    global = {
      evaluation_interval = "15s";
      scrape_interval = "15s";
      scrape_timeout = "10s";
    };
  };
# Build the systemd service definition for one Prometheus instance.
#
# Arguments:
#   dataDir    - TSDB directory; also the working directory and the only
#                path listed in ReadWritePaths.
#   configFile - path passed as --config.file.
#   port       - TCP port; the listener is bound to 127.0.0.1 only.
#   retention  - value passed as --storage.tsdb.retention.time.
mkPrometheusService =
{
dataDir,
configFile,
port,
retention,
}:
{
# NOTE(review): the mount unit name is hard-coded rather than derived from
# dataDir; presumably it matches the dataset mounted at prometheusDataRoot.
# Confirm if vars.database ever changes.
after = [
"zfs-media-database-prometheus.mount"
"network.target"
];
requires = [ "zfs-media-database-prometheus.mount" ];
wantedBy = [ "multi-user.target" ];
unitConfig.RequiresMountsFor = [ dataDir ];
serviceConfig = {
# Arguments are shell-escaped as a list and appended to the prometheus binary.
ExecStart = "${lib.getExe pkgs.prometheus} ${
lib.escapeShellArgs [
"--config.file=${configFile}"
"--storage.tsdb.path=${dataDir}"
"--storage.tsdb.retention.time=${retention}"
"--web.listen-address=127.0.0.1:${toString port}"
]
}";
User = "prometheus";
Group = "prometheus";
Restart = "always";
RestartSec = "5s";
WorkingDirectory = dataDir;
# With ProtectSystem = "strict" below, this is the only path opened for writing.
ReadWritePaths = [ dataDir ];
# Sandboxing: drop all capabilities and restrict devices, kernel interfaces,
# namespaces, address families and system calls.
CapabilityBoundingSet = [ "" ];
DeviceAllow = [ "/dev/null rw" ];
DevicePolicy = "strict";
LockPersonality = true;
MemoryDenyWriteExecute = true;
NoNewPrivileges = true;
PrivateDevices = true;
PrivateTmp = true;
ProtectClock = true;
ProtectControlGroups = true;
ProtectHome = true;
ProtectHostname = true;
ProtectKernelLogs = true;
ProtectKernelModules = true;
ProtectKernelTunables = true;
ProtectProc = "invisible";
ProtectSystem = "strict";
RemoveIPC = true;
RestrictAddressFamilies = [
"AF_INET"
"AF_INET6"
"AF_UNIX"
];
RestrictNamespaces = true;
RestrictRealtime = true;
RestrictSUIDSGID = true;
SystemCallArchitectures = "native";
SystemCallFilter = [
"@system-service"
"~@privileged"
];
};
};
in
{
  # Dedicated system account shared by both Prometheus instances.
  users.groups.prometheus = { };
  users.users.prometheus = {
    description = "Prometheus daemon user";
    group = "prometheus";
    isSystemUser = true;
  };

  # Long-retention instance on 9090.
  systemd.services.prometheus-main = mkPrometheusService {
    port = 9090;
    retention = "90d";
    dataDir = mainPrometheusDataDir;
    configFile = mainPrometheusConfig;
  };

  # Short-retention instance on 9092.
  systemd.services.prometheus-pid-short = mkPrometheusService {
    port = 9092;
    retention = "10m";
    dataDir = pidPrometheusDataDir;
    configFile = pidPrometheusConfig;
  };

  # The data root is root-owned; each instance directory belongs to prometheus.
  systemd.tmpfiles.rules = [
    "d ${prometheusDataRoot} 0755 root root - -"
    "d ${mainPrometheusDataDir} 0750 prometheus prometheus - -"
    "d ${pidPrometheusDataDir} 0750 prometheus prometheus - -"
  ];
}
+19 -22
View File
@@ -1,13 +1,4 @@
{ {
# Docker loads br_netfilter on jeeves. Disable bridge netfilter so
# br-nix-builder behaves like a pure L2 bridge and bridged traffic
# does not hit the host firewall/rpfilter path.
boot.kernel.sysctl = {
"net.bridge.bridge-nf-call-arptables" = 0;
"net.bridge.bridge-nf-call-ip6tables" = 0;
"net.bridge.bridge-nf-call-iptables" = 0;
};
networking = { networking = {
hostName = "jeeves"; hostName = "jeeves";
hostId = "0e15ce35"; hostId = "0e15ce35";
@@ -43,18 +34,11 @@
}; };
}; };
networks = { networks = {
"10-Primary" = { "10-1GB_Primary" = {
matchConfig.Name = "enp97s0"; matchConfig.Name = "enp97s0f1";
address = [ "192.168.99.14/24" ]; address = [ "192.168.99.14/24" ];
dns = [
"192.168.99.1"
"2600:4040:abfb:d700::1"
];
routes = [ { Gateway = "192.168.99.1"; } ]; routes = [ { Gateway = "192.168.99.1"; } ];
vlan = [ "internet-vlan" ]; vlan = [ "internet-vlan" ];
dhcpV4Config.UseDNS = false;
dhcpV6Config.UseDNS = false;
ipv6AcceptRAConfig.UseDNS = false;
linkConfig.RequiredForOnline = "routable"; linkConfig.RequiredForOnline = "routable";
}; };
"50-internet-vlan" = { "50-internet-vlan" = {
@@ -65,10 +49,23 @@
"60-br-nix-builder" = { "60-br-nix-builder" = {
matchConfig.Name = "br-nix-builder"; matchConfig.Name = "br-nix-builder";
bridgeConfig = { }; bridgeConfig = { };
networkConfig = { address = [ "192.168.3.10/24" ];
IPv6AcceptRA = false; routingPolicyRules = [
LinkLocalAddressing = "no"; {
}; From = "192.168.3.0/24";
Table = 100;
Priority = 100;
}
];
routes = [
{
Gateway = "192.168.3.1";
Table = 100;
GatewayOnLink = false;
Metric = 2048;
PreferredSource = "192.168.3.10";
}
];
linkConfig.RequiredForOnline = "no"; linkConfig.RequiredForOnline = "no";
}; };
}; };
-1
View File
@@ -3,6 +3,5 @@
environment.systemPackages = with pkgs; [ environment.systemPackages = with pkgs; [
filebot filebot
docker-compose docker-compose
ffmpeg
]; ];
} }
+14 -1
View File
@@ -1,7 +1,20 @@
{ ... }: { pkgs, ... }:
{ {
imports = [ ./nix_builder.nix ]; imports = [ ./nix_builder.nix ];
users = {
users.github-runners = {
shell = pkgs.bash;
isSystemUser = true;
group = "github-runners";
uid = 601;
openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIA/S8i+BNX/12JNKg+5EKGX7Aqimt5KM+ve3wt/SyWuO github-runners" # cspell:disable-line
];
};
groups.github-runners.gid = 601;
};
services.nix_builder.containers = { services.nix_builder.containers = {
nix-builder-00.enable = true; nix-builder-00.enable = true;
nix-builder-01.enable = true; nix-builder-01.enable = true;
+31 -60
View File
@@ -2,7 +2,6 @@
config, config,
lib, lib,
outputs, outputs,
utils,
... ...
}: }:
@@ -10,8 +9,6 @@ with lib;
let let
vars = import ../vars.nix; vars = import ../vars.nix;
cfg = config.services.nix_builder; cfg = config.services.nix_builder;
runnerUsername = "gitea-runner";
runnerUserid = 601;
in in
{ {
options.services.nix_builder = { options.services.nix_builder = {
@@ -26,40 +23,37 @@ in
types.submodule ( types.submodule (
{ name, ... }: { name, ... }:
{ {
options.enable = mkEnableOption "Gitea runner container"; options.enable = mkEnableOption "GitHub runner container";
} }
) )
); );
default = { }; default = { };
description = "Gitea runner container configurations"; description = "GitHub runner container configurations";
}; };
}; };
config = { config = {
users = {
users.${runnerUsername} = {
isSystemUser = true;
group = runnerUsername;
uid = runnerUserid;
};
groups.${runnerUsername}.gid = runnerUserid;
};
containers = mapAttrs ( containers = mapAttrs (
name: containerCfg: name: containerCfg:
mkIf containerCfg.enable { mkIf containerCfg.enable {
autoStart = true; autoStart = true;
privateNetwork = true; privateNetwork = true;
hostBridge = cfg.bridgeName; hostBridge = cfg.bridgeName;
ephemeral = true;
bindMounts = { bindMounts = {
storage = {
hostPath = "/zfs/media/github-runners/${name}";
mountPoint = "/zfs/media/github-runners/${name}";
isReadOnly = false;
};
host-nix = { host-nix = {
mountPoint = "/host-nix/var/nix/daemon-socket"; mountPoint = "/host-nix/var/nix/daemon-socket";
hostPath = "/nix/var/nix/daemon-socket"; hostPath = "/nix/var/nix/daemon-socket";
isReadOnly = false; isReadOnly = false;
}; };
token = { pat = {
hostPath = "${vars.secrets}/services/gitea-runners"; hostPath = "${vars.secrets}/services/github-runners/runner_pat";
mountPoint = "/run/secrets/gitea-runners"; mountPoint = "${vars.secrets}/services/github-runners/runner_pat";
isReadOnly = true; isReadOnly = true;
}; };
}; };
@@ -98,69 +92,46 @@ in
"nix-command" "nix-command"
]; ];
sandbox = true; sandbox = true;
allowed-users = [ "gitea-runner" ]; allowed-users = [ "github-runners" ];
trusted-users = [ trusted-users = [
"root" "root"
"gitea-runner" "github-runners"
]; ];
}; };
nixpkgs = { nixpkgs = {
overlays = builtins.attrValues outputs.overlays; overlays = builtins.attrValues outputs.overlays;
config.allowUnfree = true; config.allowUnfree = true;
}; };
users = { services.github-runners.${name} = {
users.${runnerUsername} = {
isSystemUser = true;
group = runnerUsername;
uid = runnerUserid;
};
groups.${runnerUsername}.gid = runnerUserid;
};
services.gitea-actions-runner.instances.${name} = {
enable = true; enable = true;
name = "jeeves-${name}"; replace = true;
url = "http://192.168.99.14:6443/"; workDir = "/zfs/media/github-runners/${name}";
labels = [ url = "https://github.com/RichieCahill/dotfiles";
"self-hosted:host" extraLabels = [ "nixos" ];
"nixos:host" tokenFile = "${vars.secrets}/services/github-runners/runner_pat";
]; user = "github-runners";
tokenFile = "/run/secrets/gitea-runners/registration-token"; group = "github-runners";
hostPackages = with pkgs; [ extraPackages = with pkgs; [
bash
coreutils
curl
gawk
gitMinimal gitMinimal
gnused gh
my_python
nix
nixfmt nixfmt
nixos-rebuild nixos-rebuild
nodejs
treefmt treefmt
wget my_python
]; ];
}; };
systemd.services."gitea-runner-${utils.escapeSystemdPath name}" = { users = {
serviceConfig = { users.github-runners = {
DynamicUser = mkForce false; shell = pkgs.bash;
User = mkForce runnerUsername; isSystemUser = true;
Group = mkForce runnerUsername; group = "github-runners";
uid = 601;
}; };
groups.github-runners.gid = 601;
}; };
system.stateVersion = "24.05"; system.stateVersion = "24.05";
}; };
} }
) cfg.containers; ) cfg.containers;
systemd.services = builtins.listToAttrs (
map (name: {
name = "container@${name}";
value = {
requires = [ "gitea.service" ];
after = [ "gitea.service" ];
};
}) (builtins.attrNames (filterAttrs (_: c: c.enable) cfg.containers))
);
}; };
} }
+1 -4
View File
@@ -21,9 +21,7 @@ sudo zfs create media/secure/docker -o compression=zstd-9
sudo zfs create media/secure/github-runners -o compression=zstd-9 -o sync=disabled sudo zfs create media/secure/github-runners -o compression=zstd-9 -o sync=disabled
sudo zfs create media/secure/home_assistant -o compression=zstd-19 sudo zfs create media/secure/home_assistant -o compression=zstd-19
sudo zfs create media/secure/notes -o copies=2 sudo zfs create media/secure/notes -o copies=2
sudo zfs create media/secure/postgres -o mountpoint=/zfs/media/database/postgres -o recordsize=16k -o primarycache=metadata sudo zfs create media/secure/postgres -o recordsize=16k -o primarycache=metadata
sudo zfs create media/secure/postgres-wal -o mountpoint=/zfs/media/database/postgres-wal -o recordsize=32k -o primarycache=metadata -o special_small_blocks=32K -o compression=lz4 -o secondarycache=none -o logbias=latency
sudo zfs create media/secure/prometheus -o mountpoint=/zfs/media/database/prometheus -o compression=lz4
sudo zfs create media/secure/services -o compression=zstd-9 sudo zfs create media/secure/services -o compression=zstd-9
sudo zfs create media/secure/share -o mountpoint=/zfs/media/share -o exec=off sudo zfs create media/secure/share -o mountpoint=/zfs/media/share -o exec=off
@@ -42,4 +40,3 @@ sudo zfs create storage/secure/plex -o recordsize=1M -o compression=zstd-19
sudo zfs create storage/secure/secrets -o compression=zstd-19 -o copies=3 sudo zfs create storage/secure/secrets -o compression=zstd-19 -o copies=3
sudo zfs create storage/secure/syncthing -o compression=zstd-19 sudo zfs create storage/secure/syncthing -o compression=zstd-19
sudo zfs create storage/secure/transmission -o recordsize=1M -o compression=zstd-9 -o exec=off -o sync=disabled sudo zfs create storage/secure/transmission -o recordsize=1M -o compression=zstd-9 -o exec=off -o sync=disabled
sudo zfs create storage/secure/important -o compression=zstd-19 -o copies=2 -o mountpoint=/zfs/storage/important
+1 -4
View File
@@ -3,10 +3,7 @@ let
vars = import ../vars.nix; vars = import ../vars.nix;
in in
{ {
services.audiobookshelf = { services.audiobookshelf.enable = true;
enable = true;
port = 8000;
};
systemd.services.audiobookshelf.serviceConfig.WorkingDirectory = systemd.services.audiobookshelf.serviceConfig.WorkingDirectory =
lib.mkForce "${vars.docker_configs}/audiobookshelf"; lib.mkForce "${vars.docker_configs}/audiobookshelf";
users.users.audiobookshelf.home = lib.mkForce "${vars.docker_configs}/audiobookshelf"; users.users.audiobookshelf.home = lib.mkForce "${vars.docker_configs}/audiobookshelf";
@@ -0,0 +1,96 @@
{
pkgs,
inputs,
...
}:
let
# Environment shared by the firehose producer and consumer units.
commonEnv = {
# Puts the flake source on the module search path so `python -m python....`
# (see the ExecStart lines below) can be resolved.
PYTHONPATH = "${inputs.self}";
KAFKA_BOOTSTRAP_SERVERS = "localhost:9092";
BLUESKY_FIREHOSE_TOPIC = "bluesky.firehose.posts";
};
# systemd [Service] settings shared by the firehose producer and consumer;
# each unit adds its own ExecStart on top.
commonServiceConfig =
  let
    identity = {
      User = "richie";
      Group = "users";
    };
    lifecycle = {
      Type = "simple";
      Restart = "on-failure";
      RestartSec = "10s";
    };
    logging = {
      StandardOutput = "journal";
      StandardError = "journal";
    };
    # Read-only view of the system and home directories, private /tmp.
    sandbox = {
      NoNewPrivileges = true;
      PrivateTmp = true;
      ProtectHome = "read-only";
      ProtectSystem = "strict";
      ReadOnlyPaths = [ "${inputs.self}" ];
    };
  in
  identity
  // lifecycle
  // logging
  // sandbox
  // {
    WorkingDirectory = "${inputs.self}";
  };
in
{
# One-shot unit that creates the firehose topic (if it does not exist yet)
# once Kafka has been started. The producer and consumer units `require` it.
systemd.services.bluesky-firehose-topic-init = {
  description = "Create Kafka topic for Bluesky firehose";
  after = [ "apache-kafka.service" ];
  requires = [ "apache-kafka.service" ];
  wantedBy = [ "multi-user.target" ];
  serviceConfig = {
    Type = "oneshot";
    # Stay "active" after the script exits so dependants see it as satisfied.
    RemainAfterExit = true;
    # Bootstrap server and topic come from commonEnv so this unit can never
    # drift from what the producer/consumer are configured to use.
    ExecStart = pkgs.writeShellScript "create-bluesky-topic" ''
      ${pkgs.apacheKafka}/bin/kafka-topics.sh \
      --bootstrap-server ${commonEnv.KAFKA_BOOTSTRAP_SERVERS} \
      --create \
      --if-not-exists \
      --topic ${commonEnv.BLUESKY_FIREHOSE_TOPIC} \
      --partitions 6 \
      --replication-factor 1
    '';
  };
};
# Long-running producer; hard-depends on Kafka and on the topic having been
# created by the topic-init unit.
systemd.services.bluesky-firehose-producer =
  let
    kafkaUnits = [
      "apache-kafka.service"
      "bluesky-firehose-topic-init.service"
    ];
  in
  {
    description = "Bluesky Jetstream to Kafka producer";
    wantedBy = [ "multi-user.target" ];
    requires = kafkaUnits;
    after = [ "network.target" ] ++ kafkaUnits;
    environment = commonEnv;
    serviceConfig = commonServiceConfig // {
      ExecStart = "${pkgs.my_python}/bin/python -m python.data_science.firehose_producer";
    };
  };
# Long-running consumer; hard-depends on Kafka, the topic-init unit and
# PostgreSQL, and is pointed at the database through DATA_SCIENCE_DEV_* vars.
systemd.services.bluesky-firehose-consumer =
  let
    hardDeps = [
      "apache-kafka.service"
      "bluesky-firehose-topic-init.service"
      "postgresql.service"
    ];
    databaseEnv = {
      DATA_SCIENCE_DEV_DB = "data_science_dev";
      DATA_SCIENCE_DEV_USER = "richie";
      DATA_SCIENCE_DEV_HOST = "/run/postgresql";
      DATA_SCIENCE_DEV_PORT = "5432";
    };
  in
  {
    description = "Bluesky Kafka to PostgreSQL consumer";
    wantedBy = [ "multi-user.target" ];
    requires = hardDeps;
    after = [ "network.target" ] ++ hardDeps;
    environment = commonEnv // databaseEnv;
    serviceConfig = commonServiceConfig // {
      ExecStart = "${pkgs.my_python}/bin/python -m python.data_science.firehose_consumer";
    };
  };
}
@@ -1,80 +0,0 @@
{
...
}:
let
vars = import ../vars.nix;
in
{
# Host-side state directory that is bind-mounted into the container below.
systemd.tmpfiles.rules = [
"d ${vars.docker_configs}/camofox-browser 0750 root root - -"
];
# NixOS container running the camofox-browser server.
containers.camofox-browser = {
autoStart = true;
# Shares the host network namespace (no private network).
privateNetwork = false;
bindMounts = {
camofox-browser = {
hostPath = "${vars.docker_configs}/camofox-browser";
mountPoint = "/var/lib/camofox-browser";
isReadOnly = false;
};
};
# NixOS configuration evaluated inside the container.
config =
{
pkgs,
lib,
...
}:
{
networking.hostName = "camofox-browser";
environment.systemPackages = with pkgs; [
ffmpeg
git
nodejs
python3Packages.yt-dlp
];
systemd.services.camofox-browser = {
description = "Camofox browser server";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
environment = {
CAMOFOX_HOST = "127.0.0.1";
CAMOFOX_PORT = "9377";
HOME = "/var/lib/camofox-browser";
};
path = with pkgs; [
bash
coreutils
git
nodejs
];
serviceConfig = {
Restart = "always";
RestartSec = "5s";
WorkingDirectory = "/var/lib/camofox-browser";
};
# Start-up script: shallow-clone the upstream repo on first start, run
# `npm install` only when node_modules is absent, then exec `npm start`.
# NOTE(review): an existing checkout is never updated and an interrupted
# `npm install` leaves a node_modules directory that skips reinstall.
script = ''
set -eu
app_dir=/var/lib/camofox-browser/app
if [ ! -d "$app_dir/.git" ]; then
git clone --depth 1 https://github.com/jo-inc/camofox-browser "$app_dir"
fi
cd "$app_dir"
if [ ! -d node_modules ]; then
npm install
fi
exec npm start
'';
};
system.stateVersion = lib.mkDefault "24.05";
};
};
}
@@ -0,0 +1,17 @@
{ pkgs, ... }:
let
vars = import ../vars.nix;
in
{
# Runs `cloudflared tunnel run`; tunnel credentials are supplied through the
# EnvironmentFile.
systemd.services.cloud_flare_tunnel = {
  description = "cloud_flare_tunnel proxy's traffic through cloudflare";
  # network.target alone only means the network stack has been started; the
  # tunnel needs real connectivity, so also wait for network-online.target.
  after = [
    "network.target"
    "network-online.target"
  ];
  wants = [ "network-online.target" ];
  wantedBy = [ "multi-user.target" ];
  serviceConfig = {
    Type = "simple";
    EnvironmentFile = "${vars.secrets}/docker/cloud_flare_tunnel";
    ExecStart = "${pkgs.cloudflared}/bin/cloudflared --no-autoupdate tunnel run";
    Restart = "on-failure";
    # Space out restarts so a quick failure loop (e.g. upstream unreachable)
    # does not trip systemd's start rate limit and leave the tunnel down.
    RestartSec = "5s";
  };
};
}
+2 -9
View File
@@ -2,10 +2,7 @@ let
vars = import ../vars.nix; vars = import ../vars.nix;
in in
{ {
networking.firewall.allowedTCPPorts = [ networking.firewall.allowedTCPPorts = [ 6443 ];
6443
2223
];
services.gitea = { services.gitea = {
enable = true; enable = true;
@@ -21,17 +18,13 @@ in
createDatabase = false; createDatabase = false;
}; };
settings = { settings = {
actions = {
ENABLED = true;
DEFAULT_ACTIONS_URL = "github";
};
service.DISABLE_REGISTRATION = true; service.DISABLE_REGISTRATION = true;
server = { server = {
DOMAIN = "tmmworkshop.com"; DOMAIN = "tmmworkshop.com";
ROOT_URL = "https://gitea.tmmworkshop.com/"; ROOT_URL = "https://gitea.tmmworkshop.com/";
HTTP_PORT = 6443; HTTP_PORT = 6443;
SSH_PORT = 2223; SSH_PORT = 2223;
SSH_LISTEN_PORT = 2223; SSH_LISTEN_PORT = 2224;
START_SSH_SERVER = true; START_SSH_SERVER = true;
PUBLIC_URL_DETECTION = "auto"; PUBLIC_URL_DETECTION = "auto";
}; };
-80
View File
@@ -1,80 +0,0 @@
{
...
}:
let
# Shared host paths; `vars.services` and `vars.secrets` are used below.
vars = import ../vars.nix;
# Grafana state directory (created by the tmpfiles rule at the end of the module).
grafanaDataDir = "${vars.services}/grafana";
in
{
# Open Grafana's HTTP port on the host firewall.
networking.firewall.allowedTCPPorts = [ 3000 ];

# Grafana with file-provisioned dashboards and the two local Prometheus
# instances as datasources.
services.grafana =
  let
    # Datasource entry for a Prometheus instance on loopback; name and uid
    # are kept identical.
    promDatasource = name: port: {
      inherit name;
      uid = name;
      type = "prometheus";
      access = "proxy";
      editable = false;
      url = "http://127.0.0.1:${toString port}";
    };
  in
  {
    enable = true;
    dataDir = grafanaDataDir;

    settings = {
      database.type = "sqlite3";
      server = {
        http_addr = "192.168.90.40";
        http_port = 3000;
        root_url = "http://192.168.90.40:3000/";
      };
      # Secrets are read by Grafana from files at runtime via $__file{...}.
      security = {
        admin_user = "admin";
        admin_password = "$__file{${vars.secrets}/services/grafana/admin_password}";
        secret_key = "$__file{${vars.secrets}/services/grafana/secret_key}";
      };
    };

    provision = {
      enable = true;

      datasources.settings = {
        apiVersion = 1;
        prune = true;
        datasources = [
          (promDatasource "prom-main" 9090 // { isDefault = true; })
          (promDatasource "prom-pid-short" 9092)
        ];
      };

      dashboards.settings = {
        apiVersion = 1;
        providers = [
          {
            name = "monitoring";
            folder = "Monitoring";
            type = "file";
            options.path = ../monitoring/dashboards;
            updateIntervalSeconds = 30;
            # Dashboards are managed from the repository, not from the UI.
            editable = false;
            allowUiUpdates = false;
            disableDeletion = false;
          }
        ];
      };
    };
  };
# Order Grafana after both Prometheus instances it uses as datasources.
systemd.services.grafana.after = [
  "prometheus-main.service"
  "prometheus-pid-short.service"
];

# Ensure the Grafana state directory exists with the right ownership.
systemd.tmpfiles.rules = [
  "d ${grafanaDataDir} 0750 grafana grafana - -"
];
}
@@ -6,7 +6,6 @@ global
defaults defaults
log global log global
mode http mode http
option httplog
retries 3 retries 3
maxconn 2000 maxconn 2000
timeout connect 5s timeout connect 5s
@@ -23,38 +22,24 @@ defaults
#Application Setup #Application Setup
frontend ContentSwitching frontend ContentSwitching
bind *:80 v4v6 bind *:80 v4v6
bind *:443 v4v6 ssl crt /var/lib/acme/audiobookshelf.tmmworkshop.com/full.pem crt /var/lib/acme/cache.tmmworkshop.com/full.pem crt /var/lib/acme/jellyfin.tmmworkshop.com/full.pem crt /var/lib/acme/share.tmmworkshop.com/full.pem crt /var/lib/acme/gitea.tmmworkshop.com/full.pem crt /var/lib/acme/www.norn-sight.com/full.pem bind *:443 v4v6 ssl crt /zfs/storage/secrets/docker/cloudflare.pem
mode http mode http
# ACME challenge routing (must be first)
acl is_acme path_beg /.well-known/acme-challenge/
# tmmworkshop.com # tmmworkshop.com
acl host_audiobookshelf hdr(host) -i audiobookshelf.tmmworkshop.com acl host_audiobookshelf hdr(host) -i audiobookshelf.tmmworkshop.com
acl host_cache hdr(host) -i cache.tmmworkshop.com acl host_cache hdr(host) -i cache.tmmworkshop.com
acl host_jellyfin hdr(host) -i jellyfin.tmmworkshop.com acl host_jellyfin hdr(host) -i jellyfin.tmmworkshop.com
acl host_share hdr(host) -i share.tmmworkshop.com acl host_share hdr(host) -i share.tmmworkshop.com
acl host_gcw hdr(host) -i gcw.tmmworkshop.com
acl host_n8n hdr(host) -i n8n.tmmworkshop.com
acl host_gitea hdr(host) -i gitea.tmmworkshop.com acl host_gitea hdr(host) -i gitea.tmmworkshop.com
acl host_norn_sight hdr(host) -i www.norn-sight.com
# Hosts allowed to serve plain HTTP (add entries to skip the HTTPS redirect)
acl allow_http hdr(host) -i __none__
# acl allow_http hdr(host) -i example.tmmworkshop.com
# Redirect all HTTP to HTTPS unless on the allow list or ACME challenge
http-request redirect scheme https code 301 if !{ ssl_fc } !allow_http !is_acme
use_backend acme_challenge if is_acme
use_backend audiobookshelf_nodes if host_audiobookshelf use_backend audiobookshelf_nodes if host_audiobookshelf
use_backend cache_nodes if host_cache use_backend cache_nodes if host_cache
use_backend jellyfin if host_jellyfin use_backend jellyfin if host_jellyfin
use_backend share_nodes if host_share use_backend share_nodes if host_share
use_backend gcw_nodes if host_gcw
use_backend n8n if host_n8n
use_backend gitea if host_gitea use_backend gitea if host_gitea
use_backend norn_sight if host_norn_sight
backend acme_challenge
mode http
server acme 127.0.0.1:8402
backend audiobookshelf_nodes backend audiobookshelf_nodes
mode http mode http
@@ -75,10 +60,14 @@ backend share_nodes
mode http mode http
server server 127.0.0.1:8091 server server 127.0.0.1:8091
backend gcw_nodes
mode http
server server 127.0.0.1:8092
backend n8n
mode http
server server 127.0.0.1:5678
backend gitea backend gitea
mode http mode http
server server 127.0.0.1:6443 server server 127.0.0.1:6443
backend norn_sight
mode http
server server 127.0.0.1:8001
-7
View File
@@ -7,13 +7,6 @@ in
settings = { settings = {
listeners = [ "PLAINTEXT://localhost:9092" ]; listeners = [ "PLAINTEXT://localhost:9092" ];
"log.dirs" = [ vars.kafka ]; "log.dirs" = [ vars.kafka ];
"num.partitions" = 6;
"default.replication.factor" = 1;
"log.retention.hours" = 168;
"log.retention.bytes" = 10737418240;
"log.segment.bytes" = 1073741824;
"log.cleanup.policy" = "delete";
"auto.create.topics.enable" = false;
}; };
}; };
} }
-107
View File
@@ -1,107 +0,0 @@
{ pkgs, ... }:
let
# Shared host paths; `vars.services` and `vars.secrets` are used below.
vars = import ../vars.nix;
# Service home/state directory; the application checkout lives in `app` below it.
stateDir = "${vars.services}/nornsight";
appDir = "${stateDir}/app";
# Extra tools prepended to PATH by the start-up script.
binPath = pkgs.lib.makeBinPath [
pkgs.binutils
pkgs.libpq
pkgs.postgresql
pkgs.stdenv.cc
];
# Library search path exported as LD_LIBRARY_PATH and LIBRARY_PATH so libpq
# can be located at runtime (the script prints the `find_library('pq')` result).
libraryPath = pkgs.lib.makeLibraryPath [
pkgs.libpq
pkgs.postgresql.lib
];
{
# State directory owned by the service account.
systemd.tmpfiles.rules = [
  "d ${stateDir} 0750 nornsight nornsight - -"
];

# The `nornsight` group is referenced by the user, the tmpfiles rule above and
# the unit's `Group=`; declare it here so none of them depends on another
# module happening to define it.
users.groups.nornsight = { };

users.users.nornsight = {
  isSystemUser = true;
  group = "nornsight";
  home = stateDir;
};
# Norn Sight web service: on every start it clones/updates the application
# repository into appDir, syncs its Python environment with uv, and runs
# uvicorn on port 8001.
systemd.services.nornsight = {
description = "Norn Sight";
after = [ "network-online.target" ];
wants = [ "network-online.target" ];
wantedBy = [ "multi-user.target" ];
# uv is pinned to the Nix-provided Python 3.13 and forbidden from downloading
# interpreters; its cache and virtualenv live under stateDir.
environment = {
HOME = stateDir;
UV_CACHE_DIR = "${stateDir}/.cache/uv";
UV_PROJECT_ENVIRONMENT = "${appDir}/.venv";
UV_PYTHON = "${pkgs.python313}/bin/python3.13";
UV_PYTHON_DOWNLOADS = "never";
LD_LIBRARY_PATH = libraryPath;
LIBRARY_PATH = libraryPath;
PSYCOPG_IMPL = "python";
};
path = with pkgs; [
bash
coreutils
git
uv
];
serviceConfig = {
Type = "simple";
User = "nornsight";
Group = "nornsight";
# Leading "-" makes the file optional for systemd; the script below still
# aborts if NORN_SIGHT_REPO_URL ends up unset.
EnvironmentFile = "-${vars.secrets}/services/nornsight";
WorkingDirectory = stateDir;
Restart = "on-failure";
RestartSec = "5s";
StandardOutput = "journal";
StandardError = "journal";
NoNewPrivileges = true;
PrivateTmp = true;
ProtectHome = true;
ProtectSystem = "strict";
# Only the state directory is writable under ProtectSystem = "strict".
ReadWritePaths = [ stateDir ];
};
# Start-up script, in order:
#   1. extend PATH / library paths with the Nix-provided toolchain and libpq;
#   2. require NORN_SIGHT_REPO_URL, default the branch to "main";
#   3. delete the checkout if its origin URL no longer matches;
#   4. clone, or fetch + checkout + fast-forward pull the branch;
#   5. `uv sync --upgrade`, print library-path diagnostics, exec uvicorn.
script = ''
set -eu
export PATH="${binPath}:$PATH"
export LD_LIBRARY_PATH="${libraryPath}:''${LD_LIBRARY_PATH:-}"
export LIBRARY_PATH="${libraryPath}:''${LIBRARY_PATH:-}"
: "''${NORN_SIGHT_REPO_URL:?NORN_SIGHT_REPO_URL is required}"
branch="''${NORN_SIGHT_BRANCH:-main}"
if [ -d "${appDir}/.git" ]; then
current_origin="$(git -C "${appDir}" remote get-url origin)"
if [ "$current_origin" != "$NORN_SIGHT_REPO_URL" ]; then
rm -rf "${appDir}"
fi
fi
if [ ! -d "${appDir}/.git" ]; then
git clone --branch "$branch" "$NORN_SIGHT_REPO_URL" "${appDir}"
else
cd "${appDir}"
git fetch origin "$branch"
git checkout "$branch"
git pull --ff-only origin "$branch"
fi
cd "${appDir}"
uv sync --upgrade
uv run python - <<'PY'
import ctypes.util
import os
print(f"LD_LIBRARY_PATH={os.environ.get('LD_LIBRARY_PATH')}")
print(f"LIBRARY_PATH={os.environ.get('LIBRARY_PATH')}")
print(f"libpq={ctypes.util.find_library('pq')}")
PY
exec uv run uvicorn pipelines.web.main:app --host 0.0.0.0 --port 8001
'';
};
}
-16
View File
@@ -5,14 +5,9 @@ in
{ {
networking.firewall.allowedTCPPorts = [ 5432 ]; networking.firewall.allowedTCPPorts = [ 5432 ];
# Symlink pg_wal to a ZFS dataset on the special (metadata) vdev for fast WAL writes
# this is required for systemd sandboxing
systemd.services.postgresql.serviceConfig.ReadWritePaths = [ "/zfs/media/database/postgres-wal" ];
services.postgresql = { services.postgresql = {
enable = true; enable = true;
package = pkgs.postgresql_17_jit; package = pkgs.postgresql_17_jit;
extensions = ps: with ps; [ pgvector ];
enableTCPIP = true; enableTCPIP = true;
enableJIT = true; enableJIT = true;
dataDir = "${vars.database}/postgres"; dataDir = "${vars.database}/postgres";
@@ -38,9 +33,6 @@ in
# signalbot # signalbot
local signalbot signalbot trust local signalbot signalbot trust
# hedgedoc
local hedgedoc hedgedoc trust
# math # math
local postgres math trust local postgres math trust
host postgres math 127.0.0.1/32 trust host postgres math 127.0.0.1/32 trust
@@ -120,19 +112,11 @@ in
login = true; login = true;
}; };
} }
{
name = "hedgedoc";
ensureDBOwnership = true;
ensureClauses = {
login = true;
};
}
]; ];
ensureDatabases = [ ensureDatabases = [
"data_science_dev" "data_science_dev"
"hass" "hass"
"gitea" "gitea"
"hedgedoc"
"math" "math"
"n8n" "n8n"
"richie" "richie"
+57
View File
@@ -0,0 +1,57 @@
{
pkgs,
inputs,
...
}:
let
vars = import ../vars.nix;
in
{
# Dedicated system account and group the signal-bot unit runs as.
users.groups.signalbot = { };
users.users.signalbot = {
  group = "signalbot";
  isSystemUser = true;
};
# Signal command-and-control bot. Talks to the signal-cli REST API container
# and to PostgreSQL over the local unix socket directory (SIGNALBOT_HOST).
systemd.services.signal-bot = {
  description = "Signal command and control bot";
  # Order after (and pull in) both backing services. PostgreSQL is reached
  # through /run/postgresql, so start it first instead of relying on the
  # 10s restart loop to paper over a boot-time race.
  after = [
    "network.target"
    "podman-signal_cli_rest_api.service"
    "postgresql.service"
  ];
  wants = [
    "podman-signal_cli_rest_api.service"
    "postgresql.service"
  ];
  wantedBy = [ "multi-user.target" ];

  environment = {
    # Flake source on the module path so `python -m python.signal_bot.main` resolves.
    PYTHONPATH = "${inputs.self}";
    SIGNALBOT_DB = "signalbot";
    SIGNALBOT_USER = "signalbot";
    SIGNALBOT_HOST = "/run/postgresql";
    SIGNALBOT_PORT = "5432";
  };

  serviceConfig = {
    Type = "simple";
    WorkingDirectory = "${inputs.self}";
    User = "signalbot";
    Group = "signalbot";
    EnvironmentFile = "${vars.secrets}/services/signal-bot";
    ExecStart = "${pkgs.my_python}/bin/python -m python.signal_bot.main";
    # Creates /var/lib/signal-bot owned by the service user.
    StateDirectory = "signal-bot";
    Restart = "on-failure";
    RestartSec = "10s";
    StandardOutput = "journal";
    StandardError = "journal";
    # Sandbox: read-only system and home, private /tmp; only the state
    # directory is writable.
    NoNewPrivileges = true;
    ProtectSystem = "strict";
    ProtectHome = "read-only";
    PrivateTmp = true;
    ReadWritePaths = [ "/var/lib/signal-bot" ];
    ReadOnlyPaths = [
      "${inputs.self}"
    ];
  };
};
}
@@ -1,6 +1,7 @@
zpool = ["root_pool", "storage", "media"] zpool = ["root_pool", "storage", "media"]
services = [ services = [
"audiobookshelf", "audiobookshelf",
"cloud_flare_tunnel",
"haproxy", "haproxy",
"docker", "docker",
"home-assistant", "home-assistant",
+32 -72
View File
@@ -4,7 +4,6 @@ hourly = 24
daily = 0 daily = 0
monthly = 0 monthly = 0
# root_pool
["root_pool/home"] ["root_pool/home"]
15_min = 8 15_min = 8
hourly = 24 hourly = 24
@@ -28,96 +27,57 @@ monthly = 0
hourly = 24 hourly = 24
daily = 30 daily = 30
monthly = 6 monthly = 6
# storage
["storage/ollama"]
15_min = 2
hourly = 0
daily = 0
monthly = 0
["storage/secure"] ["storage/plex"]
15_min = 0
hourly = 0
daily = 0
monthly = 0
["storage/secure/plex"]
15_min = 6 15_min = 6
hourly = 2 hourly = 2
daily = 1 daily = 1
monthly = 0 monthly = 0
["storage/secure/transmission"] ["media/plex"]
15_min = 4 15_min = 6
hourly = 0 hourly = 2
daily = 0 daily = 1
monthly = 0 monthly = 0
["storage/secure/secrets"] ["media/notes"]
15_min = 8 15_min = 8
hourly = 24 hourly = 24
daily = 30 daily = 30
monthly = 12 monthly = 12
# media ["media/docker"]
["media/temp"] 15_min = 3
15_min = 2 hourly = 12
hourly = 0 daily = 14
daily = 0 monthly = 2
monthly = 0
["media/services"]
["media/secure"] 15_min = 3
15_min = 0 hourly = 12
hourly = 0 daily = 14
daily = 0 monthly = 2
monthly = 0
["media/home_assistant"]
["media/secure/plex"]
15_min = 6
hourly = 2
daily = 1
monthly = 0
["media/secure/postgres-wal"]
15_min = 4
hourly = 2
daily = 0
monthly = 0
["media/secure/postgres"]
15_min = 8
hourly = 24
daily = 7
monthly = 0
["media/secure/share"]
15_min = 4
hourly = 0
daily = 0
monthly = 0
["media/secure/github-runners"]
15_min = 6
hourly = 2
daily = 1
monthly = 0
["media/secure/notes"]
15_min = 8
hourly = 24
daily = 30
monthly = 12
["media/secure/docker"]
15_min = 3 15_min = 3
hourly = 12 hourly = 12
daily = 14 daily = 14
monthly = 2 monthly = 2
# scratch
["scratch/transmission"] ["scratch/transmission"]
15_min = 2 15_min = 0
hourly = 0
daily = 0
monthly = 0
["storage/transmission"]
15_min = 0
hourly = 0
daily = 0
monthly = 0
["storage/ollama"]
15_min = 0
hourly = 0 hourly = 0
daily = 0 daily = 0
monthly = 0 monthly = 0
+1 -18
View File
@@ -10,14 +10,6 @@ in
settings = { settings = {
devices.davids-server.id = "7GXTDGR-AOXFW2O-K6J7NM3-XYZNRRW-AKHAFWM-GBOWUPQ-OA6JIWD-ER7RDQL"; # cspell:disable-line devices.davids-server.id = "7GXTDGR-AOXFW2O-K6J7NM3-XYZNRRW-AKHAFWM-GBOWUPQ-OA6JIWD-ER7RDQL"; # cspell:disable-line
folders = { folders = {
photos = {
path = "${vars.syncthing}/important";
devices = [
"rhapsody-in-green"
"phone"
];
fsWatcherEnabled = true;
};
"dotfiles" = { "dotfiles" = {
path = "/home/richie/dotfiles"; path = "/home/richie/dotfiles";
devices = [ devices = [
@@ -97,16 +89,7 @@ in
]; ];
fsWatcherEnabled = true; fsWatcherEnabled = true;
}; };
"recordings" = { #
path = "/home/richie/recordings";
devices = [
"bob"
"phone"
"rhapsody-in-green"
];
fsWatcherEnabled = true;
};
# davids-server
"davids-backup1" = { "davids-backup1" = {
id = "8229p-8z3tm"; # cspell:disable-line id = "8229p-8z3tm"; # cspell:disable-line
path = "${vars.syncthing}/davids_backups/1"; path = "${vars.syncthing}/davids_backups/1";
-74
View File
@@ -1,74 +0,0 @@
# ACME certificates for HAProxy: one certificate per public host name, issued
# through the webroot (HTTP-01) method and served by a loopback-only nginx.
let
  baseDomain = "tmmworkshop.com";
  challengeRoot = "/var/lib/acme/.challenges";

  # Sub-domains of baseDomain that get their own certificate.
  subdomains = [
    "audiobookshelf"
    "cache"
    "gitea"
    "jellyfin"
    "share"
  ];

  # Fully-qualified names outside baseDomain.
  extraDomains = [ "www.norn-sight.com" ];

  # Every certificate name, sub-domains first, then the extras.
  certNames = map (sub: "${sub}.${baseDomain}") subdomains ++ extraDomains;

  # Settings shared by every certificate.
  certSettings = {
    webroot = challengeRoot;
    group = "acme";
    reloadServices = [ "haproxy.service" ];
  };

  # systemd units named after each certificate.
  acmeServices = map (name: "acme-${name}.service") certNames;
in
{
  users.users.haproxy.extraGroups = [ "acme" ];
  users.users.nginx.extraGroups = [ "acme" ];

  security.acme = {
    acceptTerms = true;
    defaults.email = "Richie@tmmworkshop.com";
    certs = builtins.listToAttrs (
      map (name: {
        inherit name;
        value = certSettings;
      }) certNames
    );
  };

  # Minimal nginx to serve ACME HTTP-01 challenge files for HAProxy.
  services.nginx = {
    enable = true;
    virtualHosts."acme-challenge" = {
      listen = [
        {
          addr = "127.0.0.1";
          port = 8402;
        }
      ];
      locations."/.well-known/acme-challenge/".root = challengeRoot;
    };
  };

  # Ensure the challenge directory tree exists with the expected ownership.
  systemd.tmpfiles.rules = [
    "d ${challengeRoot} 0750 acme acme - -"
    "d ${challengeRoot}/.well-known 0750 acme acme - -"
    "d ${challengeRoot}/.well-known/acme-challenge 0750 acme acme - -"
  ];

  # HAProxy needs certs to exist before it can bind :443.
  # NixOS's acme module generates self-signed placeholders on first boot
  # via acme-<domain>.service, so HAProxy only has to wait for those units.
  systemd.services.haproxy = {
    after = acmeServices;
    wants = acmeServices;
  };
}
-9
View File
@@ -1,9 +0,0 @@
# Import every entry in this directory whose name ends in ".nix", except
# default.nix itself.
{ lib, ... }:
let
  entries = builtins.attrNames (builtins.readDir ./.);
  isModuleFile = name: lib.hasSuffix ".nix" name && name != "default.nix";
  toPath = name: ./. + "/${name}";
in
{
  imports = map toPath (builtins.filter isModuleFile entries);
}
@@ -1,35 +0,0 @@
# NixOS module: systemd unit that runs the agent logger's Python entry point
# as user "richie" with a private state directory.
{
  pkgs,
  inputs,
  ...
}:
let
  # Flake source tree; Python import root for the service.
  sourceTree = "${inputs.self}";
  homeDir = "/home/richie";
in
{
  systemd.services.agent-logger = {
    description = "Unified agent logger";
    wantedBy = [ "multi-user.target" ];
    after = [ "local-fs.target" ];

    environment = {
      # Lives inside the unit's StateDirectory (/var/lib/agent-logger).
      AGENT_LOG_DB = "/var/lib/agent-logger/agent_log.sqlite";
      HOME = homeDir;
      PYTHONPATH = sourceTree;
    };

    serviceConfig = {
      Type = "simple";
      User = "richie";
      WorkingDirectory = homeDir;
      ExecStart = "${pkgs.my_python}/bin/python -m python.agent_logger.main";
      StateDirectory = "agent-logger";

      # Restart policy.
      Restart = "on-failure";
      RestartSec = "5s";

      # Logging.
      StandardOutput = "journal";
      StandardError = "journal";

      # Sandboxing.
      NoNewPrivileges = true;
      ProtectSystem = "strict";
      ProtectHome = "read-only";
      PrivateTmp = true;
      ReadOnlyPaths = [ sourceTree ];
    };
  };
}
+2 -11
View File
@@ -11,8 +11,8 @@
"${inputs.self}/common/optional/yubikey.nix" "${inputs.self}/common/optional/yubikey.nix"
"${inputs.self}/common/optional/zerotier.nix" "${inputs.self}/common/optional/zerotier.nix"
./hardware.nix ./hardware.nix
./llms.nix
./open_webui.nix ./open_webui.nix
./programs.nix
./qmk.nix ./qmk.nix
./syncthing.nix ./syncthing.nix
inputs.nixos-hardware.nixosModules.framework-13-7040-amd inputs.nixos-hardware.nixosModules.framework-13-7040-amd
@@ -23,20 +23,11 @@
hostId = "6404140d"; hostId = "6404140d";
firewall = { firewall = {
enable = true; enable = true;
allowedTCPPorts = [ allowedTCPPorts = [ ];
8000
8080
8081
];
}; };
networkmanager.enable = true; networkmanager.enable = true;
}; };
programs.appimage = {
enable = true;
binfmt = true; # allows *.AppImage to be run directly
};
services = { services = {
openssh.ports = [ 922 ]; openssh.ports = [ 922 ];
flatpak.enable = true; flatpak.enable = true;
+30
View File
@@ -0,0 +1,30 @@
# NixOS module: loopback-only Ollama with a fixed model list, with both the
# server and the model loader run at the lowest CPU niceness.
let
  # Scheduling settings shared by the server and the model loader.
  lowPriority = {
    Nice = 19;
    IOSchedulingPriority = 7;
  };
in
{
  services.ollama = {
    enable = true;
    user = "ollama";
    host = "127.0.0.1";
    syncModels = true;
    loadModels = [
      "deepscaler:1.5b"
      "deepseek-r1:8b"
      "gemma3:12b"
      "gemma3:27b"
      "gpt-oss:20b"
      "lfm2:24b"
      "qwen3:14b"
      "qwen3.5:27b"
    ];
  };

  systemd.services.ollama.serviceConfig = lowPriority;

  # The loader is additionally given a reduced CPU weight and the idle I/O class.
  systemd.services.ollama-model-loader.serviceConfig = lowPriority // {
    CPUWeight = 50;
    IOSchedulingClass = "idle";
  };
}
-1
View File
@@ -1,7 +1,6 @@
{ {
services.open-webui = { services.open-webui = {
enable = true; enable = true;
host = "0.0.0.0";
environment = { environment = {
ANONYMIZED_TELEMETRY = "False"; ANONYMIZED_TELEMETRY = "False";
DO_NOT_TRACK = "True"; DO_NOT_TRACK = "True";
-6
View File
@@ -1,6 +0,0 @@
# Extra system-wide packages for this host.
{ pkgs, ... }:
{
  environment.systemPackages = [
    pkgs.ffmpeg
  ];
}
-17
View File
@@ -39,14 +39,6 @@
]; ];
fsWatcherEnabled = true; fsWatcherEnabled = true;
}; };
photos = {
path = "/home/richie/photos";
devices = [
"jeeves"
"phone"
];
fsWatcherEnabled = true;
};
"projects" = { "projects" = {
id = "vyma6-lqqrz"; # cspell:disable-line id = "vyma6-lqqrz"; # cspell:disable-line
path = "/home/richie/projects"; path = "/home/richie/projects";
@@ -63,15 +55,6 @@
]; ];
fsWatcherEnabled = true; fsWatcherEnabled = true;
}; };
"recordings" = {
path = "/home/richie/recordings";
devices = [
"bob"
"jeeves"
"phone"
];
fsWatcherEnabled = true;
};
"vault" = { "vault" = {
path = "/home/richie/vault"; path = "/home/richie/vault";
devices = [ devices = [
-986
View File
@@ -1,986 +0,0 @@
"""test_audible_convert."""
from __future__ import annotations
import json
import subprocess
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker
from python.orm.richie import Audiobook, AudiobookAuthor, AudiobookSeries, RichieBase
from python.tools.audiobook import audible_convert, metadata_agent
from python.tools.audiobook.metadata_agent import StandardBookMetadata, standard_book_metadata
class FakeOllamaResponse:
    """Stand-in for an HTTP response object that carries a canned JSON payload."""

    def __init__(self, payload):
        """Remember the payload that ``json()`` will return."""
        self._payload = payload

    def raise_for_status(self):
        """Return None without raising; this fake never reports an HTTP error."""

    def json(self):
        """Return the stored payload unchanged."""
        return self._payload
class FakeFfprobeError(RuntimeError):
    """RuntimeError whose string form is always the fixed text "bad ffprobe"."""

    def __str__(self):
        """Return the fixed message regardless of constructor arguments."""
        message = "bad ffprobe"
        return message
@pytest.fixture
def audiobook_engine():
    """Yield an in-memory SQLite engine seeded with three authors and four series.

    All ``RichieBase`` tables are created before seeding, and the engine is
    disposed once the test using the fixture has finished.
    """
    engine = create_engine("sqlite+pysqlite:///:memory:", future=True)
    RichieBase.metadata.create_all(engine)

    authors = [
        AudiobookAuthor(id=1, name="glynn_stewart"),
        AudiobookAuthor(id=2, name="craig_alanson"),
        AudiobookAuthor(id=4, name="dennis_e_taylor"),
    ]
    series = [
        AudiobookSeries(id=1, name="starships_mage", author_id=1),
        AudiobookSeries(id=2, name="black_fleet_trilogy", author_id=1),
        AudiobookSeries(id=3, name="expeditionary_force", author_id=2),
        AudiobookSeries(id=4, name="bobiverse", author_id=4),
    ]

    session_factory = sessionmaker(bind=engine, expire_on_commit=False, future=True)
    with session_factory() as session:
        session.add_all([*authors, *series])
        session.commit()

    yield engine
    engine.dispose()
def install_fake_ollama(monkeypatch, payloads):
    """Replace ``metadata_agent.httpx.post`` with a fake that replays ``payloads``.

    Each call to the fake removes the first remaining payload from ``payloads``
    (the caller's list is consumed in place) and returns it wrapped in a
    ``FakeOllamaResponse``.

    Returns:
        The list that accumulates one ``(args, kwargs)`` tuple per fake call.
    """
    recorded = []

    def fake_post(*args, **kwargs):
        recorded.append((args, kwargs))
        next_payload = payloads.pop(0)
        return FakeOllamaResponse(next_payload)

    monkeypatch.setattr(metadata_agent.httpx, "post", fake_post)
    return recorded
def conversion_config(output_directory, *, dry_run=False, overwrite=False):
    """Build a ``ConversionConfig`` for tests.

    Uses a fixed API key, a default ``AgentConfig``, a fresh in-memory SQLite
    engine, and no activation bytes; only the output directory and the two
    flags vary.
    """
    agent_config = metadata_agent.AgentConfig()
    engine = create_engine("sqlite+pysqlite:///:memory:")
    return audible_convert.ConversionConfig(
        resolved_output=output_directory,
        ollama_api_key="test-key",
        agent_config=agent_config,
        engine=engine,
        activation_bytes=None,
        dry_run=dry_run,
        overwrite=overwrite,
    )
def sqlite_engine():
    """Return a new engine bound to an in-memory SQLite database."""
    engine = create_engine("sqlite+pysqlite:///:memory:")
    return engine
def tool_response(name, arguments):
    """Build a chat payload in which the assistant requests a single tool call.

    Args:
        name: Function name placed in the tool call.
        arguments: Argument mapping placed in the tool call.

    Returns:
        A dict with one ``message`` key holding an assistant message with empty
        content and a one-element ``tool_calls`` list.
    """
    call = {"function": {"name": name, "arguments": arguments}}
    message = {
        "role": "assistant",
        "content": "",
        "tool_calls": [call],
    }
    return {"message": message}
def final_response(metadata):
    """Build a chat payload whose assistant content is ``metadata`` encoded as JSON."""
    content = json.dumps(metadata)
    message = {"role": "assistant", "content": content}
    return {"message": message}
def fenced_final_response(metadata):
    """Build a chat payload whose content is ``metadata`` as JSON inside a ```json fence."""
    encoded = json.dumps(metadata)
    fenced = "```json\n" + encoded + "\n```"
    return {"message": {"role": "assistant", "content": fenced}}
def test_output_stem_uses_catalog_slugs() -> None:
    """output_stem yields "<author>-<series>_<two-digit index>-<title>" for a series book."""
    book = StandardBookMetadata(
        author_id=1,
        author="glynn_stewart",
        book_id=None,
        title="title-slug",
        series_id=1,
        series="starships_mage",
        series_index=1,
        confidence=0.96,
        needs_review=False,
        evidence=["test"],
    )

    stem = audible_convert.output_stem(book)

    assert stem == "glynn_stewart-starships_mage_01-title-slug"
def test_convert_aax_file_runs_ffmpeg(tmp_path, monkeypatch) -> None:
    """convert_aax_file issues exactly one ffmpeg copy command and creates the output directory."""
    recorded = []

    def fake_run_command(arguments, *, capture=False):
        # The conversion is expected to run without capturing output.
        assert capture is False
        recorded.append(arguments)
        return subprocess.CompletedProcess(arguments, 0, "", "")

    source = tmp_path / "book.aax"
    destination = tmp_path / "book" / "book.m4b"
    monkeypatch.setattr(audible_convert, "run_command", fake_run_command)

    audible_convert.convert_aax_file(source, destination, "abc123", overwrite=False)

    expected_command = [
        "ffmpeg",
        "-hide_banner",
        "-n",
        "-activation_bytes",
        "abc123",
        "-i",
        str(source),
        "-map_metadata",
        "0",
        "-c",
        "copy",
        str(destination),
    ]
    assert recorded == [expected_command]
    assert destination.parent.is_dir()
def test_run_command_redacts_activation_bytes_in_logs_and_errors(monkeypatch, caplog) -> None:
    """A failing command must not leak the activation bytes into logs or the raised error."""

    def fake_run(arguments, *, check, capture_output, text):
        assert check is True
        assert capture_output is False
        assert text is True
        raise subprocess.CalledProcessError(1, arguments)

    monkeypatch.setattr(audible_convert.subprocess, "run", fake_run)
    caplog.set_level("DEBUG", audible_convert.__name__)
    command = ["ffmpeg", "-activation_bytes", "secret-token", "-i", "book.aax"]

    with pytest.raises(audible_convert.CommandExecutionError) as error:
        audible_convert.run_command(command)

    logged = caplog.text
    raised = str(error.value)
    assert "secret-token" not in logged
    assert "secret-token" not in raised
    assert "<redacted>" in logged
    assert "<redacted>" in raised
def test_write_agent_log_serializes_metadata_as_json_object(tmp_path) -> None:
    """write_agent_log writes one JSON record with nested metadata and the path as a string."""
    book = StandardBookMetadata(
        author_id=1,
        author="glynn_stewart",
        book_id=None,
        title="starship-mage",
        series_id=1,
        series="starships_mage",
        series_index=1,
        confidence=0.95,
        needs_review=False,
        evidence=["test"],
    )
    log_file = tmp_path / "agent.jsonl"

    metadata_agent.write_agent_log(log_file, "final_metadata", metadata=book, path=tmp_path)

    # The whole file must parse as a single JSON document.
    record = json.loads(log_file.read_text(encoding="utf-8"))
    logged_book = record["metadata"]
    assert record["event"] == "final_metadata"
    assert logged_book["author"] == "glynn_stewart"
    assert logged_book["title"] == "starship-mage"
    assert record["path"] == str(tmp_path)
def test_system_prompt_instructs_agent_to_detect_omnibuses() -> None:
    """The system prompt contains the omnibus / box-set handling instructions."""
    prompt = metadata_agent.system_prompt()
    required_phrases = (
        "Detect omnibus or box-set editions",
        "books-1-3",
        "Keep series_index as the",
    )
    for phrase in required_phrases:
        assert phrase in prompt
def test_standard_book_metadata_accepts_valid_tool_output(tmp_path, monkeypatch, audiobook_engine) -> None:
    """Two searches plus a valid final answer produce accepted metadata, a log, and a book row."""
    log_file = tmp_path / "agent.jsonl"
    responses = [
        tool_response("search_authors", {"query": "Glynn Stewart"}),
        tool_response("search_series", {"query": "starships_mage"}),
        final_response(
            {
                "author_id": 1,
                "book_id": None,
                "title": "starship-mage",
                "series_id": 1,
                "series_index": 1,
                "confidence": 0.95,
                "evidence": ["filename and catalog match"],
            },
        ),
    ]
    install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "Starship Mage.aax",
        {"title": "Starship Mage", "artist": "Glynn Stewart"},
        audiobook_engine,
        log_file,
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    expected = StandardBookMetadata(
        author_id=1,
        author="glynn_stewart",
        book_id=1,
        title="starship-mage",
        series_id=1,
        series="starships_mage",
        series_index=1,
        confidence=0.95,
        needs_review=False,
        evidence=["filename and catalog match"],
    )
    assert metadata == expected

    # The agent log is JSON Lines: one record per line.
    records = [json.loads(line) for line in log_file.read_text(encoding="utf-8").splitlines()]
    sent = [entry for entry in records if entry["event"] == "llm_messages_sent"]
    received = [entry for entry in records if entry["event"] == "llm_message_received"]
    first_messages = sent[0]["messages"]
    assert first_messages[0]["role"] == "system"
    assert "Starship Mage" in first_messages[1]["content"]
    assert received[0]["message"]["tool_calls"][0]["function"]["name"] == "search_authors"

    with Session(audiobook_engine) as session:
        book = session.get(Audiobook, 1)
        assert book.title == "starship-mage"
        assert book.author.name == "glynn_stewart"
def test_standard_book_metadata_uses_agent_config(tmp_path, monkeypatch, audiobook_engine) -> None:
    """URL, timeout, model, and tool list from a custom AgentConfig reach the first HTTP request."""
    config = metadata_agent.AgentConfig(
        model="custom-model",
        ollama_chat_url="https://ollama.example.test/api/chat",
        http_timeout_seconds=12,
        max_agent_turns=1,
        min_confidence=0.5,
        tool_names=("search_authors",),
    )
    responses = [
        tool_response("search_authors", {"query": "Glynn Stewart"}),
        final_response(
            {
                "author_id": 1,
                "book_id": None,
                "title": "standalone-book",
                "series_id": None,
                "series_index": 0,
                "confidence": 0.5,
                "evidence": ["custom config"],
            },
        ),
    ]
    calls = install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "Standalone Book.aax",
        {"title": "Standalone Book", "artist": "Glynn Stewart"},
        audiobook_engine,
        tmp_path / "agent.jsonl",
        "test-key",
        config=config,
    )

    # Each recorded call is an (args, kwargs) pair; the URL is the first positional argument.
    first_args, first_kwargs = calls[0]
    request_body = first_kwargs["json"]
    offered_tools = [schema["function"]["name"] for schema in request_body["tools"]]

    assert first_args[0] == "https://ollama.example.test/api/chat"
    assert first_kwargs["timeout"] == 12
    assert request_body["model"] == "custom-model"
    assert offered_tools == ["search_authors"]
    assert metadata.needs_review is False
    assert metadata.series == "standalone"
def test_standard_book_metadata_retries_invalid_json_then_needs_review(
    tmp_path,
    monkeypatch,
    audiobook_engine,
) -> None:
    """Two consecutive unparseable final answers end in zero-confidence needs-review metadata."""
    # Two separate payload dicts whose content is the truncated JSON text "{".
    broken_finals = [{"message": {"role": "assistant", "content": "{"}} for _ in range(2)]
    responses = [
        tool_response("search_authors", {"query": "Glynn Stewart"}),
        tool_response("search_series", {"query": "Starship Mage"}),
        *broken_finals,
    ]
    install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "Starship Mage.aax",
        {"title": "Starship Mage"},
        audiobook_engine,
        tmp_path / "agent.jsonl",
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    assert metadata.needs_review is True
    assert metadata.confidence == 0
def test_standard_book_metadata_accepts_fenced_final_json(
    tmp_path,
    monkeypatch,
    audiobook_engine,
) -> None:
    """A final answer wrapped in a ```json fence is still parsed and accepted."""
    final_answer = {
        "author_id": 4,
        "book_id": None,
        "title": "all-these-worlds",
        "series_id": 4,
        "series_index": 3,
        "confidence": 0.95,
        "evidence": ["fenced json from model"],
    }
    responses = [
        tool_response("search_authors", {"query": "Dennis E. Taylor"}),
        tool_response("search_series", {"query": "Bobiverse", "author_id": 4}),
        tool_response("search_books", {"query": "All These Worlds", "author_id": 4, "series_id": 4}),
        fenced_final_response(final_answer),
    ]
    install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "All These Worlds.aax",
        {"title": "All These Worlds: Bobiverse, Book 3", "artist": "Dennis E. Taylor"},
        audiobook_engine,
        tmp_path / "agent.jsonl",
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    assert metadata.needs_review is False
    assert metadata.author == "dennis_e_taylor"
    assert metadata.series == "bobiverse"
    assert metadata.title == "all-these-worlds"
def test_standard_book_metadata_recovers_from_tool_validation_error(
    tmp_path,
    monkeypatch,
    audiobook_engine,
) -> None:
    """After a logged tool validation error the agent can still finish with a standalone book.

    The expected end state has the new author and a standalone book row, but
    no series row with id 5.
    """
    log_file = tmp_path / "agent.jsonl"
    rejected_book_call = tool_response(
        "ensure_book",
        {
            "title": "The Road",
            "author_id": 5,
            "series_id": 5,
            "series_index": 0,
        },
    )
    standalone_final = final_response(
        {
            "author_id": 5,
            "book_id": None,
            "title": "The Road",
            "series_id": None,
            "series_index": 0,
            "confidence": 0.9,
            "evidence": ["tool error showed this should be standalone"],
        },
    )
    responses = [
        tool_response("search_authors", {"query": "Cormac McCarthy"}),
        tool_response("ensure_author", {"name": "Cormac McCarthy"}),
        tool_response("ensure_series", {"name": "The Cormac McCarthy Collection", "author_id": 5}),
        rejected_book_call,
        standalone_final,
    ]
    install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "The Road.aax",
        {"title": "The Road", "artist": "Cormac McCarthy"},
        audiobook_engine,
        log_file,
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    expected = StandardBookMetadata(
        author_id=5,
        author="cormac_mccarthy",
        book_id=1,
        title="the-road",
        series_id=None,
        series="standalone",
        series_index=0,
        confidence=0.9,
        needs_review=False,
        evidence=["tool error showed this should be standalone"],
    )
    assert metadata == expected
    assert "series books must use a positive series_index" in log_file.read_text(encoding="utf-8")

    with Session(audiobook_engine) as session:
        assert session.get(AudiobookSeries, 5) is None
        book = session.get(Audiobook, 1)
        assert book.title == "the-road"
        assert book.series_id is None
def test_standard_book_metadata_rejects_unknown_tool(tmp_path, monkeypatch, audiobook_engine) -> None:
    """A call to a tool that does not exist is logged as tool_error and ends in needs-review."""
    log_file = tmp_path / "agent.jsonl"
    responses = [tool_response("drop_table", {})]
    install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "Book.aax",
        {"title": "Book"},
        audiobook_engine,
        log_file,
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    first_evidence = metadata.evidence[0]
    assert metadata.needs_review is True
    assert "Unknown audiobook metadata tool" in first_evidence
    assert "tool_error" in log_file.read_text(encoding="utf-8")
def test_standard_book_metadata_rejects_ids_not_returned_by_tools(
    tmp_path,
    monkeypatch,
    audiobook_engine,
) -> None:
    """A final answer with an author_id the test expects to be reported as not returned is rejected."""
    bad_final = {
        "author_id": 2,
        "book_id": None,
        "title": "expeditionary-force",
        "series_id": 1,
        "series_index": 1,
        "confidence": 0.99,
        "evidence": ["bad id"],
    }
    responses = [
        tool_response("search_authors", {"query": "Glynn Stewart"}),
        tool_response("search_series", {"query": "Starship Mage"}),
        # The same invalid answer is served twice.
        final_response(bad_final),
        final_response(bad_final),
    ]
    install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "Book.aax",
        {"title": "Book"},
        audiobook_engine,
        tmp_path / "agent.jsonl",
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    assert metadata.needs_review is True
    assert "author_id 2 was not returned" in metadata.evidence[0]
def test_standard_book_metadata_rejects_series_for_wrong_author(
    tmp_path,
    monkeypatch,
    audiobook_engine,
) -> None:
    """A final answer pairing series 3 with author 1 is rejected as a mismatch."""
    mismatched_final = {
        "author_id": 1,
        "book_id": None,
        "title": "expeditionary-force",
        "series_id": 3,
        "series_index": 1,
        "confidence": 0.99,
        "evidence": ["wrong author"],
    }
    responses = [
        tool_response("search_authors", {"query": "Glynn Stewart"}),
        tool_response("search_series", {"query": "expeditionary_force"}),
        # The same invalid answer is served twice.
        final_response(mismatched_final),
        final_response(mismatched_final),
    ]
    install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "Book.aax",
        {"title": "Book"},
        audiobook_engine,
        tmp_path / "agent.jsonl",
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    assert metadata.needs_review is True
    assert "series_id 3 does not belong to author_id 1" in metadata.evidence[0]
def test_standard_book_metadata_forces_final_after_empty_book_searches(
    tmp_path,
    monkeypatch,
    audiobook_engine,
) -> None:
    """After three book searches the log records tools_enabled=false and the final answer is accepted."""
    log_file = tmp_path / "agent.jsonl"
    config = metadata_agent.AgentConfig(max_agent_turns=5)
    book_searches = [
        tool_response("search_books", {"query": "We Are Legion We Are Bob", "author_id": 4, "series_id": 4}),
        tool_response("search_books", {"query": "we are legion", "author_id": 4}),
        tool_response("search_books", {"query": "We Are Legion"}),
    ]
    responses = [
        tool_response("search_authors", {"query": "Dennis E. Taylor"}),
        tool_response("search_series", {"query": "Bobiverse", "author_id": 4}),
        *book_searches,
        final_response(
            {
                "author_id": 4,
                "book_id": None,
                "title": "we-are-legion-we-are-bob",
                "series_id": 4,
                "series_index": 1,
                "confidence": 0.95,
                "evidence": ["author and series tool results; title from ffprobe tags"],
            },
        ),
    ]
    install_fake_ollama(monkeypatch, responses)

    tags = {
        "album": "We Are Legion (We Are Bob): Bobiverse, Book 1",
        "artist": "Dennis E. Taylor",
        "title": "We Are Legion (We Are Bob): Bobiverse, Book 1",
    }
    metadata = standard_book_metadata(
        "We_Are_Legion_(We_Are_Bob)_Bobiverse_Book_1-LC_128_44100_stereo.aax",
        tags,
        audiobook_engine,
        log_file,
        "test-key",
        config=config,
    )

    expected = StandardBookMetadata(
        author_id=4,
        author="dennis_e_taylor",
        book_id=1,
        title="we-are-legion-we-are-bob",
        series_id=4,
        series="bobiverse",
        series_index=1,
        confidence=0.95,
        needs_review=False,
        evidence=["author and series tool results; title from ffprobe tags"],
    )
    assert metadata == expected
    assert '"tools_enabled": false' in log_file.read_text(encoding="utf-8")
def test_standard_book_metadata_can_create_missing_catalog_rows(
    tmp_path,
    monkeypatch,
    audiobook_engine,
) -> None:
    """ensure_author / ensure_series calls create the rows that the final metadata then references."""
    responses = [
        tool_response("search_authors", {"query": "Martha Wells"}),
        tool_response("ensure_author", {"name": "martha_wells"}),
        tool_response("search_series", {"query": "Murderbot Diaries", "author_id": 5}),
        tool_response("ensure_series", {"name": "murderbot_diaries", "author_id": 5}),
        tool_response("search_books", {"query": "All Systems Red", "author_id": 5, "series_id": 5}),
        final_response(
            {
                "author_id": 5,
                "book_id": None,
                "title": "all-systems-red",
                "series_id": 5,
                "series_index": 1,
                "confidence": 0.96,
                "evidence": ["created missing author and series; title from tags"],
            },
        ),
    ]
    install_fake_ollama(monkeypatch, responses)

    metadata = standard_book_metadata(
        "All Systems Red.aax",
        {"title": "All Systems Red", "artist": "Martha Wells"},
        audiobook_engine,
        tmp_path / "agent.jsonl",
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    expected = StandardBookMetadata(
        author_id=5,
        author="martha_wells",
        book_id=1,
        title="all-systems-red",
        series_id=5,
        series="murderbot_diaries",
        series_index=1,
        confidence=0.96,
        needs_review=False,
        evidence=["created missing author and series; title from tags"],
    )
    assert metadata == expected

    # The new rows must exist in the database and be linked to each other.
    with Session(audiobook_engine) as session:
        author = session.get(AudiobookAuthor, 5)
        series = session.get(AudiobookSeries, 5)
        book = session.get(Audiobook, 1)
        assert author.name == "martha_wells"
        assert series.name == "murderbot_diaries"
        assert series.author_id == author.id
        assert book.title == "all-systems-red"
        assert book.author_id == author.id
        assert book.series_id == series.id
def test_standard_book_metadata_normalizes_noisy_created_catalog_rows(
    tmp_path,
    monkeypatch,
    audiobook_engine,
) -> None:
    """Names with punctuation and mixed case are stored and returned in normalized slug form."""
    responses = [
        tool_response("search_authors", {"query": "Charles Lamb"}),
        tool_response("ensure_author", {"name": "charles-lamb"}),
        tool_response("search_series", {"query": "AL:ICE Series", "author_id": 5}),
        tool_response("ensure_series", {"name": "AL:ICE Series", "author_id": 5}),
        tool_response("search_books", {"query": "AL:ICE Space War", "author_id": 5, "series_id": 5}),
        final_response(
            {
                "author_id": 5,
                "book_id": None,
                "title": "AL:ICE Space War",
                "series_id": 5,
                "series_index": 4,
                "confidence": 0.95,
                "evidence": ["created normalized author and series; title from tags"],
            },
        ),
    ]
    install_fake_ollama(monkeypatch, responses)

    tags = {
        "album": "AL:ICE Space War: AL:ICE Series, Book 4",
        "artist": "Charles Lamb",
        "title": "AL:ICE Space War: AL:ICE Series, Book 4",
    }
    metadata = standard_book_metadata(
        "ALICE_Space_War_ALICE_Series_Book_4-LC_64_22050_stereo.aax",
        tags,
        audiobook_engine,
        tmp_path / "agent.jsonl",
        "test-key",
        config=metadata_agent.AgentConfig(),
    )

    expected = StandardBookMetadata(
        author_id=5,
        author="charles_lamb",
        book_id=1,
        title="al-ice-space-war",
        series_id=5,
        series="al_ice_series",
        series_index=4,
        confidence=0.95,
        needs_review=False,
        evidence=["created normalized author and series; title from tags"],
    )
    assert metadata == expected

    # The persisted rows carry the normalized names as well.
    with Session(audiobook_engine) as session:
        author = session.get(AudiobookAuthor, 5)
        series = session.get(AudiobookSeries, 5)
        book = session.get(Audiobook, 1)
        assert author.name == "charles_lamb"
        assert series.name == "al_ice_series"
        assert series.author_id == author.id
        assert book.title == "al-ice-space-war"
        assert book.author_id == author.id
        assert book.series_id == series.id
def test_convert_aax_file_with_agent_success_renames_temp_output(tmp_path, monkeypatch) -> None:
    """With confident metadata the converted file ends up at its final path and no temp file remains."""
    source = tmp_path / "book.aax"
    output_directory = tmp_path / "audiobooks"
    source.touch()

    def fake_standard_book_metadata(*_, **__):
        return StandardBookMetadata(
            author_id=1,
            author="glynn_stewart",
            book_id=None,
            title="starship-mage",
            series_id=1,
            series="starships_mage",
            series_index=1,
            confidence=0.95,
            needs_review=False,
            evidence=["test"],
        )

    def fake_convert(_source, destination, _activation_bytes, *, overwrite):
        assert overwrite is True
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text("converted", encoding="utf-8")

    monkeypatch.setattr(audible_convert, "read_metadata", lambda _: {"title": "Starship Mage"})
    monkeypatch.setattr(audible_convert, "standard_book_metadata", fake_standard_book_metadata)
    monkeypatch.setattr(audible_convert, "convert_aax_file", fake_convert)

    audible_convert.convert_aax_file_with_agent(
        source,
        conversion_config(output_directory),
    )

    book_directory = output_directory / "glynn_stewart-starships_mage_01-starship-mage"
    final_file = book_directory / "glynn_stewart-starships_mage_01-starship-mage.m4b"
    leftover_temp = list((output_directory / ".audible_convert" / "tmp").glob("*/converted.m4b"))
    assert final_file.read_text(encoding="utf-8") == "converted"
    assert not leftover_temp
def test_ffprobe_failure_writes_review_without_converting(tmp_path, monkeypatch) -> None:
    """When reading metadata fails, one review file is written and no conversion is attempted."""
    source = tmp_path / "book.aax"
    output_directory = tmp_path / "audiobooks"
    source.touch()
    convert_calls = []

    def fake_read_metadata(_source):
        raise FakeFfprobeError

    def fake_convert(*args, **kwargs):
        convert_calls.append((args, kwargs))

    monkeypatch.setattr(audible_convert, "read_metadata", fake_read_metadata)
    monkeypatch.setattr(audible_convert, "convert_aax_file", fake_convert)

    audible_convert.convert_aax_file_with_agent(source, conversion_config(output_directory))

    review_directory = output_directory / ".audible_convert" / "review"
    review_files = list(review_directory.glob("*.json"))
    assert convert_calls == []
    assert len(review_files) == 1

    review = json.loads(review_files[0].read_text(encoding="utf-8"))
    assert review["ffprobe_metadata"] == {}
    assert review["reason"] == "ffprobe_failed: bad ffprobe"
    assert review["temp_file"] is None
def test_low_confidence_metadata_keeps_temp_output_for_review(tmp_path, monkeypatch) -> None:
    """Needs-review metadata leaves the converted temp file in place and writes one review file."""
    source = tmp_path / "book.aax"
    output_directory = tmp_path / "audiobooks"
    source.touch()

    def fake_standard_book_metadata(*_, **__):
        return StandardBookMetadata(
            author_id=0,
            author="unknown_author",
            book_id=None,
            title="unknown-title",
            series_id=None,
            series="standalone",
            series_index=0,
            confidence=0.25,
            needs_review=True,
            evidence=["unclear"],
        )

    def fake_convert(_source, destination, _activation_bytes, *, overwrite):
        assert overwrite is True
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text("converted", encoding="utf-8")

    monkeypatch.setattr(audible_convert, "read_metadata", lambda _: {"title": "Unknown"})
    monkeypatch.setattr(audible_convert, "standard_book_metadata", fake_standard_book_metadata)
    monkeypatch.setattr(audible_convert, "convert_aax_file", fake_convert)

    audible_convert.convert_aax_file_with_agent(
        source,
        conversion_config(output_directory),
    )

    work_directory = output_directory / ".audible_convert"
    temp_files = list((work_directory / "tmp").glob("*/converted.m4b"))
    review_files = list((work_directory / "review").glob("*.json"))
    assert len(temp_files) == 1
    assert temp_files[0].read_text(encoding="utf-8") == "converted"
    assert len(review_files) == 1
def test_existing_destination_skips_rename_and_removes_temp(tmp_path, monkeypatch) -> None:
    """An existing final file is left untouched and the converted temp file is cleaned up."""
    source = tmp_path / "book.aax"
    output_directory = tmp_path / "audiobooks"
    source.touch()

    stem = "glynn_stewart-starships_mage_01-starship-mage"
    final_file = output_directory / stem / "glynn_stewart-starships_mage_01-starship-mage.m4b"
    final_file.parent.mkdir(parents=True)
    final_file.write_text("existing", encoding="utf-8")

    def fake_standard_book_metadata(*_, **__):
        return StandardBookMetadata(
            author_id=1,
            author="glynn_stewart",
            book_id=None,
            title="starship-mage",
            series_id=1,
            series="starships_mage",
            series_index=1,
            confidence=0.95,
            needs_review=False,
            evidence=["test"],
        )

    def fake_convert(_source, destination, _activation_bytes, *, overwrite):
        assert overwrite is True
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text("converted", encoding="utf-8")

    monkeypatch.setattr(audible_convert, "read_metadata", lambda _: {"title": "Starship Mage"})
    monkeypatch.setattr(audible_convert, "standard_book_metadata", fake_standard_book_metadata)
    monkeypatch.setattr(audible_convert, "convert_aax_file", fake_convert)

    audible_convert.convert_aax_file_with_agent(
        source,
        conversion_config(output_directory),
    )

    leftover_temp = list((output_directory / ".audible_convert" / "tmp").glob("*/converted.m4b"))
    assert final_file.read_text(encoding="utf-8") == "existing"
    assert not leftover_temp
def test_richie_exports_audiobook_models() -> None:
    """Check that Audiobook is importable from python.orm.richie and maps to the "audiobook" table."""
    # Imported inside the test so the import itself is part of what is exercised.
    from python.orm.richie import Audiobook  # noqa: PLC0415

    assert Audiobook.__tablename__ == "audiobook"
def test_main_dry_run_prints_outputs_without_converting(tmp_path, monkeypatch, capsys) -> None:
    """A dry run prints "source -> destination", writes a placeholder file, and never converts."""
    input_directory = tmp_path / "raw"
    output_directory = tmp_path / "audiobooks"
    input_directory.mkdir()
    source = input_directory / "book.aax"
    source.touch()
    convert_calls = []

    def fake_read_metadata(_):
        return {
            "artist": "Charles Lamb",
            "title": "Alice: Alice Series #1",
        }

    def fake_convert(*args, **kwargs):
        convert_calls.append((args, kwargs))

    def fake_standard_book_metadata(*_, **__):
        return StandardBookMetadata(
            author_id=1,
            author="charles_lamb",
            book_id=None,
            title="alice",
            series_id=1,
            series="alice",
            series_index=1,
            confidence=0.95,
            needs_review=False,
            evidence=["test"],
        )

    def fake_get_postgres_engine(*, name):
        assert name == "RICHIE"
        return create_engine("sqlite+pysqlite:///:memory:")

    monkeypatch.setenv("OLLAMA_API_KEY", "test-key")
    monkeypatch.setattr(audible_convert, "read_metadata", fake_read_metadata)
    monkeypatch.setattr(audible_convert, "convert_aax_file", fake_convert)
    monkeypatch.setattr(audible_convert, "standard_book_metadata", fake_standard_book_metadata)
    monkeypatch.setattr(audible_convert, "get_postgres_engine", fake_get_postgres_engine)

    audible_convert.main(input_directory, output_directory, dry_run=True)

    # Path the real run would have produced.
    final_file = output_directory / "charles_lamb-alice_01-alice" / "charles_lamb-alice_01-alice.m4b"
    assert convert_calls == []
    assert capsys.readouterr().out == f"{source} -> {final_file}\n"

    # The dry run leaves a text placeholder containing the final path.
    dry_run_file = (
        output_directory
        / ".audible_convert"
        / "dry-run"
        / "charles_lamb-alice_01-alice"
        / "charles_lamb-alice_01-alice.m4b"
    )
    assert dry_run_file.read_text(encoding="utf-8") == f"{final_file}\n"
    assert (output_directory / ".audible_convert" / "logs").is_dir()
def test_main_reads_activation_bytes_from_env(tmp_path, monkeypatch) -> None:
    """``main`` must pass ``AUDIBLE_ACTIVATION_BYTES`` from the environment into the config.

    ``convert_aax_file_with_agent`` is replaced by a fake that records the
    config it receives, and the engine factory is replaced by one returning
    ``sqlite_engine()``.
    """
    input_directory = tmp_path / "raw"
    output_directory = tmp_path / "audiobooks"
    input_directory.mkdir()
    source = input_directory / "book.aax"
    source.touch()

    configs = []

    def fake_convert(_source, config):
        configs.append(config)

    def fake_get_postgres_engine(*, name):
        assert name == "RICHIE"
        return sqlite_engine()

    monkeypatch.setenv("OLLAMA_API_KEY", "test-key")
    monkeypatch.setenv("AUDIBLE_ACTIVATION_BYTES", "activation-secret")
    monkeypatch.setattr(audible_convert, "get_postgres_engine", fake_get_postgres_engine)
    monkeypatch.setattr(audible_convert, "convert_aax_file_with_agent", fake_convert)

    audible_convert.main(input_directory, output_directory)

    # Guard before indexing: the expected value below reads ``configs[0]``, so a
    # run that never reached the converter would otherwise surface as an
    # IndexError instead of a readable assertion failure.
    assert len(configs) == 1

    # ``agent_config`` and ``engine`` are copied from the recorded config, so the
    # comparison only constrains the remaining fields.
    assert configs == [
        audible_convert.ConversionConfig(
            resolved_output=output_directory,
            ollama_api_key="test-key",
            agent_config=configs[0].agent_config,
            engine=configs[0].engine,
            activation_bytes="activation-secret",
            dry_run=False,
            overwrite=False,
        ),
    ]
-126
View File
@@ -1,126 +0,0 @@
"""test_audiobook_catalog."""
from __future__ import annotations
import pytest
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from python.orm.richie import AudiobookAuthor, AudiobookSeries, RichieBase
from python.tools.audiobook import catalog
@pytest.fixture
def audiobook_session():
    """Yield a session on a fresh in-memory SQLite database.

    Every table registered on ``RichieBase.metadata`` is created first; the
    engine is disposed once the session context has exited.
    """
    memory_engine = create_engine("sqlite+pysqlite:///:memory:", future=True)
    RichieBase.metadata.create_all(memory_engine)
    session_factory = sessionmaker(bind=memory_engine, expire_on_commit=False, future=True)
    with session_factory() as db_session:
        yield db_session
    memory_engine.dispose()
def test_upsert_catalog_csv_inserts_and_updates_authors_and_series(tmp_path, audiobook_session) -> None:
    """Upsert authors and series from CSV on top of two pre-existing authors.

    Expected outcome: the row carrying id 10 renames that author, the
    id-less ``craig_alanson`` row keeps id 11, ``glynn_stewart`` is added as
    id 12, and both series are linked to their named author.
    """
    seeded_authors = [
        AudiobookAuthor(id=author_id, name=author_name)
        for author_id, author_name in ((10, "old_author"), (11, "craig_alanson"))
    ]
    audiobook_session.add_all(seeded_authors)
    audiobook_session.commit()

    authors_text = (
        "name,id\n"
        "glynn_stewart,\n"
        "craig_alanson,\n"
        "updated_author,10\n"
    )
    series_text = (
        "name,author_name,id\n"
        "starships_mage,glynn_stewart,\n"
        "expeditionary_force,craig_alanson,\n"
    )
    authors_csv = tmp_path / "authors.csv"
    series_csv = tmp_path / "series.csv"
    authors_csv.write_text(authors_text, encoding="utf-8")
    series_csv.write_text(series_text, encoding="utf-8")

    author_count = catalog.upsert_authors_from_csv(audiobook_session, authors_csv)
    series_count = catalog.upsert_series_from_csv(audiobook_session, series_csv)
    audiobook_session.commit()

    authors_by_id = select(AudiobookAuthor).order_by(AudiobookAuthor.id)
    series_by_name = select(AudiobookSeries).order_by(AudiobookSeries.name)
    stored_authors = audiobook_session.scalars(authors_by_id).all()
    stored_series = audiobook_session.scalars(series_by_name).all()

    assert author_count == 3
    assert series_count == 2

    author_pairs = [(author.id, author.name) for author in stored_authors]
    assert author_pairs == [
        (10, "updated_author"),
        (11, "craig_alanson"),
        (12, "glynn_stewart"),
    ]

    series_pairs = [(row.name, row.author.name) for row in stored_series]
    assert series_pairs == [
        ("expeditionary_force", "craig_alanson"),
        ("starships_mage", "glynn_stewart"),
    ]
def test_upsert_series_csv_updates_series_by_id(tmp_path, audiobook_session) -> None:
    """A series CSV row that carries an existing id renames that series in place."""
    existing_author = AudiobookAuthor(id=1, name="glynn_stewart")
    existing_series = AudiobookSeries(id=7, name="old_series", author=existing_author)
    audiobook_session.add_all([existing_author, existing_series])
    audiobook_session.commit()

    series_csv = tmp_path / "series.csv"
    csv_text = (
        "name,author_name,id\n"
        "starships_mage,glynn_stewart,7\n"
    )
    series_csv.write_text(csv_text, encoding="utf-8")

    count = catalog.upsert_series_from_csv(audiobook_session, series_csv)
    audiobook_session.commit()

    # Re-read by primary key to confirm the same row was updated.
    reloaded = audiobook_session.get(AudiobookSeries, 7)
    assert count == 1
    assert reloaded.name == "starships_mage"
    assert reloaded.author.name == "glynn_stewart"
def test_upsert_csv_allows_missing_id_column(tmp_path, audiobook_session) -> None:
    """Both upsert helpers accept CSV files that have no ``id`` column at all."""
    authors_csv = tmp_path / "authors.csv"
    series_csv = tmp_path / "series.csv"

    authors_text = (
        "name\n"
        "glynn_stewart\n"
    )
    series_text = (
        "name,author_name\n"
        "starships_mage,glynn_stewart\n"
    )
    authors_csv.write_text(authors_text, encoding="utf-8")
    series_csv.write_text(series_text, encoding="utf-8")

    author_count = catalog.upsert_authors_from_csv(audiobook_session, authors_csv)
    series_count = catalog.upsert_series_from_csv(audiobook_session, series_csv)
    audiobook_session.commit()

    stored_series = audiobook_session.scalar(select(AudiobookSeries))
    assert author_count == 1
    assert series_count == 1
    assert stored_series.name == "starships_mage"
    assert stored_series.author.name == "glynn_stewart"
def test_upsert_series_csv_rejects_unknown_author(tmp_path, audiobook_session) -> None:
    """A series row naming an author absent from the database raises ``CatalogImportError``."""
    csv_text = (
        "name,author_name,id\n"
        "starships_mage,glynn_stewart,\n"
    )
    series_csv = tmp_path / "series.csv"
    series_csv.write_text(csv_text, encoding="utf-8")

    with pytest.raises(catalog.CatalogImportError) as error:
        catalog.upsert_series_from_csv(audiobook_session, series_csv)

    message = str(error.value)
    assert "author not found: glynn_stewart" in message
-113
View File
@@ -1,113 +0,0 @@
"""Tests for Gitea flake.lock automation."""
from __future__ import annotations
from python.gitea import PullRequest
from python.gitea_flake_lock import (
PR_CHECK_WORKFLOWS,
PR_LABELS,
dispatch_pull_request_checks,
ensure_flake_lock_pull_request,
find_flake_lock_pull_request,
)
def _pull_request(number=1, head_branch="automation/update-flake-lock"):
    """Build a ``PullRequest`` test value targeting ``main`` with no labels.

    Args:
        number: Pull request number; also used as the last segment of ``html_url``.
        head_branch: Name of the source branch.

    Returns:
        A ``PullRequest`` titled ``"Update flake.lock"``.
    """
    url = f"https://gitea.example.test/pulls/{number}"
    fields = {
        "number": number,
        "title": "Update flake.lock",
        "html_url": url,
        "labels": (),
        "head_branch": head_branch,
        "base_branch": "main",
    }
    return PullRequest(**fields)
class FakeGiteaClient:
    """Recording stand-in for the Gitea client used by the flake.lock tests.

    Each method appends the keyword arguments it was called with to the
    matching ``*_calls`` list so tests can assert on them afterwards.
    """

    def __init__(self, pull_requests=None):
        # A falsy argument (``None`` or an empty list) becomes a new empty list.
        self.pull_requests = pull_requests if pull_requests else []
        self.dispatch_calls = []
        self.list_calls = []
        self.create_calls = []

    def list_open_pull_requests(self, **query):
        """Record the query and return the pull requests given at construction."""
        self.list_calls.append(query)
        return self.pull_requests

    def create_pull_request(self, **payload):
        """Record the payload and return a default ``_pull_request()``."""
        self.create_calls.append(payload)
        created = _pull_request()
        return created

    def dispatch_workflow(self, **request):
        """Record the dispatch request; returns nothing."""
        self.dispatch_calls.append(request)
def test_ensure_flake_lock_pull_request_finds_by_branch():
    """An open pull request on the branch is returned and no new one is created."""
    existing = _pull_request()
    client = FakeGiteaClient([existing])

    found = ensure_flake_lock_pull_request(
        client,
        owner="Richie",
        repo="dotfiles",
        branch="automation/update-flake-lock",
        base="main",
    )

    expected_lookup = {"owner": "Richie", "repo": "dotfiles", "head": "automation/update-flake-lock"}
    assert found == existing
    assert client.list_calls == [expected_lookup]
    assert client.create_calls == []
def test_ensure_flake_lock_pull_request_creates_with_labels():
    """With no open pull request, one is created carrying ``PR_LABELS``."""
    client = FakeGiteaClient()

    ensure_flake_lock_pull_request(
        client,
        owner="Richie",
        repo="dotfiles",
        branch="automation/update-flake-lock",
        base="main",
    )

    expected_creation = {
        "owner": "Richie",
        "repo": "dotfiles",
        "title": "Update flake.lock",
        "body": "Automated flake.lock update.",
        "head": "automation/update-flake-lock",
        "base": "main",
        "labels": PR_LABELS,
    }
    assert client.create_calls == [expected_creation]
def test_find_flake_lock_pull_request_finds_by_label():
    """The lookup filters open pull requests by the ``flake_lock_update`` label."""
    existing = _pull_request()
    client = FakeGiteaClient([existing])

    found = find_flake_lock_pull_request(client, owner="Richie", repo="dotfiles")

    expected_lookup = {"owner": "Richie", "repo": "dotfiles", "labels": ["flake_lock_update"]}
    assert found == existing
    assert client.list_calls == [expected_lookup]
def test_dispatch_pull_request_checks_runs_each_workflow():
    """One dispatch is recorded per entry in ``PR_CHECK_WORKFLOWS``, in order, on the branch."""
    client = FakeGiteaClient()

    dispatch_pull_request_checks(client, owner="Richie", repo="dotfiles", branch="automation/update-flake-lock")

    expected_dispatches = []
    for workflow in PR_CHECK_WORKFLOWS:
        expected_dispatches.append(
            {
                "owner": "Richie",
                "repo": "dotfiles",
                "workflow_id": workflow,
                "ref": "automation/update-flake-lock",
            },
        )
    assert client.dispatch_calls == expected_dispatches
+2 -2
View File
@@ -210,9 +210,9 @@ class TestContactCache:
mock_session_cls.return_value.__exit__ = MagicMock(return_value=False) mock_session_cls.return_value.__exit__ = MagicMock(return_value=False)
mock_device = MagicMock() mock_device = MagicMock()
mock_device.trust_level = TrustLevel.UNVERIFIED mock_device.trust_level = TrustLevel.UNVERIFIED
mock_session.scalars.return_value.one_or_none.return_value = mock_device mock_session.execute.return_value.scalar_one_or_none.return_value = mock_device
registry.record_contact("+1234", "abc") registry.record_contact("+1234", "abc")
mock_session.scalars.assert_called_once() mock_session.execute.assert_called_once()
class TestLocationCommand: class TestLocationCommand:
-1
View File
@@ -1,7 +1,6 @@
{ {
programs.git = { programs.git = {
enable = true; enable = true;
signing.format = null;
settings = { settings = {
user = { user = {
email = "dov.kruger@gmail.com"; email = "dov.kruger@gmail.com";
-1
View File
@@ -1,7 +1,6 @@
{ {
programs.git = { programs.git = {
enable = true; enable = true;
signing.format = null;
settings = { settings = {
user = { user = {
email = "DumbPuppy208@gmail.com"; email = "DumbPuppy208@gmail.com";
-2
View File
@@ -36,8 +36,6 @@ in
"hass" "hass"
"libvirtd" "libvirtd"
"networkmanager" "networkmanager"
"nornsight"
"nornsight-admin"
"plugdev" "plugdev"
"scanner" "scanner"
"transmission" "transmission"
-1
View File
@@ -1,7 +1,6 @@
{ {
programs.git = { programs.git = {
enable = true; enable = true;
signing.format = null;
settings = { settings = {
user = { user = {
email = "matthew.michal11@gmail.com"; email = "matthew.michal11@gmail.com";
-5
View File
@@ -1,5 +0,0 @@
# Imports only ../home/global.nix; this file sets nothing else.
{
imports = [
../home/global.nix
];
}

Some files were not shown because too many files have changed in this diff Show More