Compare commits

..

157 Commits

Author SHA1 Message Date
Richie a5d7c3be4f fixed fomat issue
treefmt / nix fmt (pull_request) Successful in 5s
pytest / pytest (pull_request) Successful in 28s
build_systems / build-brain (pull_request) Successful in 46s
build_systems / build-bob (pull_request) Successful in 47s
build_systems / build-leviathan (pull_request) Successful in 53s
build_systems / build-rhapsody-in-green (pull_request) Successful in 59s
build_systems / build-jeeves (pull_request) Successful in 2m32s
2026-06-14 15:42:05 -04:00
Richie 2995a75748 fixed test
treefmt / nix fmt (pull_request) Failing after 5s
pytest / pytest (pull_request) Successful in 28s
build_systems / build-bob (pull_request) Successful in 48s
build_systems / build-brain (pull_request) Successful in 53s
build_systems / build-leviathan (pull_request) Successful in 54s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m1s
build_systems / build-jeeves (pull_request) Successful in 2m43s
2026-06-14 15:41:09 -04:00
Richie dce1838163 opning ports for testing 2026-06-14 15:41:09 -04:00
Richie 121eb979a4 added a index for the VEctor DB 2026-06-14 15:41:09 -04:00
Richie c88315e9b6 improved BM25 write 2026-06-14 15:41:09 -04:00
Richie 50795ab7fc added ZstdMiddleware to ebook_search 2026-06-14 15:41:09 -04:00
Richie e9a80a0308 added vector_engine to fix name postgres name space issue 2026-06-14 15:41:09 -04:00
Richie eb76edb740 reworked ebook_search routers 2026-06-14 15:41:09 -04:00
Richie c53afb3c70 made fastapi tools 2026-06-14 15:41:09 -04:00
Richie 8301db39e5 added proper cache invalidation to load_bm25_corpus 2026-06-14 15:40:31 -04:00
Richie 45a9e90524 updated tests 2026-06-14 15:40:31 -04:00
Richie dd67de3993 improved reranking weights 2026-06-14 15:40:31 -04:00
Richie 6e3635ca01 fixed duplicat enrichment 2026-06-14 15:40:31 -04:00
Richie 11cbe31152 improved queary for vector search 2026-06-14 15:40:31 -04:00
Richie 5544dfa61c add .ebook_search_bm25 to gitignore 2026-06-14 15:40:31 -04:00
Richie 911df63513 updated python 2026-06-14 15:40:31 -04:00
Richie 07eb170b34 setup tests 2026-06-14 15:40:04 -04:00
Richie 6715bbf0a5 build api and frountend 2026-06-14 15:40:04 -04:00
Richie 26ff1f0fd3 added answer.py and config 2026-06-14 15:40:04 -04:00
Richie 666ea97754 added __init__ 2026-06-14 15:40:04 -04:00
Richie e01c687625 made llm_interface.py 2026-06-14 15:40:04 -04:00
Richie 82a367a2b6 added rerank 2026-06-14 15:40:04 -04:00
Richie ad1834c537 built ingest 2026-06-14 15:40:04 -04:00
Richie aed1e14d95 built rag search setup 2026-06-14 15:40:04 -04:00
Richie 0a2d4c08cb set up embedding system 2026-06-14 15:40:04 -04:00
Richie db98bd3559 built BM25 search foundation 2026-06-14 15:40:04 -04:00
Richie cdded5da12 added ebook embedding to orm 2026-06-14 15:40:04 -04:00
Richie d022251a58 removed hedgedoc 2026-06-14 15:40:04 -04:00
Richie e1ef4de6a3 adding embedding Models to jeeves 2026-06-14 15:40:04 -04:00
Richie 5c230a267c ran treefmt
build_systems / build-brain (pull_request) Successful in 46s
build_systems / build-bob (pull_request) Successful in 46s
build_systems / build-leviathan (pull_request) Successful in 54s
build_systems / build-rhapsody-in-green (pull_request) Successful in 58s
build_systems / build-jeeves (pull_request) Successful in 2m33s
treefmt / nix fmt (push) Successful in 4s
build_systems / build-brain (push) Successful in 30s
build_systems / build-bob (push) Successful in 9s
pytest / pytest (push) Successful in 27s
build_systems / build-leviathan (push) Successful in 40s
build_systems / build-rhapsody-in-green (push) Successful in 44s
build_systems / build-jeeves (push) Successful in 2m25s
treefmt / nix fmt (pull_request) Successful in 4s
pytest / pytest (pull_request) Successful in 25s
2026-06-14 13:53:02 -04:00
Richie 5215d66d40 adding noqa to DbSession
build_systems / build-jeeves (pull_request) Successful in 2m38s
treefmt / nix fmt (pull_request) Failing after 5s
pytest / pytest (pull_request) Successful in 26s
build_systems / build-bob (pull_request) Successful in 46s
build_systems / build-leviathan (pull_request) Successful in 53s
build_systems / build-brain (pull_request) Successful in 55s
build_systems / build-rhapsody-in-green (pull_request) Successful in 57s
2026-06-14 13:50:49 -04:00
Richie 7ad198416b added TYPE_CHECKING to contact main.py 2026-06-14 13:50:27 -04:00
Richie 1461c2552a removed van-inventory from pyproject.toml 2026-06-14 13:48:17 -04:00
Richie 736717c2f8 added TYPE_CHECKING to middleware.py 2026-06-14 13:48:02 -04:00
Richie ab2521867e added TYPE_CHECKING to dependencies.py 2026-06-14 13:47:50 -04:00
Richie 8e0ab4190b added TYPE_CHECKING to heater main.py 2026-06-14 13:46:40 -04:00
Richie 734fd7641e moved fetch_weather to masked lat lon and 2026-06-14 13:46:03 -04:00
Richie e898e08c48 added noqa to validate system 2026-06-14 13:41:50 -04:00
Richie d916ea903c removed dead code 2026-06-14 13:39:55 -04:00
Richie d8e916dbe6 fixed type bug in get_snapshots 2026-06-14 13:38:14 -04:00
Richie 48e9f0199d deleting van_inventory 2026-06-14 13:37:55 -04:00
Richie a526420c8d fixed un needed noqa's 2026-06-14 12:58:50 -04:00
Richie 41e3e265af cleaned up audiobook.py mapped_column 2026-06-14 12:53:54 -04:00
Richie 38a17f6146 cleaned uo python dependencies 2026-06-14 12:49:22 -04:00
Richie fe48d4c1ad removing splendor 2026-06-14 12:00:39 -04:00
Richie 9290cb46ee updated series_index to float and added UniqueConstraint to audiobook and audiobook_author
treefmt / nix fmt (push) Successful in 5s
build_systems / build-bob (push) Successful in 32s
build_systems / build-leviathan (push) Successful in 41s
build_systems / build-rhapsody-in-green (push) Successful in 44s
pytest / pytest (push) Successful in 27s
build_systems / build-brain (push) Successful in 31s
build_systems / build-jeeves (push) Successful in 2m28s
pytest / pytest (pull_request) Successful in 26s
build_systems / build-bob (pull_request) Successful in 47s
treefmt / nix fmt (pull_request) Successful in 5s
build_systems / build-brain (pull_request) Successful in 44s
build_systems / build-leviathan (pull_request) Successful in 52s
build_systems / build-rhapsody-in-green (pull_request) Successful in 58s
build_systems / build-jeeves (pull_request) Successful in 2m27s
2026-06-13 22:29:56 -04:00
Richie acd3f2d3ac fixed omnibus for audio books 2026-06-13 22:29:56 -04:00
Richie 08e716f66a deleted frontend dir 2026-06-13 22:29:56 -04:00
Richie d197731af4 added llm_tool_calling.py 2026-06-13 22:29:56 -04:00
Richie 1ffc48bb02 built workflow 2026-06-13 22:29:56 -04:00
Richie b6395ef18f Add catalog.py for manually adding authors and series to the database. 2026-06-13 22:29:56 -04:00
Richie aff6f4e1bd adding audiobook data to DB 2026-06-13 22:29:56 -04:00
Richie a9a96db944 cleaned up old_installer.py
treefmt / nix fmt (pull_request) Successful in 5s
pytest / pytest (pull_request) Successful in 25s
pytest / pytest (push) Successful in 24s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m5s
treefmt / nix fmt (push) Successful in 4s
build_systems / build-brain (pull_request) Successful in 49s
build_systems / build-bob (pull_request) Successful in 50s
build_systems / build-brain (push) Successful in 44s
build_systems / build-bob (push) Successful in 45s
build_systems / build-leviathan (pull_request) Successful in 57s
build_systems / build-leviathan (push) Successful in 53s
build_systems / build-rhapsody-in-green (push) Successful in 55s
build_systems / build-jeeves (pull_request) Successful in 2m41s
build_systems / build-jeeves (push) Successful in 2m37s
2026-06-13 22:27:11 -04:00
Richie d34154541d moved installer.py to old_installer.py 2026-06-13 22:20:58 -04:00
Richie 5d3a851137 deleting data_science code
build_systems / build-bob (pull_request) Successful in 46s
build_systems / build-leviathan (pull_request) Successful in 53s
build_systems / build-rhapsody-in-green (pull_request) Successful in 57s
build_systems / build-jeeves (pull_request) Successful in 2m33s
build_systems / build-brain (pull_request) Successful in 46s
treefmt / nix fmt (push) Successful in 4s
build_systems / build-brain (push) Successful in 29s
pytest / pytest (push) Successful in 23s
build_systems / build-bob (push) Successful in 31s
build_systems / build-leviathan (push) Successful in 38s
build_systems / build-rhapsody-in-green (push) Successful in 43s
build_systems / build-jeeves (push) Successful in 2m29s
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 23s
this code was moved to https://gitea.tmmworkshop.com/Nornsight/weave
2026-06-13 21:14:42 -04:00
Richie e05e5c77bc deleting signal bot 2026-06-13 21:09:34 -04:00
Richie b0a2ebc052 deleting unneeded files 2026-06-13 20:47:55 -04:00
Richie f77c9657a3 chore: update flake.lock
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 27s
build_systems / build-brain (pull_request) Successful in 41s
build_systems / build-bob (pull_request) Successful in 46s
build_systems / build-leviathan (pull_request) Successful in 46s
build_systems / build-rhapsody-in-green (pull_request) Successful in 52s
build_systems / build-jeeves (pull_request) Successful in 2m34s
pytest / pytest (push) Successful in 26s
build_systems / build-leviathan (push) Successful in 41s
treefmt / nix fmt (push) Successful in 6s
build_systems / build-brain (push) Successful in 31s
build_systems / build-bob (push) Successful in 32s
build_systems / build-rhapsody-in-green (push) Successful in 48s
build_systems / build-jeeves (push) Successful in 2m18s
2026-06-12 13:06:18 -04:00
Richie f908f969d3 opening port for vllm
treefmt / nix fmt (pull_request) Successful in 7s
pytest / pytest (pull_request) Successful in 35s
build_systems / build-brain (pull_request) Successful in 47s
build_systems / build-bob (pull_request) Successful in 50s
build_systems / build-leviathan (pull_request) Successful in 56s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m1s
build_systems / build-jeeves (pull_request) Successful in 2m31s
treefmt / nix fmt (push) Successful in 6s
pytest / pytest (push) Successful in 25s
build_systems / build-brain (push) Successful in 31s
build_systems / build-bob (push) Successful in 34s
build_systems / build-leviathan (push) Successful in 42s
build_systems / build-rhapsody-in-green (push) Successful in 45s
build_systems / build-jeeves (push) Successful in 2m13s
2026-06-08 19:43:07 -04:00
Richie 3cf49c5479 fixing update-flake-lock.yaml permissions
pytest / pytest (pull_request) Successful in 25s
treefmt / nix fmt (pull_request) Successful in 6s
build_systems / build-bob (pull_request) Successful in 45s
build_systems / build-brain (pull_request) Successful in 46s
build_systems / build-leviathan (pull_request) Successful in 54s
build_systems / build-rhapsody-in-green (pull_request) Successful in 59s
build_systems / build-jeeves (pull_request) Successful in 2m31s
treefmt / nix fmt (push) Successful in 6s
pytest / pytest (push) Successful in 29s
build_systems / build-brain (push) Successful in 31s
build_systems / build-bob (push) Successful in 33s
build_systems / build-leviathan (push) Successful in 42s
build_systems / build-rhapsody-in-green (push) Successful in 48s
build_systems / build-jeeves (push) Successful in 2m34s
2026-06-07 11:21:10 -04:00
Richie b34354f5e5 adding storage to bob
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 26s
build_systems / build-brain (pull_request) Successful in 47s
build_systems / build-bob (pull_request) Successful in 47s
build_systems / build-leviathan (pull_request) Successful in 53s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m0s
build_systems / build-jeeves (pull_request) Successful in 2m32s
treefmt / nix fmt (push) Successful in 7s
build_systems / build-brain (push) Successful in 8s
pytest / pytest (push) Successful in 25s
build_systems / build-bob (push) Successful in 33s
build_systems / build-leviathan (push) Successful in 41s
build_systems / build-rhapsody-in-green (push) Successful in 46s
build_systems / build-jeeves (push) Successful in 2m19s
2026-06-07 10:48:47 -04:00
Richie 44826464de flake update fro claud code
treefmt / nix fmt (pull_request) Successful in 5s
pytest / pytest (pull_request) Successful in 26s
build_systems / build-brain (pull_request) Successful in 46s
build_systems / build-bob (pull_request) Successful in 48s
build_systems / build-leviathan (pull_request) Successful in 53s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m0s
build_systems / build-jeeves (pull_request) Successful in 2m33s
treefmt / nix fmt (push) Successful in 6s
pytest / pytest (push) Successful in 25s
build_systems / build-brain (push) Successful in 31s
build_systems / build-bob (push) Successful in 34s
build_systems / build-leviathan (push) Successful in 41s
build_systems / build-rhapsody-in-green (push) Successful in 46s
build_systems / build-jeeves (push) Successful in 2m19s
2026-06-07 10:01:51 -04:00
Richie 3de0ffccb0 adding workflow dispatch for gitea_flake_lock.py
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 25s
build_systems / build-brain (pull_request) Successful in 47s
build_systems / build-bob (pull_request) Successful in 49s
build_systems / build-leviathan (pull_request) Successful in 54s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m0s
pytest / pytest (push) Successful in 29s
build_systems / build-bob (push) Successful in 32s
build_systems / build-jeeves (pull_request) Successful in 2m35s
build_systems / build-leviathan (push) Successful in 44s
treefmt / nix fmt (push) Successful in 6s
build_systems / build-rhapsody-in-green (push) Successful in 18s
build_systems / build-brain (push) Successful in 32s
build_systems / build-jeeves (push) Successful in 2m19s
2026-06-06 22:56:34 -04:00
Richie c6c98b3e26 updated Primary nic
pytest / pytest (pull_request) Successful in 26s
build_systems / build-bob (pull_request) Successful in 49s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m1s
treefmt / nix fmt (pull_request) Successful in 6s
build_systems / build-brain (pull_request) Successful in 48s
build_systems / build-leviathan (pull_request) Successful in 55s
build_systems / build-jeeves (pull_request) Successful in 2m39s
treefmt / nix fmt (push) Successful in 6s
build_systems / build-rhapsody-in-green (push) Successful in 14s
pytest / pytest (push) Successful in 25s
build_systems / build-brain (push) Successful in 29s
build_systems / build-bob (push) Successful in 33s
build_systems / build-leviathan (push) Successful in 41s
build_systems / build-jeeves (push) Successful in 2m19s
2026-06-04 18:10:41 -04:00
Richie d459f3d675 adding brave 2026-06-04 18:10:41 -04:00
Richie 33e4b37cce fixing jeeves dns 2026-06-04 18:10:41 -04:00
Richie 2a8e7e7f2b updated my ssh_config.nix
treefmt / nix fmt (pull_request) Successful in 7s
build_systems / build-brain (pull_request) Successful in 50s
build_systems / build-bob (pull_request) Successful in 50s
build_systems / build-leviathan (pull_request) Successful in 1m1s
build_systems / build-jeeves (pull_request) Successful in 2m49s
pytest / pytest (pull_request) Successful in 31s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m8s
treefmt / nix fmt (push) Successful in 5s
build_systems / build-leviathan (push) Successful in 12s
pytest / pytest (push) Successful in 24s
build_systems / build-bob (push) Successful in 32s
build_systems / build-rhapsody-in-green (push) Successful in 51s
build_systems / build-jeeves (push) Successful in 2m23s
build_systems / build-brain (push) Successful in 30s
2026-06-03 22:11:19 -04:00
Richie 07759353be flake update
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 28s
build_systems / build-leviathan (pull_request) Successful in 18m28s
build_systems / build-rhapsody-in-green (pull_request) Successful in 19m7s
build_systems / build-brain (pull_request) Successful in 19m51s
build_systems / build-jeeves (pull_request) Successful in 19m54s
build_systems / build-bob (pull_request) Successful in 30s
build_systems / build-bob (push) Successful in 30s
treefmt / nix fmt (push) Successful in 5s
build_systems / build-brain (push) Successful in 29s
pytest / pytest (push) Successful in 25s
build_systems / build-leviathan (push) Successful in 39s
build_systems / build-rhapsody-in-green (push) Successful in 46s
build_systems / build-jeeves (push) Successful in 2m19s
2026-05-29 22:30:33 -04:00
Richie 38fb14520e removed --reload
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 27s
build_systems / build-brain (pull_request) Successful in 52s
build_systems / build-bob (pull_request) Successful in 54s
build_systems / build-leviathan (pull_request) Successful in 1m4s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m5s
build_systems / build-jeeves (pull_request) Successful in 2m45s
build_systems / build-bob (push) Successful in 34s
build_systems / build-brain (push) Successful in 32s
treefmt / nix fmt (push) Successful in 6s
pytest / pytest (push) Successful in 26s
build_systems / build-leviathan (push) Successful in 43s
build_systems / build-rhapsody-in-green (push) Successful in 47s
build_systems / build-jeeves (push) Successful in 2m26s
2026-05-29 20:26:32 -04:00
Richie 006ae6079a moved nornsight off my_python 2026-05-29 20:15:51 -04:00
Richie 7d507fb7e1 adding nornsight.nix
treefmt / nix fmt (pull_request) Successful in 6s
build_systems / build-brain (pull_request) Successful in 51s
build_systems / build-bob (pull_request) Successful in 56s
pytest / pytest (pull_request) Successful in 28s
build_systems / build-leviathan (pull_request) Successful in 1m24s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m30s
build_systems / build-jeeves (pull_request) Successful in 2m45s
2026-05-29 18:39:27 -04:00
Richie 0f69022e51 disabled terminal bell
treefmt / nix fmt (pull_request) Successful in 7s
pytest / pytest (pull_request) Successful in 29s
build_systems / build-brain (pull_request) Successful in 48s
build_systems / build-bob (pull_request) Successful in 48s
build_systems / build-jeeves (pull_request) Successful in 2m42s
build_systems / build-brain (push) Successful in 30s
build_systems / build-leviathan (pull_request) Successful in 1m0s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m4s
treefmt / nix fmt (push) Successful in 6s
build_systems / build-bob (push) Successful in 33s
pytest / pytest (push) Successful in 25s
build_systems / build-leviathan (push) Successful in 41s
build_systems / build-rhapsody-in-green (push) Successful in 46s
build_systems / build-jeeves (push) Successful in 2m23s
2026-05-29 13:52:46 -04:00
Richie a260ae2470 adding ffmpeg to jeeves and rhapsody-in-green
treefmt / nix fmt (pull_request) Successful in 7s
build_systems / build-bob (pull_request) Successful in 32s
pytest / pytest (pull_request) Successful in 26s
build_systems / build-brain (pull_request) Successful in 44s
build_systems / build-leviathan (pull_request) Successful in 55s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m30s
build_systems / build-jeeves (pull_request) Successful in 2m40s
treefmt / nix fmt (push) Successful in 6s
build_systems / build-bob (push) Successful in 33s
build_systems / build-brain (push) Successful in 34s
pytest / pytest (push) Successful in 26s
build_systems / build-leviathan (push) Successful in 44s
build_systems / build-rhapsody-in-green (push) Successful in 45s
build_systems / build-jeeves (push) Successful in 2m21s
2026-05-28 22:14:59 -04:00
Richie 820b4a53d2 adding photos to syncthing
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 1m16s
build_systems / build-jeeves (pull_request) Successful in 5m29s
build_systems / build-brain (pull_request) Successful in 6m4s
build_systems / build-rhapsody-in-green (pull_request) Successful in 16m47s
build_systems / build-leviathan (pull_request) Successful in 16m49s
build_systems / build-bob (pull_request) Successful in 31s
treefmt / nix fmt (push) Successful in 6s
build_systems / build-bob (push) Successful in 31s
build_systems / build-brain (push) Successful in 32s
pytest / pytest (push) Successful in 26s
build_systems / build-leviathan (push) Successful in 40s
build_systems / build-rhapsody-in-green (push) Successful in 14s
build_systems / build-jeeves (push) Successful in 2m33s
2026-05-28 22:08:46 -04:00
Richie ea77e83f06 setting forceImportRoot to false
pytest / pytest (pull_request) Successful in 53s
treefmt / nix fmt (pull_request) Successful in 9s
build_systems / build-brain (pull_request) Successful in 2m33s
build_systems / build-bob (pull_request) Successful in 2m41s
build_systems / build-leviathan (pull_request) Successful in 3m22s
build_systems / build-rhapsody-in-green (pull_request) Successful in 3m32s
build_systems / build-jeeves (pull_request) Successful in 8m52s
build_systems / build-bob (push) Successful in 33s
treefmt / nix fmt (push) Successful in 6s
build_systems / build-brain (push) Successful in 31s
pytest / pytest (push) Successful in 26s
build_systems / build-leviathan (push) Successful in 41s
build_systems / build-rhapsody-in-green (push) Successful in 47s
build_systems / build-jeeves (push) Successful in 2m28s
2026-05-14 15:12:53 -04:00
Richie a9da208bc3 added --accept-flake-config to nixos-rebuild step
treefmt / nix fmt (pull_request) Successful in 9s
pytest / pytest (pull_request) Successful in 1m17s
build_systems / build-brain (pull_request) Successful in 2m14s
build_systems / build-bob (pull_request) Successful in 2m25s
build_systems / build-leviathan (pull_request) Successful in 4m32s
build_systems / build-rhapsody-in-green (pull_request) Successful in 4m35s
build_systems / build-jeeves (pull_request) Successful in 8m45s
pytest / pytest (push) Successful in 1m1s
treefmt / nix fmt (push) Successful in 8s
build_systems / build-bob (push) Successful in 44s
build_systems / build-leviathan (push) Successful in 38s
build_systems / build-brain (push) Successful in 1m39s
build_systems / build-rhapsody-in-green (push) Successful in 3m0s
build_systems / build-jeeves (push) Successful in 7m3s
2026-05-14 13:39:13 -04:00
Richie 739d7dd28c droped whisper from my_python 2026-05-14 13:38:41 -04:00
Richie 651599796e moved ./llm_tools.nix to gui only
treefmt / nix fmt (pull_request) Successful in 9s
pytest / pytest (pull_request) Successful in 1m24s
build_systems / build-brain (pull_request) Successful in 4m7s
build_systems / build-leviathan (pull_request) Successful in 4m11s
build_systems / build-rhapsody-in-green (pull_request) Successful in 4m41s
build_systems / build-jeeves (pull_request) Successful in 8m38s
build_systems / build-bob (pull_request) Failing after 14m11s
2026-05-14 12:58:15 -04:00
Richie b9d440597c removed llm tools from gui
treefmt / nix fmt (pull_request) Successful in 9s
pytest / pytest (pull_request) Successful in 1m4s
build_systems / build-brain (pull_request) Successful in 2m31s
build_systems / build-leviathan (pull_request) Successful in 3m21s
build_systems / build-rhapsody-in-green (pull_request) Successful in 3m21s
build_systems / build-jeeves (pull_request) Successful in 6m55s
build_systems / build-bob (pull_request) Failing after 16m4s
2026-05-13 10:03:15 -04:00
Richie 311cc5d7a7 adding pi-coding-agenta
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 1m24s
build_systems / build-brain (pull_request) Successful in 6m28s
build_systems / build-leviathan (pull_request) Failing after 7m21s
build_systems / build-rhapsody-in-green (pull_request) Failing after 7m22s
build_systems / build-jeeves (pull_request) Successful in 11m47s
build_systems / build-bob (pull_request) Failing after 19m3s
2026-05-13 08:57:45 -04:00
Richie fb2519046d moved codex and opencode to master pkgs 2026-05-13 08:56:18 -04:00
Richie bc6b1585ec flake update 2026-05-10 13:49:53 -04:00
Richie d71330a85a updated firefox configPath
treefmt / nix fmt (pull_request) Successful in 6s
pytest / pytest (pull_request) Successful in 29s
build_systems / build-brain (pull_request) Successful in 5m41s
build_systems / build-leviathan (pull_request) Successful in 5m43s
build_systems / build-jeeves (pull_request) Successful in 6m58s
build_systems / build-rhapsody-in-green (pull_request) Successful in 27m16s
build_systems / build-bob (pull_request) Failing after 12m14s
2026-05-10 12:36:54 -04:00
Richie df51aa5200 removing sunshine
sunshine is a cool idea but has been causing annoying ui glitches and started preventing the display manning for starting
Its a cool idea in theory but not useful enough for me to want to debug
2026-05-10 12:31:06 -04:00
Richie e93cc816db flake update 2026-05-09 17:38:13 -04:00
Richie 19050b4cf4 removing llms from rhapsody-in-green 2026-05-07 18:06:21 -04:00
Richie 6676c15f75 adding qwen3.6:27b 2026-05-07 18:05:00 -04:00
Richie 27e487e322 removing signal_bot
treefmt / nix fmt (pull_request) Successful in 5s
pytest / pytest (pull_request) Successful in 27s
build_systems / build-bob (pull_request) Successful in 48s
build_systems / build-brain (pull_request) Successful in 46s
build_systems / build-leviathan (pull_request) Successful in 54s
build_systems / build-rhapsody-in-green (pull_request) Successful in 1m0s
build_systems / build-jeeves (pull_request) Successful in 2m34s
treefmt / nix fmt (push) Successful in 5s
build_systems / build-bob (push) Successful in 34s
build_systems / build-brain (push) Successful in 31s
pytest / pytest (push) Successful in 27s
build_systems / build-leviathan (push) Successful in 40s
build_systems / build-rhapsody-in-green (push) Successful in 43s
build_systems / build-jeeves (push) Successful in 2m31s
2026-05-03 21:23:20 -04:00
Richie 4f28050eff added nixfmt and nix
build_systems / build-bob (pull_request) Failing after 52s
build_systems / build-brain (pull_request) Failing after 50s
pytest / pytest (pull_request) Failing after 4s
treefmt / nix fmt (pull_request) Failing after 4s
build_systems / build-leviathan (pull_request) Failing after 57s
build_systems / build-rhapsody-in-green (pull_request) Failing after 52s
build_systems / build-jeeves (pull_request) Failing after 3m17s
2026-05-03 20:47:03 -04:00
Richie b58ea60557 adding hostPackages
pytest / pytest (pull_request) Failing after 10s
treefmt / nix fmt (pull_request) Failing after 13s
build_systems / build-brain (pull_request) Failing after 29s
build_systems / build-bob (pull_request) Failing after 29s
build_systems / build-rhapsody-in-green (pull_request) Failing after 46s
build_systems / build-jeeves (pull_request) Failing after 2m29s
build_systems / build-leviathan (pull_request) Failing after 35s
2026-05-03 19:16:37 -04:00
Richie e95eedffe4 updated br-nix-builder
build_systems / build-bob (pull_request) Failing after 2s
build_systems / build-brain (pull_request) Failing after 1s
build_systems / build-jeeves (pull_request) Failing after 1s
build_systems / build-leviathan (pull_request) Failing after 1s
build_systems / build-rhapsody-in-green (pull_request) Failing after 1s
treefmt / nix fmt (pull_request) Failing after 2s
pytest / pytest (pull_request) Failing after 9s
2026-05-03 16:30:51 -04:00
Richie 1abd53987c made nix_builders not ephemeral and depended on gitea 2026-05-03 16:29:56 -04:00
Richie d1a3e7338a added permittedInsecurePackages for discord-canary 2026-05-03 00:39:23 -04:00
Richie 687ef0c167 moved acme_challenge backend 2026-05-03 00:39:19 -04:00
Richie 3a86148352 working nix builder 2026-05-02 17:10:02 -04:00
Richie fe9a2912e1 added words to spell check 2026-04-30 12:46:55 -04:00
Richie 29a99fc210 flake lock update 2026-04-30 12:46:55 -04:00
Richie d7651bf588 set update.nix to gitea 2026-04-30 12:46:55 -04:00
Richie 2865dcbe9c set dbus.implementation = "dbus"; 2026-04-30 12:46:55 -04:00
Richie d920b77bab removed verilux 2026-04-30 12:46:55 -04:00
Richie 1b53167b53 updated nix builders 2026-04-30 12:46:55 -04:00
Richie 9dabb9dc07 updated actions 2026-04-30 12:46:55 -04:00
Richie 95630fe151 made Prometheus require zfs-media-database-prometheus.mount 2026-04-30 10:16:37 -04:00
Richie d3a889f100 fixed typo 2026-04-30 10:16:37 -04:00
Richie 6ce0671f51 ran treefmt 2026-04-30 10:16:37 -04:00
Richie 25ab6b2ab6 added gitlens.pushRepositories key shourtcut 2026-04-30 10:16:37 -04:00
Richie 374d7e8d38 setting up resource monitoring for bob and jeeves 2026-04-30 10:16:37 -04:00
Richie 957110b7e9 increasing kitty scrollback_lines 2026-04-28 12:07:03 -04:00
Richie e7dc60f2c3 adding tiktoken 2026-04-28 12:07:03 -04:00
Richie 353a9d6787 adding pgvector 2026-04-27 13:02:04 -04:00
Richie 9f2d3a3c89 updated .gitignore 2026-04-25 16:33:45 -04:00
Richie 73e221716f adding nornsight 2026-04-25 16:33:45 -04:00
Richie 0d0ed5445a moved models 2026-04-19 21:05:56 -04:00
Richie 9e4c6f6f56 adding qwen3.6 2026-04-19 21:05:56 -04:00
Richie 1cf4b99d18 updating signing.format for programs.git 2026-04-19 10:35:48 -04:00
Richie b536fb9f09 removed fallbackToPassword = true; 2026-04-19 08:08:33 -04:00
github-actions[bot] c41a2ce3bd flake.lock: Update
Flake lock file updates:

• Updated input 'firefox-addons':
    'gitlab:rycee/nur-expressions/81e28f4?dir=pkgs/firefox-addons' (2026-03-20)
  → 'gitlab:rycee/nur-expressions/0581568?dir=pkgs/firefox-addons' (2026-04-17)
• Updated input 'home-manager':
    'github:nix-community/home-manager/9670de2' (2026-03-20)
  → 'github:nix-community/home-manager/565e534' (2026-04-17)
• Updated input 'nixos-hardware':
    'github:nixos/nixos-hardware/2d4b471' (2026-03-20)
  → 'github:nixos/nixos-hardware/c775c27' (2026-04-06)
• Updated input 'nixpkgs':
    'github:nixos/nixpkgs/b40629e' (2026-03-18)
  → 'github:nixos/nixpkgs/4bd9165' (2026-04-14)
• Updated input 'nixpkgs-master':
    'github:nixos/nixpkgs/8620c0b' (2026-03-21)
  → 'github:nixos/nixpkgs/025c852' (2026-04-17)
• Updated input 'sops-nix':
    'github:Mic92/sops-nix/29b6519' (2026-03-19)
  → 'github:Mic92/sops-nix/d4971dd' (2026-04-13)
2026-04-19 08:08:33 -04:00
Richie 8ef776f859 updating download-buffer-size 2026-04-18 22:12:47 -04:00
Richie d350c2d074 adding codex 2026-04-18 20:14:11 -04:00
Richie 93d6914e9d enabling appimages 2026-04-18 20:14:11 -04:00
Richie 7db063a240 setting up whisper transcriber 2026-04-18 19:09:02 -04:00
Richie dfe5997e0b removing brain_substituter.nix from bob 2026-04-18 12:00:59 -04:00
Richie 68671a1e84 adding steve 2026-04-18 11:56:56 -04:00
Richie bcc2227cfd updating syncthing phone id 2026-04-18 11:53:20 -04:00
Richie d6eec926e7 made web_services dir 2026-04-13 19:12:32 -04:00
Richie 5ddf1c4cab ran tree fmt 2026-04-13 19:12:32 -04:00
Richie 5a2171b9c7 updated gitea ssh settings 2026-04-13 19:12:32 -04:00
Richie 95c6ade154 moving off cloudflare tunnel 2026-04-13 19:12:32 -04:00
Richie a0bbc2896a added math the bob 2026-04-13 19:08:42 -04:00
Richie 736596c387 made bob a server 2026-04-13 19:08:42 -04:00
Richie 67622c0e51 setting up hedgedoc 2026-04-11 11:42:08 -04:00
Richie d2f447a1af disabling kafka 2026-04-11 11:11:21 -04:00
Richie af365fce9a setup sunshine.nix 2026-04-03 17:12:24 -04:00
Richie 6430049e92 updated postgres snapshot settings 2026-03-30 14:07:08 -04:00
Richie 26e4620f8f fixed systemd sandboxing 2026-03-30 14:07:08 -04:00
Richie 93fc700fa2 removed preStart step 2026-03-30 14:07:08 -04:00
Richie 8d1c1fc628 added mountpoint= to postgres zfs create 2026-03-30 14:07:08 -04:00
Richie dda318753b improving postgres wal 2026-03-30 14:07:08 -04:00
Richie 261ff139f7 removed ds table from richie DB 2026-03-29 15:54:54 -04:00
Richie ba8ff35109 updated ingest_congress to use congress-legislators for legislator info 2026-03-29 15:54:54 -04:00
Richie e368402eea adding LegislatorSocialMedia 2026-03-29 15:54:54 -04:00
Richie dd9329d218 fixed tests 2026-03-29 15:54:54 -04:00
Richie 89f6627bed converted session.execute(select to session.scalars(select 2026-03-29 15:54:54 -04:00
Richie c5babf8bad ran treefmt 2026-03-29 15:54:54 -04:00
Richie dae38ffd9b added ingest_congress.py 2026-03-29 15:54:54 -04:00
Richie ca62cc36a7 adding congress data to new DS DB 2026-03-29 15:54:54 -04:00
Richie 035410f39e adding nemotron-3-nano 2026-03-29 15:54:54 -04:00
Richie e40ab757ca making more generic exception handling 2026-03-29 15:54:54 -04:00
Richie 345ba94a59 ran ingest_posts 2026-03-29 15:54:54 -04:00
Richie f2084206b6 adding tables for 2023 2026-03-29 15:54:54 -04:00
Richie 50e764146a added ingest_posts.py 2026-03-29 15:54:54 -04:00
Richie ea97b5eb19 adding 2026 partitions 2026-03-29 15:54:54 -04:00
Richie 1ef2512daa adding post table 2026-03-29 15:54:54 -04:00
Richie f9a9e5395c added media/temp for fast dir when working with data 2026-03-29 15:54:54 -04:00
Richie d8e166a340 adding data_science_dev 2026-03-29 15:54:54 -04:00
Richie c266ba79f4 updated snapshot_config.toml 2026-03-29 14:12:06 -04:00
Richie f627a5ac6e enabling kafka 2026-03-26 09:59:31 -04:00
223 changed files with 11312 additions and 12381 deletions
+1 -1
View File
@@ -23,6 +23,6 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Build default package
run: "nixos-rebuild build --flake ./#${{ matrix.system }}"
run: "nixos-rebuild build --accept-flake-config --flake ./#${{ matrix.system }}"
- name: copy to nix-cache
run: nix copy --accept-flake-config --to unix:///host-nix/var/nix/daemon-socket/socket .#nixosConfigurations.${{ matrix.system }}.config.system.build.toplevel
-30
View File
@@ -1,30 +0,0 @@
name: fix_eval_warnings
on:
workflow_run:
workflows: ["build_systems"]
types: [completed]
jobs:
check-warnings:
if: >-
github.event.workflow_run.conclusion != 'cancelled' &&
github.event.workflow_run.head_branch == 'main' &&
(github.event.workflow_run.event == 'push' || github.event.workflow_run.event == 'schedule')
runs-on: self-hosted
permissions:
contents: write
pull-requests: write
steps:
- uses: actions/checkout@v4
- name: Fix eval warnings
env:
GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
run: >-
nix develop .#devShells.x86_64-linux.default -c
python -m python.eval_warnings.main
--run-id "${{ github.event.workflow_run.id }}"
--repo "${{ github.repository }}"
--ollama-url "${{ secrets.OLLAMA_URL }}"
--run-url "${{ github.event.workflow_run.html_url }}"
+7 -13
View File
@@ -6,24 +6,18 @@ on:
jobs:
merge:
runs-on: ubuntu-latest
runs-on: self-hosted
permissions:
contents: write
pull-requests: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: merge_flake_lock_update
run: |
pr_number=$(gh pr list --state open --author RichieCahill --label flake_lock_update --json number --jq '.[0].number')
echo "pr_number=$pr_number" >> $GITHUB_ENV
if [ -n "$pr_number" ]; then
gh pr merge "$pr_number" --rebase
else
echo "No open PR found with label flake_lock_update"
fi
run: >-
nix develop .#devShells.x86_64-linux.default -c
python -m python.gitea_flake_lock merge
--repo "${{ github.repository }}"
env:
GITHUB_TOKEN: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
GITEA_URL: https://gitea.tmmworkshop.com
+1 -1
View File
@@ -1,13 +1,13 @@
name: pytest
on:
workflow_dispatch:
push:
branches:
- main
pull_request:
branches:
- main
merge_group:
jobs:
pytest:
+14 -11
View File
@@ -6,18 +6,21 @@ on:
jobs:
lockfile:
runs-on: ubuntu-latest
runs-on: self-hosted
permissions:
actions: write
contents: write
pull-requests: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock
uses: DeterminateSystems/update-flake-lock@main
with:
token: ${{ secrets.GH_TOKEN_FOR_UPDATES }}
pr-title: "Update flake.lock"
pr-labels: |
dependencies
automated
flake_lock_update
run: nix flake update
- name: Create or update flake.lock PR
env:
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
GITEA_URL: https://gitea.tmmworkshop.com
run: >-
nix develop .#devShells.x86_64-linux.default -c
python -m python.gitea_flake_lock update
--repo "${{ github.repository }}"
+4
View File
@@ -169,3 +169,7 @@ test.*
# Frontend build output
frontend/dist/
frontend/node_modules/
# data from testing llms
data/*
.ebook_search_bm25
+2 -1
View File
@@ -40,7 +40,6 @@
"cgroupdriver",
"charliermarsh",
"Checkpointing",
"cloudflared",
"codellama",
"codezombiech",
"compactmode",
@@ -204,6 +203,7 @@
"peerconnection",
"PESKYFOX",
"PGID",
"pgvector",
"pipewire",
"pkgs",
"plugdev",
@@ -308,6 +308,7 @@
"usernamehw",
"userprefs",
"vaninventory",
"vdev",
"vfat",
"victron",
"virt",
-12
View File
@@ -1,12 +0,0 @@
## Dev environment tips
- use treefmt to format all files
- make python code ruff compliant
- use pytest to test python code
- always use the minimum amount of complexity
- if judgment calls are easy to reverse make them. if not ask me first
- Match existing code style.
- Use builtin helpers getenv() over os.environ.get.
- Prefer single-purpose functions over “do everything” helpers.
- Avoid compatibility branches like PG_USER and POSTGRESQL_URL unless requested.
- Keep helpers only if reused or they simplify the code otherwise inline.
+12 -2
View File
@@ -23,7 +23,10 @@
boot = {
tmp.useTmpfs = true;
kernelPackages = lib.mkDefault pkgs.linuxPackages_6_12;
zfs.package = lib.mkDefault pkgs.zfs_2_4;
zfs = {
package = lib.mkDefault pkgs.zfs_2_4;
forceImportRoot = lib.mkDefault false;
};
};
hardware.enableRedistributableFirmware = true;
@@ -37,10 +40,17 @@
nixpkgs = {
overlays = builtins.attrValues outputs.overlays;
config.allowUnfree = true;
config = {
allowUnfree = true;
permittedInsecurePackages = [
"openssl-1.1.1w" # This is for discord-canary
];
};
};
services = {
dbus.implementation = "dbus";
# firmware update
fwupd.enable = true;
+1
View File
@@ -34,6 +34,7 @@ in
warn-dirty = false;
flake-registry = ""; # disable global flake registries
connect-timeout = 10;
download-buffer-size = 536870912;
fallback = true;
};
+256
View File
@@ -0,0 +1,256 @@
{
config,
lib,
pkgs,
...
}:
let
monitoringInterface = "ztwfunumly";
nodeTextfileDir = "/var/lib/prometheus-node-exporter-textfile";
mkProcessNameTemplate =
perPid: template: if perPid then "${template}:{{.PID}}:{{.StartTime}}" else template;
mkProcessMatchers = perPid: [
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Module}}";
cmdline = [ "^/nix/store[^ ]*/bin/python[^ ]* -m (?P<Module>[^ ]+)" ];
}
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
cmdline = [
"^/nix/store[^ ]*/bin/python[^ ]* /nix/store[^ ]*/bin/\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)"
];
}
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
cmdline = [
"^/nix/store[^ ]*/bin/node /nix/store[^ ]*-(?P<Wrapped>[A-Za-z0-9._+-]+)-[0-9][^ /]*/"
];
}
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.Matches.Wrapped}}";
cmdline = [ "^/nix/store[^ ]*/(?:bin/|lib/[^ ]*/)?\\.?(?P<Wrapped>[^ /]+?)(?:-wrapped)?(?:\\s|$)" ];
}
{
name = mkProcessNameTemplate perPid "{{.Username}}:{{.ExeBase}}";
cmdline = [ ".+" ];
}
];
perPidConfig = pkgs.writeText "process-exporter-per-pid.yaml" (
builtins.toJSON {
process_names = mkProcessMatchers true;
}
);
zpoolLatencyScript = pkgs.writeShellScript "zpool-latency-exporter" ''
set -euo pipefail
out_dir=${lib.escapeShellArg nodeTextfileDir}
host=${lib.escapeShellArg config.networking.hostName}
tmp_file="$(mktemp "$out_dir/zpool.prom.XXXXXX")"
trap 'rm -f "$tmp_file"' EXIT
pools="$(zpool list -H -o name | paste -sd, -)"
cat >"$tmp_file" <<'EOF'
# HELP zpool_iostat_total_wait_read_ns Average total read wait time reported by zpool iostat.
# TYPE zpool_iostat_total_wait_read_ns gauge
# HELP zpool_iostat_total_wait_write_ns Average total write wait time reported by zpool iostat.
# TYPE zpool_iostat_total_wait_write_ns gauge
# HELP zpool_iostat_disk_wait_read_ns Average disk read wait time reported by zpool iostat.
# TYPE zpool_iostat_disk_wait_read_ns gauge
# HELP zpool_iostat_disk_wait_write_ns Average disk write wait time reported by zpool iostat.
# TYPE zpool_iostat_disk_wait_write_ns gauge
# HELP zpool_iostat_syncq_wait_read_ns Average synchronous queue read wait time reported by zpool iostat.
# TYPE zpool_iostat_syncq_wait_read_ns gauge
# HELP zpool_iostat_syncq_wait_write_ns Average synchronous queue write wait time reported by zpool iostat.
# TYPE zpool_iostat_syncq_wait_write_ns gauge
# HELP zpool_iostat_asyncq_wait_read_ns Average asynchronous queue read wait time reported by zpool iostat.
# TYPE zpool_iostat_asyncq_wait_read_ns gauge
# HELP zpool_iostat_asyncq_wait_write_ns Average asynchronous queue write wait time reported by zpool iostat.
# TYPE zpool_iostat_asyncq_wait_write_ns gauge
EOF
zpool iostat -Hplvy -y 1 1 | awk -F '\t' -v host="$host" -v pools="$pools" '
function esc(str, out) {
out = str
gsub(/\\/, "\\\\", out)
gsub(/"/, "\\\"", out)
return out
}
function emit(metric, pool, vdev, value) {
if (value == "" || value == "-") {
return
}
printf "%s{host=\"%s\",pool=\"%s\",vdev=\"%s\"} %s\n",
metric,
esc(host),
esc(pool),
esc(vdev),
value
}
BEGIN {
split(pools, pool_names, ",")
for (idx in pool_names) {
if (pool_names[idx] != "") {
known_pools[pool_names[idx]] = 1
}
}
}
NF == 0 {
next
}
{
row_name = $1
if (row_name in known_pools) {
current_pool = row_name
current_vdev = "_pool"
} else if (current_pool == "") {
next
} else {
current_vdev = row_name
}
emit("zpool_iostat_total_wait_read_ns", current_pool, current_vdev, $8)
emit("zpool_iostat_total_wait_write_ns", current_pool, current_vdev, $9)
emit("zpool_iostat_disk_wait_read_ns", current_pool, current_vdev, $10)
emit("zpool_iostat_disk_wait_write_ns", current_pool, current_vdev, $11)
emit("zpool_iostat_syncq_wait_read_ns", current_pool, current_vdev, $12)
emit("zpool_iostat_syncq_wait_write_ns", current_pool, current_vdev, $13)
emit("zpool_iostat_asyncq_wait_read_ns", current_pool, current_vdev, $14)
emit("zpool_iostat_asyncq_wait_write_ns", current_pool, current_vdev, $15)
}
' >>"$tmp_file"
mv "$tmp_file" "$out_dir/zpool.prom"
trap - EXIT
'';
in
{
networking.firewall.interfaces.${monitoringInterface}.allowedTCPPorts = [
9100
9134
9256
9257
9633
];
services.prometheus.exporters = {
node = {
enable = true;
enabledCollectors = [
"pressure"
"processes"
"systemd"
];
extraFlags = [ "--collector.textfile.directory=${nodeTextfileDir}" ];
};
process = {
enable = true;
user = "root";
group = "root";
settings.process_names = mkProcessMatchers false;
extraFlags = [
"-gather-smaps=false"
"-remove-empty-groups=true"
"-threads=false"
];
};
smartctl.enable = true;
zfs.enable = true;
};
programs.atop = {
enable = true;
atopService.enable = true;
atopRotateTimer.enable = true;
atopacctService.enable = true;
settings.interval = 30;
};
systemd = {
services = {
prometheus-process-pid-exporter = {
description = "Prometheus process exporter with per-PID naming";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
serviceConfig = {
ExecStart = ''
${pkgs.prometheus-process-exporter}/bin/process-exporter \
--web.listen-address 0.0.0.0:9257 \
--config.path ${perPidConfig} \
-children=false \
-gather-smaps=false \
-remove-empty-groups=true \
-threads=false
'';
User = "root";
Group = "root";
Restart = "always";
WorkingDirectory = "/tmp";
CapabilityBoundingSet = [ "" ];
DeviceAllow = [ "" ];
LockPersonality = true;
MemoryDenyWriteExecute = true;
NoNewPrivileges = true;
PrivateDevices = true;
PrivateTmp = true;
ProtectClock = true;
ProtectControlGroups = true;
ProtectHome = true;
ProtectHostname = true;
ProtectKernelLogs = true;
ProtectKernelModules = true;
ProtectKernelTunables = true;
ProtectSystem = "strict";
RemoveIPC = true;
RestrictAddressFamilies = [
"AF_INET"
"AF_INET6"
];
RestrictNamespaces = true;
RestrictRealtime = true;
RestrictSUIDSGID = true;
SystemCallArchitectures = "native";
UMask = "0077";
};
};
zpool-latency-exporter = {
description = "Exports ZFS latency metrics for node_exporter textfile collection";
after = [ "zfs-import.target" ];
requires = [ "zfs-import.target" ];
path = [
config.boot.zfs.package
pkgs.coreutils
pkgs.gawk
];
serviceConfig = {
Type = "oneshot";
ExecStart = zpoolLatencyScript;
};
};
};
timers.zpool-latency-exporter = {
wantedBy = [ "timers.target" ];
timerConfig = {
OnBootSec = "2m";
OnUnitActiveSec = "60s";
Unit = "zpool-latency-exporter.service";
};
};
tmpfiles.rules = [ "d ${nodeTextfileDir} 0755 root root - -" ];
};
}
+1 -1
View File
@@ -12,7 +12,7 @@
brain.id = "SSCGIPI-IV3VYKB-TRNIJE3-COV4T2H-CDBER7F-I2CGHYA-NWOEUDU-3T5QAAN"; # cspell:disable-line
ipad.id = "KI76T3X-SFUGV2L-VSNYTKR-TSIUV5L-SHWD3HE-GQRGRCN-GY4UFMD-CW6Z6AX"; # cspell:disable-line
jeeves.id = "ICRHXZW-ECYJCUZ-I4CZ64R-3XRK7CG-LL2HAAK-FGOHD22-BQA4AI6-5OAL6AG"; # cspell:disable-line
phone.id = "TBRULKD-7DZPGGZ-F6LLB7J-MSO54AY-7KLPBIN-QOFK6PX-W2HBEWI-PHM2CQI"; # cspell:disable-line
phone.id = "JPVQKQW-CFXOJXT-Q5G5F3H-QIDHDRE-GKHPTQB-GXZUQSP-U7FR7F7-INP3AAH"; # cspell:disable-line
rhapsody-in-green.id = "ASL3KC4-3XEN6PA-7BQBRKE-A7JXLI6-DJT43BY-Q4WPOER-7UALUAZ-VTPQ6Q4"; # cspell:disable-line
};
};
+1 -1
View File
@@ -4,7 +4,7 @@
flags = [ "--accept-flake-config" ];
randomizedDelaySec = "1h";
persistent = true;
flake = "github:RichieCahill/dotfiles";
flake = "git+https://gitea.tmmworkshop.com/richie/dotfiles?ref=main";
allowReboot = true;
dates = "Sat *-*-* 06:00:00";
};
+76
View File
@@ -0,0 +1,76 @@
# ZFS failed root import recovery
## Fast path
If the machine fails to boot because ZFS refuses to import `root_pool`:
### GRUB
1. At the bootloader menu, select the normal NixOS entry.
2. Press `e`.
3. Find the line that starts with `linux`.
4. Append this to the end of that line:
```text
zfs_force=1
```
5. Boot once with `Ctrl+x` or `F10`.
### systemd-boot
1. At the bootloader menu, highlight the normal NixOS entry.
2. Press `e`.
3. Append this to the end of the options line:
```text
zfs_force=1
```
4. Press `Enter` to boot once.
## After boot
Run:
```bash
sudo zpool status
sudo zpool import
journalctl -b | rg "ZFS|zfs|import|root_pool"
```
## Expected result
`sudo zpool status` should show `root_pool` as `ONLINE`.
## Reboot test
Run:
```bash
sudo reboot
```
Do not add `zfs_force=1` the second time.
## If it still fails
Boot once more with:
```text
zfs_force=1
```
Then run:
```bash
sudo zpool status -v
sudo zpool history | tail -n 50
journalctl -b | rg "ZFS|zfs|import|root_pool"
```
## Notes
- Root pool name is `root_pool`.
- This is a one-time recovery path after disk moves, controller changes, dirty exports, or interrupted imports.
- Some hosts also need the LUKS unlock USB key inserted before boot.
File diff suppressed because one or more lines are too long
Generated
+42 -26
View File
@@ -8,11 +8,11 @@
},
"locked": {
"dir": "pkgs/firefox-addons",
"lastModified": 1773979456,
"narHash": "sha256-9kBMJ5IvxqNlkkj/swmE8uK1Sc7TL/LIRUI958m7uBM=",
"lastModified": 1781150628,
"narHash": "sha256-b4mp8l3qWuSCyYYo9HSngDtcB3PpecYiOXjULrjwwlw=",
"owner": "rycee",
"repo": "nur-expressions",
"rev": "81e28f47ac18d9e89513929c77e711e657b64851",
"rev": "753319310f4673a2dabbfab87482187b40bf9bac",
"type": "gitlab"
},
"original": {
@@ -29,11 +29,11 @@
]
},
"locked": {
"lastModified": 1774007980,
"narHash": "sha256-FOnZjElEI8pqqCvB6K/1JRHTE8o4rer8driivTpq2uo=",
"lastModified": 1781189114,
"narHash": "sha256-5inaamLgUMWy+MOBE9ChF9QAF1o/74LFuHkI0W/9rqc=",
"owner": "nix-community",
"repo": "home-manager",
"rev": "9670de2921812bc4e0452f6e3efd8c859696c183",
"rev": "486595d2cf49cfcd649b58a284fa11ac0e34da22",
"type": "github"
},
"original": {
@@ -43,12 +43,15 @@
}
},
"nixos-hardware": {
"inputs": {
"nixpkgs": "nixpkgs"
},
"locked": {
"lastModified": 1774018263,
"narHash": "sha256-HHYEwK1A22aSaxv2ibhMMkKvrDGKGlA/qObG4smrSqc=",
"lastModified": 1781168557,
"narHash": "sha256-LOnLQ2tpYF9gqIDDr3+j3DbpJJr/QCH6zPRT2GzEUOE=",
"owner": "nixos",
"repo": "nixos-hardware",
"rev": "2d4b4717b2534fad5c715968c1cece04a172b365",
"rev": "6358ff76821101c178e3ab4919a62799bfe3652e",
"type": "github"
},
"original": {
@@ -60,27 +63,24 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1773821835,
"narHash": "sha256-TJ3lSQtW0E2JrznGVm8hOQGVpXjJyXY2guAxku2O9A4=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "b40629efe5d6ec48dd1efba650c797ddbd39ace0",
"type": "github"
"lastModified": 1767892417,
"narHash": "sha256-8bW3q88CEg2u4hSP66Vf4lpbLonHz7hqDNBMcCY7E9U=",
"rev": "3497aa5c9457a9d88d71fa93a4a8368816fbeeba",
"type": "tarball",
"url": "https://releases.nixos.org/nixos/unstable/nixos-26.05pre924538.3497aa5c9457/nixexprs.tar.xz"
},
"original": {
"owner": "nixos",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
"type": "tarball",
"url": "https://channels.nixos.org/nixos-unstable/nixexprs.tar.xz"
}
},
"nixpkgs-master": {
"locked": {
"lastModified": 1774051532,
"narHash": "sha256-d3CGMweyYIcPuTj5BKq+1Lx4zwlgL31nVtN647tOZKo=",
"lastModified": 1781229721,
"narHash": "sha256-ORvqDbb/LYxiJljGIejapjkc/kJbVote2N1WSb9W45I=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "8620c0b5cc8fbe76502442181be1d0514bc3a1b7",
"rev": "173d0ad7a974f8543a9ab01d2271b2e290341b33",
"type": "github"
},
"original": {
@@ -106,12 +106,28 @@
"type": "github"
}
},
"nixpkgs_2": {
"locked": {
"lastModified": 1781074563,
"narHash": "sha256-md8WlXOlfnIeHeOScMTTHFyf2d6iaTwPl2apR5EQ3P4=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "9ae611a455b90cf061d8f332b977e387bda8e1ca",
"type": "github"
},
"original": {
"owner": "nixos",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"firefox-addons": "firefox-addons",
"home-manager": "home-manager",
"nixos-hardware": "nixos-hardware",
"nixpkgs": "nixpkgs",
"nixpkgs": "nixpkgs_2",
"nixpkgs-master": "nixpkgs-master",
"nixpkgs-stable": "nixpkgs-stable",
"sops-nix": "sops-nix",
@@ -125,11 +141,11 @@
]
},
"locked": {
"lastModified": 1773889674,
"narHash": "sha256-+ycaiVAk3MEshJTg35cBTUa0MizGiS+bgpYw/f8ohkg=",
"lastModified": 1780547341,
"narHash": "sha256-Gq8KNx5A7hBB3uGJaj6eQfLDIz5YdLu92gqBcvHvoUo=",
"owner": "Mic92",
"repo": "sops-nix",
"rev": "29b6519f3e0780452bca0ac0be4584f04ac16cc5",
"rev": "9ed65852b6257fbeae4355bc24ecfea307ca759a",
"type": "github"
},
"original": {
-24
View File
@@ -1,24 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
+29 -3
View File
@@ -17,16 +17,41 @@
python-env = final: _prev: {
my_python = final.python314.withPackages (
ps: with ps; [
ps:
let
bm25s = ps.buildPythonPackage rec {
pname = "bm25s";
version = "0.3.9";
pyproject = true;
src = final.fetchPypi {
inherit pname version;
hash = "sha256-iVxnnZUrfeg1XttfPhpiCh4vKU0dQrkZvwghzOLi9Zc=";
};
build-system = [ ps.setuptools ];
dependencies = with ps; [
numpy
scipy
];
pythonImportsCheck = [ "bm25s" ];
};
in
with ps;
[
alembic
apprise
apscheduler
confluent-kafka
beautifulsoup4
ebooklib
fastapi
fastapi-cli
httpx
mypy
numpy
orjson
pgvector
polars
psycopg
pydantic
@@ -39,9 +64,10 @@
ruff
scalene
sqlalchemy
sqlalchemy
bm25s
tenacity
textual
tiktoken
tinytuya
typer
websockets
+10 -11
View File
@@ -3,7 +3,7 @@ name = "system_tools"
version = "0.1.0"
description = ""
authors = [{ name = "Richie Cahill", email = "richie@tmmworkshop.com" }]
requires-python = "~=3.13.0"
requires-python = "~=3.14.0"
readme = "README.md"
license = "MIT"
# these dependencies are a best effort and aren't guaranteed to work
@@ -12,20 +12,23 @@ dependencies = [
"alembic",
"apprise",
"apscheduler",
"fastapi",
"fastapi-cli",
"httpx",
"python-multipart",
"polars",
"psycopg[binary]",
"pydantic",
"pyyaml",
"python-multipart",
"sqlalchemy",
"tenacity",
"tinytuya",
"typer",
"websockets",
]
[project.scripts]
database = "python.database_cli:app"
van-inventory = "python.van_inventory.main:serve"
whisper-transcribe = "python.tools.whisper.transcribe:main"
[dependency-groups]
dev = [
@@ -40,7 +43,7 @@ dev = [
[tool.ruff]
target-version = "py313"
target-version = "py314"
line-length = 120
@@ -50,6 +53,7 @@ lint.ignore = [
"COM812", # (TEMP) conflicts when used with the formatter
"ISC001", # (TEMP) conflicts when used with the formatter
"S603", # (PERM) This is known to cause a false positive
"S607", # (PERM) This is becoming a consistent annoyance
]
[tool.ruff.lint.per-file-ignores]
@@ -78,15 +82,10 @@ lint.ignore = [
"python/congress_tracker/**" = [
"TC003", # (perm) this creates issues because sqlalchemy uses these at runtime
]
"python/eval_warnings/**" = [
"S607", # (perm) gh and git are expected on PATH in the runner environment
]
"python/alembic/**" = [
"INP001", # (perm) this creates LSP issues for alembic
]
"python/signal_bot/**" = [
"D107", # (perm) class docstrings cover __init__
]
[tool.ruff.lint.pydocstyle]
convention = "google"
@@ -1,50 +0,0 @@
"""adding FailedIngestion.
Revision ID: 2f43120e3ffc
Revises: f99be864fe69
Create Date: 2026-03-24 23:46:17.277897
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from python.orm import DataScienceDevBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "2f43120e3ffc"
down_revision: str | None = "f99be864fe69"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = DataScienceDevBase.schema_name
def upgrade() -> None:
"""Upgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.create_table(
"failed_ingestion",
sa.Column("raw_line", sa.Text(), nullable=False),
sa.Column("error", sa.Text(), nullable=False),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.PrimaryKeyConstraint("id", name=op.f("pk_failed_ingestion")),
schema=schema,
)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table("failed_ingestion", schema=schema)
# ### end Alembic commands ###
@@ -1,80 +0,0 @@
"""Attach all partition tables to the posts parent table.
Alembic autogenerate creates partition tables as standalone tables but does not
emit the ALTER TABLE ... ATTACH PARTITION statements needed for PostgreSQL to
route inserts to the correct partition.
Revision ID: a1b2c3d4e5f6
Revises: 605b1794838f
Create Date: 2026-03-25 10:00:00.000000
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from alembic import op
from sqlalchemy import text
from python.orm import DataScienceDevBase
from python.orm.data_science_dev.posts.partitions import (
PARTITION_END_YEAR,
PARTITION_START_YEAR,
iso_weeks_in_year,
week_bounds,
)
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "a1b2c3d4e5f6"
down_revision: str | None = "605b1794838f"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = DataScienceDevBase.schema_name
ALREADY_ATTACHED_QUERY = text("""
SELECT inhrelid::regclass::text
FROM pg_inherits
WHERE inhparent = :parent::regclass
""")
def upgrade() -> None:
"""Attach all weekly partition tables to the posts parent table."""
connection = op.get_bind()
already_attached = {
row[0]
for row in connection.execute(
ALREADY_ATTACHED_QUERY, {"parent": f"{schema}.posts"}
)
}
for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1):
for week in range(1, iso_weeks_in_year(year) + 1):
table_name = f"posts_{year}_{week:02d}"
qualified_name = f"{schema}.{table_name}"
if qualified_name in already_attached:
continue
start, end = week_bounds(year, week)
start_str = start.strftime("%Y-%m-%d %H:%M:%S")
end_str = end.strftime("%Y-%m-%d %H:%M:%S")
op.execute(
f"ALTER TABLE {schema}.posts "
f"ATTACH PARTITION {qualified_name} "
f"FOR VALUES FROM ('{start_str}') TO ('{end_str}')"
)
def downgrade() -> None:
"""Detach all weekly partition tables from the posts parent table."""
for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1):
for week in range(1, iso_weeks_in_year(year) + 1):
table_name = f"posts_{year}_{week:02d}"
op.execute(
f"ALTER TABLE {schema}.posts "
f"DETACH PARTITION {schema}.{table_name}"
)
-1
View File
@@ -3,7 +3,6 @@
from __future__ import annotations
import logging
import re
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal
@@ -0,0 +1,187 @@
"""removed ds table from richie DB.
Revision ID: c8a794340928
Revises: 6b275323f435
Create Date: 2026-03-29 15:29:23.643146
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
from python.orm import RichieBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "c8a794340928"
down_revision: str | None = "6b275323f435"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = RichieBase.schema_name
def upgrade() -> None:
"""Upgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table("vote_record", schema=schema)
op.drop_index(op.f("ix_vote_congress_chamber"), table_name="vote", schema=schema)
op.drop_index(op.f("ix_vote_date"), table_name="vote", schema=schema)
op.drop_index(op.f("ix_legislator_bioguide_id"), table_name="legislator", schema=schema)
op.drop_table("legislator", schema=schema)
op.drop_table("vote", schema=schema)
op.drop_index(op.f("ix_bill_congress"), table_name="bill", schema=schema)
op.drop_table("bill", schema=schema)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.create_table(
"vote",
sa.Column("congress", sa.INTEGER(), autoincrement=False, nullable=False),
sa.Column("chamber", sa.VARCHAR(), autoincrement=False, nullable=False),
sa.Column("session", sa.INTEGER(), autoincrement=False, nullable=False),
sa.Column("number", sa.INTEGER(), autoincrement=False, nullable=False),
sa.Column("vote_type", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("question", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("result", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("result_text", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("vote_date", sa.DATE(), autoincrement=False, nullable=False),
sa.Column("yea_count", sa.INTEGER(), autoincrement=False, nullable=True),
sa.Column("nay_count", sa.INTEGER(), autoincrement=False, nullable=True),
sa.Column("not_voting_count", sa.INTEGER(), autoincrement=False, nullable=True),
sa.Column("present_count", sa.INTEGER(), autoincrement=False, nullable=True),
sa.Column("bill_id", sa.INTEGER(), autoincrement=False, nullable=True),
sa.Column("id", sa.INTEGER(), autoincrement=True, nullable=False),
sa.Column(
"created",
postgresql.TIMESTAMP(timezone=True),
server_default=sa.text("now()"),
autoincrement=False,
nullable=False,
),
sa.Column(
"updated",
postgresql.TIMESTAMP(timezone=True),
server_default=sa.text("now()"),
autoincrement=False,
nullable=False,
),
sa.ForeignKeyConstraint(["bill_id"], [f"{schema}.bill.id"], name=op.f("fk_vote_bill_id_bill")),
sa.PrimaryKeyConstraint("id", name=op.f("pk_vote")),
sa.UniqueConstraint(
"congress",
"chamber",
"session",
"number",
name=op.f("uq_vote_congress_chamber_session_number"),
postgresql_include=[],
postgresql_nulls_not_distinct=False,
),
schema=schema,
)
op.create_index(op.f("ix_vote_date"), "vote", ["vote_date"], unique=False, schema=schema)
op.create_index(op.f("ix_vote_congress_chamber"), "vote", ["congress", "chamber"], unique=False, schema=schema)
op.create_table(
"vote_record",
sa.Column("vote_id", sa.INTEGER(), autoincrement=False, nullable=False),
sa.Column("legislator_id", sa.INTEGER(), autoincrement=False, nullable=False),
sa.Column("position", sa.VARCHAR(), autoincrement=False, nullable=False),
sa.ForeignKeyConstraint(
["legislator_id"],
[f"{schema}.legislator.id"],
name=op.f("fk_vote_record_legislator_id_legislator"),
ondelete="CASCADE",
),
sa.ForeignKeyConstraint(
["vote_id"], [f"{schema}.vote.id"], name=op.f("fk_vote_record_vote_id_vote"), ondelete="CASCADE"
),
sa.PrimaryKeyConstraint("vote_id", "legislator_id", name=op.f("pk_vote_record")),
schema=schema,
)
op.create_table(
"legislator",
sa.Column("bioguide_id", sa.TEXT(), autoincrement=False, nullable=False),
sa.Column("thomas_id", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("lis_id", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("govtrack_id", sa.INTEGER(), autoincrement=False, nullable=True),
sa.Column("opensecrets_id", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("fec_ids", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("first_name", sa.VARCHAR(), autoincrement=False, nullable=False),
sa.Column("last_name", sa.VARCHAR(), autoincrement=False, nullable=False),
sa.Column("official_full_name", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("nickname", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("birthday", sa.DATE(), autoincrement=False, nullable=True),
sa.Column("gender", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("current_party", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("current_state", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("current_district", sa.INTEGER(), autoincrement=False, nullable=True),
sa.Column("current_chamber", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("id", sa.INTEGER(), autoincrement=True, nullable=False),
sa.Column(
"created",
postgresql.TIMESTAMP(timezone=True),
server_default=sa.text("now()"),
autoincrement=False,
nullable=False,
),
sa.Column(
"updated",
postgresql.TIMESTAMP(timezone=True),
server_default=sa.text("now()"),
autoincrement=False,
nullable=False,
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_legislator")),
schema=schema,
)
op.create_index(op.f("ix_legislator_bioguide_id"), "legislator", ["bioguide_id"], unique=True, schema=schema)
op.create_table(
"bill",
sa.Column("congress", sa.INTEGER(), autoincrement=False, nullable=False),
sa.Column("bill_type", sa.VARCHAR(), autoincrement=False, nullable=False),
sa.Column("number", sa.INTEGER(), autoincrement=False, nullable=False),
sa.Column("title", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("title_short", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("official_title", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("status", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("status_at", sa.DATE(), autoincrement=False, nullable=True),
sa.Column("sponsor_bioguide_id", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("subjects_top_term", sa.VARCHAR(), autoincrement=False, nullable=True),
sa.Column("id", sa.INTEGER(), autoincrement=True, nullable=False),
sa.Column(
"created",
postgresql.TIMESTAMP(timezone=True),
server_default=sa.text("now()"),
autoincrement=False,
nullable=False,
),
sa.Column(
"updated",
postgresql.TIMESTAMP(timezone=True),
server_default=sa.text("now()"),
autoincrement=False,
nullable=False,
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_bill")),
sa.UniqueConstraint(
"congress",
"bill_type",
"number",
name=op.f("uq_bill_congress_type_number"),
postgresql_include=[],
postgresql_nulls_not_distinct=False,
),
schema=schema,
)
op.create_index(op.f("ix_bill_congress"), "bill", ["congress"], unique=False, schema=schema)
# ### end Alembic commands ###
@@ -0,0 +1,93 @@
"""adding audiobook libreary metadata.
Revision ID: d7864d1ffc17
Revises: c8a794340928
Create Date: 2026-06-03 20:24:09.200837
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from python.orm import RichieBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "d7864d1ffc17"
down_revision: str | None = "c8a794340928"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = RichieBase.schema_name
def upgrade() -> None:
"""Upgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.create_table(
"audiobook_author",
sa.Column("name", sa.String(), nullable=False),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook_author")),
sa.UniqueConstraint("name", name=op.f("uq_audiobook_author_name")),
schema=schema,
)
op.create_table(
"audiobook_series",
sa.Column("name", sa.String(), nullable=False),
sa.Column("author_id", sa.Integer(), nullable=False),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.ForeignKeyConstraint(
["author_id"],
[f"{schema}.audiobook_author.id"],
name=op.f("fk_audiobook_series_author_id_audiobook_author"),
ondelete="CASCADE",
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook_series")),
sa.UniqueConstraint("author_id", "name", name=op.f("uq_audiobook_series_author_id")),
schema=schema,
)
op.create_table(
"audiobook",
sa.Column("title", sa.String(), nullable=False),
sa.Column("author_id", sa.Integer(), nullable=False),
sa.Column("series_id", sa.Integer(), nullable=True),
sa.Column("series_index", sa.Integer(), nullable=False),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.ForeignKeyConstraint(
["author_id"],
[f"{schema}.audiobook_author.id"],
name=op.f("fk_audiobook_author_id_audiobook_author"),
ondelete="CASCADE",
),
sa.ForeignKeyConstraint(
["series_id"],
[f"{schema}.audiobook_series.id"],
name=op.f("fk_audiobook_series_id_audiobook_series"),
ondelete="SET NULL",
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_audiobook")),
schema=schema,
)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table("audiobook", schema=schema)
op.drop_table("audiobook_series", schema=schema)
op.drop_table("audiobook_author", schema=schema)
# ### end Alembic commands ###
@@ -0,0 +1,200 @@
"""add ebook search tables.
Revision ID: 2db132cace1a
Revises: b3c60cc5beb5
Create Date: 2026-06-10 22:10:54.379159
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import pgvector
import sqlalchemy as sa
from alembic import op
from python.orm import RichieBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "2db132cace1a"
down_revision: str | None = "b3c60cc5beb5"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = RichieBase.schema_name
def upgrade() -> None:
"""Upgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.create_table(
"ebook_embedding_model",
sa.Column("name", sa.String(), nullable=False),
sa.Column("dimension", sa.Integer(), nullable=False),
sa.Column("is_default", sa.Boolean(), nullable=False),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_embedding_model")),
sa.UniqueConstraint("name", name=op.f("uq_ebook_embedding_model_name")),
schema=schema,
)
op.create_table(
"ebook_source",
sa.Column("title", sa.String(), nullable=False),
sa.Column("author", sa.String(), nullable=True),
sa.Column("language", sa.String(), nullable=True),
sa.Column("publisher", sa.String(), nullable=True),
sa.Column("identifier", sa.String(), nullable=True),
sa.Column("file_path", sa.String(), nullable=False),
sa.Column("file_sha256", sa.String(length=64), nullable=False),
sa.Column("file_mtime", sa.DateTime(timezone=True), nullable=False),
sa.Column("file_size", sa.BigInteger(), nullable=False),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_source")),
sa.UniqueConstraint("file_path", name=op.f("uq_ebook_source_file_path")),
sa.UniqueConstraint("file_sha256", name=op.f("uq_ebook_source_file_sha256")),
schema=schema,
)
op.create_table(
"ebook_chapter",
sa.Column("source_id", sa.Integer(), nullable=False),
sa.Column("spine_index", sa.Integer(), nullable=False),
sa.Column("title", sa.String(), nullable=True),
sa.Column("href", sa.String(), nullable=True),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.ForeignKeyConstraint(
["source_id"],
[f"{schema}.ebook_source.id"],
name=op.f("fk_ebook_chapter_source_id_ebook_source"),
ondelete="CASCADE",
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chapter")),
sa.UniqueConstraint("source_id", "spine_index", name=op.f("uq_ebook_chapter_source_id")),
schema=schema,
)
op.create_table(
"ebook_chunk",
sa.Column("source_id", sa.Integer(), nullable=False),
sa.Column("chapter_id", sa.Integer(), nullable=True),
sa.Column("chunk_index", sa.Integer(), nullable=False),
sa.Column("text", sa.String(), nullable=False),
sa.Column("token_start", sa.Integer(), nullable=False),
sa.Column("token_count", sa.Integer(), nullable=False),
sa.Column("page_label", sa.String(), nullable=True),
sa.Column("content_sha256", sa.String(length=64), nullable=False),
sa.Column("search_text", sa.String(), nullable=False),
sa.Column("id", sa.BigInteger(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.ForeignKeyConstraint(
["chapter_id"],
[f"{schema}.ebook_chapter.id"],
name=op.f("fk_ebook_chunk_chapter_id_ebook_chapter"),
ondelete="SET NULL",
),
sa.ForeignKeyConstraint(
["source_id"],
[f"{schema}.ebook_source.id"],
name=op.f("fk_ebook_chunk_source_id_ebook_source"),
ondelete="CASCADE",
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk")),
sa.UniqueConstraint("source_id", "chunk_index", name="uq_ebook_chunk_source_id_chunk_index"),
sa.UniqueConstraint("source_id", "content_sha256", name="uq_ebook_chunk_source_id_content_sha256"),
schema=schema,
)
op.create_table(
"ebook_chunk_embedding_1024",
sa.Column("chunk_id", sa.BigInteger(), nullable=False),
sa.Column("model_id", sa.Integer(), nullable=False),
sa.Column("embedding", pgvector.sqlalchemy.vector.VECTOR(dim=1024), nullable=False),
sa.Column("id", sa.BigInteger(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.ForeignKeyConstraint(
["chunk_id"],
[f"{schema}.ebook_chunk.id"],
name=op.f("fk_ebook_chunk_embedding_1024_chunk_id_ebook_chunk"),
ondelete="CASCADE",
),
sa.ForeignKeyConstraint(
["model_id"],
[f"{schema}.ebook_embedding_model.id"],
name=op.f("fk_ebook_chunk_embedding_1024_model_id_ebook_embedding_model"),
ondelete="CASCADE",
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk_embedding_1024")),
sa.UniqueConstraint("chunk_id", "model_id", name=op.f("uq_ebook_chunk_embedding_1024_chunk_id")),
schema=schema,
)
op.create_table(
"ebook_chunk_embedding_2560",
sa.Column("chunk_id", sa.BigInteger(), nullable=False),
sa.Column("model_id", sa.Integer(), nullable=False),
sa.Column("embedding", pgvector.sqlalchemy.vector.VECTOR(dim=2560), nullable=False),
sa.Column("id", sa.BigInteger(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.ForeignKeyConstraint(
["chunk_id"],
[f"{schema}.ebook_chunk.id"],
name=op.f("fk_ebook_chunk_embedding_2560_chunk_id_ebook_chunk"),
ondelete="CASCADE",
),
sa.ForeignKeyConstraint(
["model_id"],
[f"{schema}.ebook_embedding_model.id"],
name=op.f("fk_ebook_chunk_embedding_2560_model_id_ebook_embedding_model"),
ondelete="CASCADE",
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk_embedding_2560")),
sa.UniqueConstraint("chunk_id", "model_id", name=op.f("uq_ebook_chunk_embedding_2560_chunk_id")),
schema=schema,
)
op.create_table(
"ebook_chunk_embedding_4096",
sa.Column("chunk_id", sa.BigInteger(), nullable=False),
sa.Column("model_id", sa.Integer(), nullable=False),
sa.Column("embedding", pgvector.sqlalchemy.vector.VECTOR(dim=4096), nullable=False),
sa.Column("id", sa.BigInteger(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.ForeignKeyConstraint(
["chunk_id"],
[f"{schema}.ebook_chunk.id"],
name=op.f("fk_ebook_chunk_embedding_4096_chunk_id_ebook_chunk"),
ondelete="CASCADE",
),
sa.ForeignKeyConstraint(
["model_id"],
[f"{schema}.ebook_embedding_model.id"],
name=op.f("fk_ebook_chunk_embedding_4096_model_id_ebook_embedding_model"),
ondelete="CASCADE",
),
sa.PrimaryKeyConstraint("id", name=op.f("pk_ebook_chunk_embedding_4096")),
sa.UniqueConstraint("chunk_id", "model_id", name=op.f("uq_ebook_chunk_embedding_4096_chunk_id")),
schema=schema,
)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table("ebook_chunk_embedding_4096", schema=schema)
op.drop_table("ebook_chunk_embedding_2560", schema=schema)
op.drop_table("ebook_chunk_embedding_1024", schema=schema)
op.drop_table("ebook_chunk", schema=schema)
op.drop_table("ebook_chapter", schema=schema)
op.drop_table("ebook_source", schema=schema)
op.drop_table("ebook_embedding_model", schema=schema)
# ### end Alembic commands ###
@@ -0,0 +1,63 @@
"""updated series_index to float and added UniqueConstraint to audiobook and audiobook_author.
Revision ID: b3c60cc5beb5
Revises: d7864d1ffc17
Create Date: 2026-06-10 20:02:43.073725
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from python.orm import RichieBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "b3c60cc5beb5"
down_revision: str | None = "d7864d1ffc17"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = RichieBase.schema_name
def upgrade() -> None:
"""Upgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.alter_column(
"audiobook",
"series_index",
existing_type=sa.INTEGER(),
type_=sa.Float(),
existing_nullable=False,
schema=schema,
)
op.create_unique_constraint(
op.f("uq_audiobook_author_id"),
"audiobook",
["author_id", "series_id", "title"],
schema=schema,
postgresql_nulls_not_distinct=True,
)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_constraint(op.f("uq_audiobook_author_id"), "audiobook", schema=schema, type_="unique")
op.alter_column(
"audiobook",
"series_index",
existing_type=sa.Float(),
type_=sa.INTEGER(),
existing_nullable=False,
schema=schema,
)
# ### end Alembic commands ###
@@ -0,0 +1,54 @@
"""add 1024 ebook embedding cosine index.
Revision ID: c460105682d2
Revises: 2db132cace1a
Create Date: 2026-06-13 19:53:45.680289
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from alembic import op
from python.orm import RichieBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "c460105682d2"
down_revision: str | None = "2db132cace1a"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = RichieBase.schema_name
def upgrade() -> None:
"""Upgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.create_index(
"ix_ebook_chunk_embedding_1024_embedding_cosine",
"ebook_chunk_embedding_1024",
["embedding"],
unique=False,
schema=schema,
postgresql_using="hnsw",
postgresql_ops={"embedding": "vector_cosine_ops"},
)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_index(
"ix_ebook_chunk_embedding_1024_embedding_cosine",
table_name="ebook_chunk_embedding_1024",
schema=schema,
postgresql_using="hnsw",
postgresql_ops={"embedding": "vector_cosine_ops"},
)
# ### end Alembic commands ###
@@ -1,100 +0,0 @@
"""seprating signal_bot database.
Revision ID: 6eaf696e07a5
Revises:
Create Date: 2026-03-17 21:35:37.612672
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
from python.orm import SignalBotBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "6eaf696e07a5"
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = SignalBotBase.schema_name
def upgrade() -> None:
"""Upgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.create_table(
"dead_letter_message",
sa.Column("source", sa.String(), nullable=False),
sa.Column("message", sa.Text(), nullable=False),
sa.Column("received_at", sa.DateTime(timezone=True), nullable=False),
sa.Column(
"status", postgresql.ENUM("UNPROCESSED", "PROCESSED", name="message_status", schema=schema), nullable=False
),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.PrimaryKeyConstraint("id", name=op.f("pk_dead_letter_message")),
schema=schema,
)
op.create_table(
"role",
sa.Column("name", sa.String(length=50), nullable=False),
sa.Column("id", sa.SmallInteger(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.PrimaryKeyConstraint("id", name=op.f("pk_role")),
sa.UniqueConstraint("name", name=op.f("uq_role_name")),
schema=schema,
)
op.create_table(
"signal_device",
sa.Column("phone_number", sa.String(length=50), nullable=False),
sa.Column("safety_number", sa.String(), nullable=True),
sa.Column(
"trust_level",
postgresql.ENUM("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", schema=schema),
nullable=False,
),
sa.Column("last_seen", sa.DateTime(timezone=True), nullable=False),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.PrimaryKeyConstraint("id", name=op.f("pk_signal_device")),
sa.UniqueConstraint("phone_number", name=op.f("uq_signal_device_phone_number")),
schema=schema,
)
op.create_table(
"device_role",
sa.Column("device_id", sa.Integer(), nullable=False),
sa.Column("role_id", sa.SmallInteger(), nullable=False),
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("created", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.Column("updated", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False),
sa.ForeignKeyConstraint(
["device_id"], [f"{schema}.signal_device.id"], name=op.f("fk_device_role_device_id_signal_device")
),
sa.ForeignKeyConstraint(["role_id"], [f"{schema}.role.id"], name=op.f("fk_device_role_role_id_role")),
sa.PrimaryKeyConstraint("id", name=op.f("pk_device_role")),
sa.UniqueConstraint("device_id", "role_id", name="uq_device_role_device_role"),
schema=schema,
)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table("device_role", schema=schema)
op.drop_table("signal_device", schema=schema)
op.drop_table("role", schema=schema)
op.drop_table("dead_letter_message", schema=schema)
# ### end Alembic commands ###
@@ -1,72 +0,0 @@
"""test.
Revision ID: 66bdd532bcab
Revises: 6eaf696e07a5
Create Date: 2026-03-18 19:21:14.561568
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
from python.orm import SignalBotBase
if TYPE_CHECKING:
from collections.abc import Sequence
# revision identifiers, used by Alembic.
revision: str = "66bdd532bcab"
down_revision: str | None = "6eaf696e07a5"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
schema = SignalBotBase.schema_name
def upgrade() -> None:
"""Upgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.alter_column(
"dead_letter_message",
"status",
existing_type=postgresql.ENUM("UNPROCESSED", "PROCESSED", name="message_status", schema=schema),
type_=sa.Enum("UNPROCESSED", "PROCESSED", name="message_status", native_enum=False),
existing_nullable=False,
schema=schema,
)
op.alter_column(
"signal_device",
"trust_level",
existing_type=postgresql.ENUM("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", schema=schema),
type_=sa.Enum("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", native_enum=False),
existing_nullable=False,
schema=schema,
)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade."""
# ### commands auto generated by Alembic - please adjust! ###
op.alter_column(
"signal_device",
"trust_level",
existing_type=sa.Enum("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", native_enum=False),
type_=postgresql.ENUM("VERIFIED", "UNVERIFIED", "BLOCKED", name="trust_level", schema=schema),
existing_nullable=False,
schema=schema,
)
op.alter_column(
"dead_letter_message",
"status",
existing_type=sa.Enum("UNPROCESSED", "PROCESSED", name="message_status", native_enum=False),
type_=postgresql.ENUM("UNPROCESSED", "PROCESSED", name="message_status", schema=schema),
existing_nullable=False,
schema=schema,
)
# ### end Alembic commands ###
+7 -3
View File
@@ -1,19 +1,23 @@
"""FastAPI interface for Contact database."""
from __future__ import annotations
import logging
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import Annotated
from typing import TYPE_CHECKING, Annotated
import typer
import uvicorn
from fastapi import FastAPI
from python.api.middleware import ZstdMiddleware
from python.api.routers import contact_router, views_router
from python.common import configure_logger
from python.fastapi_tools import ZstdMiddleware
from python.orm.common import get_postgres_engine
if TYPE_CHECKING:
from collections.abc import AsyncIterator
logger = logging.getLogger(__name__)
+1 -1
View File
@@ -9,7 +9,7 @@ from pydantic import BaseModel
from sqlalchemy import select
from sqlalchemy.orm import selectinload
from python.api.dependencies import DbSession
from python.fastapi_tools.db import DbSession # noqa: TC001 this is a FastAPI needed at runtime
from python.orm.richie.contact import Contact, ContactRelationship, Need, RelationshipType
TEMPLATES_DIR = Path(__file__).parent.parent / "templates"
+1 -1
View File
@@ -9,7 +9,7 @@ from fastapi.templating import Jinja2Templates
from sqlalchemy import select
from sqlalchemy.orm import Session, selectinload
from python.api.dependencies import DbSession
from python.fastapi_tools.db import DbSession # noqa: TC001 this is a FastAPI needed at runtime
from python.orm.richie.contact import Contact, ContactRelationship, Need, RelationshipType
TEMPLATES_DIR = Path(__file__).parent.parent / "templates"
-3
View File
@@ -1,3 +0,0 @@
"""Data science CLI tools."""
from __future__ import annotations
-104
View File
@@ -1,104 +0,0 @@
"""Utilities for converting Bluesky identifiers to numeric database IDs.
Handles DID-to-user_id hashing, TID-to-post_id decoding, and AT-URI parsing.
"""
from __future__ import annotations
import hashlib
TID_CHARSET = "234567abcdefghijklmnopqrstuvwxyz"
_TID_LENGTH = 13
_BIGINT_MASK = 0x7FFFFFFFFFFFFFFF
_AT_URI_SEGMENT_COUNT = 3
def did_to_user_id(did: str) -> int:
"""Convert a DID string to a deterministic 63-bit integer for user_id.
Uses SHA-256, truncated to 63 bits (positive signed BigInteger range).
Collision probability is negligible at Bluesky's scale (~tens of millions of users).
Args:
did: A Bluesky DID string, e.g. "did:plc:abc123".
Returns:
A positive 63-bit integer suitable for BigInteger storage.
"""
digest = hashlib.sha256(did.encode()).digest()
return int.from_bytes(digest[:8], "big") & _BIGINT_MASK
def tid_to_integer(tid: str) -> int:
"""Decode a Bluesky TID (base32-sortbase) into a 64-bit integer for post_id.
TIDs are 13-character, base32-sortbase encoded identifiers that encode a
microsecond timestamp plus a clock ID. They are globally unique by construction.
Args:
tid: A 13-character TID string, e.g. "3abc2defghijk".
Returns:
A positive integer suitable for BigInteger storage.
Raises:
ValueError: If the TID is malformed (wrong length or invalid characters).
"""
if len(tid) != _TID_LENGTH:
message = f"TID must be {_TID_LENGTH} characters, got {len(tid)}: {tid!r}"
raise ValueError(message)
result = 0
for char in tid:
index = TID_CHARSET.find(char)
if index == -1:
message = f"Invalid character {char!r} in TID {tid!r}"
raise ValueError(message)
result = result * 32 + index
return result
def parse_at_uri(uri: str) -> tuple[str, str, str]:
"""Parse an AT-URI into its components.
Args:
uri: An AT-URI string, e.g. "at://did:plc:abc123/app.bsky.feed.post/3abc2defghijk".
Returns:
A tuple of (did, collection, rkey).
Raises:
ValueError: If the URI doesn't have the expected format.
"""
stripped = uri.removeprefix("at://")
parts = stripped.split("/", maxsplit=2)
if len(parts) != _AT_URI_SEGMENT_COUNT:
message = f"Expected {_AT_URI_SEGMENT_COUNT} path segments in AT-URI, got {len(parts)}: {uri!r}"
raise ValueError(message)
return parts[0], parts[1], parts[2]
def post_id_from_uri(uri: str) -> int:
"""Extract and decode the post_id (TID) from an AT-URI.
Args:
uri: An AT-URI pointing to a post.
Returns:
The post_id as an integer.
"""
_did, _collection, rkey = parse_at_uri(uri)
return tid_to_integer(rkey)
def user_id_from_uri(uri: str) -> int:
"""Extract and hash the user_id (DID) from an AT-URI.
Args:
uri: An AT-URI pointing to a post.
Returns:
The user_id as an integer.
"""
did, _collection, _rkey = parse_at_uri(uri)
return did_to_user_id(did)
-143
View File
@@ -1,143 +0,0 @@
"""Transform Bluesky Jetstream messages into rows matching the Posts table schema."""
from __future__ import annotations
import json
import logging
from datetime import datetime
from python.data_science.bluesky_ids import (
did_to_user_id,
post_id_from_uri,
tid_to_integer,
user_id_from_uri,
)
logger = logging.getLogger(__name__)
INSTANCE = "bsky"
POST_COLLECTION = "app.bsky.feed.post"
EMBED_RECORD_TYPE = "app.bsky.embed.record"
EMBED_RECORD_WITH_MEDIA_TYPE = "app.bsky.embed.recordWithMedia"
def transform_jetstream_post(message: dict) -> dict:
"""Transform a Jetstream commit message into a dict matching Posts table columns.
Expects a Jetstream message with kind=commit, operation=create,
collection=app.bsky.feed.post.
Args:
message: The full Jetstream JSON message.
Returns:
A dict with keys matching the Posts table columns.
"""
did = message["did"]
commit = message["commit"]
record = commit["record"]
row: dict = {
"post_id": tid_to_integer(commit["rkey"]),
"user_id": did_to_user_id(did),
"instance": INSTANCE,
"date": datetime.fromisoformat(record["createdAt"]),
"text": record.get("text", ""),
"langs": _extract_langs(record),
"like_count": 0,
"reply_count": 0,
"repost_count": 0,
"reply_to": None,
"replied_author": None,
"thread_root": None,
"thread_root_author": None,
"repost_from": None,
"reposted_author": None,
"quotes": None,
"quoted_author": None,
"labels": _extract_labels(record),
"sent_label": None,
"sent_score": None,
}
_extract_reply_refs(record, row)
_extract_quote_refs(record, row)
return row
def is_post_create(message: dict) -> bool:
"""Check if a Jetstream message is a post creation event.
Args:
message: The full Jetstream JSON message.
Returns:
True if this is a create commit for app.bsky.feed.post.
"""
if message.get("kind") != "commit":
return False
commit = message.get("commit", {})
return commit.get("operation") == "create" and commit.get("collection") == POST_COLLECTION
def _extract_langs(record: dict) -> str | None:
"""Extract langs array as a JSON string, or None if absent."""
langs = record.get("langs")
if langs is None:
return None
return json.dumps(langs)
def _extract_labels(record: dict) -> str | None:
"""Extract self-labels as a JSON string, or None if absent."""
labels_obj = record.get("labels")
if labels_obj is None:
return None
values = labels_obj.get("values", [])
if not values:
return None
label_strings = [label.get("val", "") for label in values]
return json.dumps(label_strings)
def _extract_reply_refs(record: dict, row: dict) -> None:
"""Populate reply_to, replied_author, thread_root, thread_root_author from record.reply."""
reply = record.get("reply")
if reply is None:
return
parent = reply.get("parent", {})
parent_uri = parent.get("uri")
if parent_uri:
row["reply_to"] = post_id_from_uri(parent_uri)
row["replied_author"] = user_id_from_uri(parent_uri)
root = reply.get("root", {})
root_uri = root.get("uri")
if root_uri:
row["thread_root"] = post_id_from_uri(root_uri)
row["thread_root_author"] = user_id_from_uri(root_uri)
def _extract_quote_refs(record: dict, row: dict) -> None:
"""Populate quotes and quoted_author from embed record references."""
embed = record.get("embed")
if embed is None:
return
embed_type = embed.get("$type", "")
if embed_type == EMBED_RECORD_TYPE:
_set_quote_from_record(embed.get("record", {}), row)
elif embed_type == EMBED_RECORD_WITH_MEDIA_TYPE:
inner_record = embed.get("record", {}).get("record", {})
_set_quote_from_record(inner_record, row)
def _set_quote_from_record(record_ref: dict, row: dict) -> None:
"""Set quotes and quoted_author from a record reference object."""
uri = record_ref.get("uri")
if uri and POST_COLLECTION in uri:
row["quotes"] = post_id_from_uri(uri)
row["quoted_author"] = user_id_from_uri(uri)
-203
View File
@@ -1,203 +0,0 @@
"""Kafka consumer that ingests Bluesky posts into the partitioned Posts table.
Consumes Jetstream messages from Kafka, transforms them into Posts rows,
and batch-inserts them into PostgreSQL with manual offset commits.
Usage:
firehose-consumer
firehose-consumer --kafka-servers kafka:9092 --batch-size 500
"""
from __future__ import annotations
import json
import logging
import signal
from os import getenv
from threading import Event
from typing import Annotated
import typer
from confluent_kafka import Consumer, KafkaError, KafkaException
from sqlalchemy.orm import Session
from python.data_science.bluesky_transform import is_post_create, transform_jetstream_post
from python.data_science.ingest_posts import ingest_batch
from python.orm.common import get_postgres_engine
from python.orm.data_science_dev.posts.failed_ingestion import FailedIngestion
logger = logging.getLogger(__name__)
DEFAULT_TOPIC = "bluesky.firehose.posts"
DEFAULT_KAFKA_SERVERS = "localhost:9092"
DEFAULT_GROUP_ID = "bluesky-posts-ingestor"
DEFAULT_BATCH_SIZE = 500
POLL_TIMEOUT_SECONDS = 5.0
shutdown_event = Event()
app = typer.Typer(help="Consume Bluesky posts from Kafka and ingest into PostgreSQL.")
@app.command()
def main(
kafka_servers: Annotated[str, typer.Option(help="Kafka bootstrap servers")] = "",
topic: Annotated[str, typer.Option(help="Kafka topic to consume from")] = "",
group_id: Annotated[str, typer.Option(help="Kafka consumer group ID")] = "",
batch_size: Annotated[int, typer.Option(help="Messages per DB insert batch")] = DEFAULT_BATCH_SIZE,
) -> None:
"""Consume Bluesky posts from Kafka and ingest into the partitioned posts table."""
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
datefmt="%H:%M:%S",
)
servers = kafka_servers or getenv("KAFKA_BOOTSTRAP_SERVERS", DEFAULT_KAFKA_SERVERS)
topic_name = topic or getenv("BLUESKY_FIREHOSE_TOPIC", DEFAULT_TOPIC)
group = group_id or getenv("KAFKA_GROUP_ID", DEFAULT_GROUP_ID)
signal.signal(signal.SIGTERM, _handle_shutdown)
signal.signal(signal.SIGINT, _handle_shutdown)
consumer = _create_consumer(servers, group)
consumer.subscribe([topic_name])
engine = get_postgres_engine(name="DATA_SCIENCE_DEV")
total_inserted = 0
logger.info("Starting firehose consumer: topic=%s group=%s batch_size=%d", topic_name, group, batch_size)
try:
with Session(engine) as session:
while not shutdown_event.is_set():
inserted = _consume_batch(consumer, session, batch_size)
total_inserted += inserted
if inserted > 0:
logger.info("Batch inserted %d rows (total: %d)", inserted, total_inserted)
except KafkaException:
logger.exception("Fatal Kafka error")
finally:
logger.info("Closing consumer (total inserted: %d)", total_inserted)
consumer.close()
def _consume_batch(consumer: Consumer, session: Session, batch_size: int) -> int:
"""Poll a batch of messages, transform, and insert into the database.
Args:
consumer: The Kafka consumer instance.
session: SQLAlchemy database session.
batch_size: Maximum number of messages to consume per batch.
Returns:
Number of rows successfully inserted.
"""
messages = consumer.consume(num_messages=batch_size, timeout=POLL_TIMEOUT_SECONDS)
if not messages:
return 0
rows: list[dict] = []
for message in messages:
error = message.error()
if error is not None:
if error.code() == KafkaError._PARTITION_EOF: # noqa: SLF001 — confluent-kafka exposes this as a pseudo-private constant; no public alternative exists
continue
logger.error("Consumer error: %s", error)
continue
row = _safe_transform(message.value(), session)
if row is not None:
rows.append(row)
if not rows:
consumer.commit(asynchronous=False)
return 0
inserted = ingest_batch(session, rows)
consumer.commit(asynchronous=False)
return inserted
def _safe_transform(raw_value: bytes | None, session: Session) -> dict | None:
"""Transform a Kafka message value into a Posts row, logging failures.
Args:
raw_value: Raw message bytes from Kafka.
session: SQLAlchemy session for logging failures.
Returns:
A transformed row dict, or None if transformation failed.
"""
if raw_value is None:
return None
try:
message = json.loads(raw_value)
except (json.JSONDecodeError, UnicodeDecodeError):
logger.exception("Failed to decode Kafka message")
_log_failed_ingestion(session, raw_value, "JSON decode error")
return None
if not is_post_create(message):
return None
try:
return transform_jetstream_post(message)
except (KeyError, ValueError, TypeError):
logger.exception("Failed to transform Jetstream message")
_log_failed_ingestion(session, raw_value, "Transform error")
return None
def _log_failed_ingestion(session: Session, raw_value: bytes, error: str) -> None:
"""Log a failed ingestion to the FailedIngestion table.
Args:
session: SQLAlchemy session.
raw_value: The raw message bytes.
error: Description of the error.
"""
try:
session.add(
FailedIngestion(
raw_line=raw_value.decode(errors="replace")[:10000],
error=error,
)
)
session.commit()
except Exception:
session.rollback()
logger.exception("Failed to log ingestion failure")
def _create_consumer(servers: str, group: str) -> Consumer:
"""Create a configured Kafka consumer.
Args:
servers: Kafka bootstrap servers string.
group: Consumer group ID.
Returns:
A configured confluent_kafka.Consumer.
"""
config = {
"bootstrap.servers": servers,
"group.id": group,
"auto.offset.reset": "earliest",
"enable.auto.commit": False,
"max.poll.interval.ms": 300000,
"fetch.min.bytes": 1024,
"session.timeout.ms": 30000,
}
return Consumer(config)
def _handle_shutdown(_signum: int, _frame: object) -> None:
"""Signal handler to trigger graceful shutdown."""
logger.info("Shutdown signal received")
shutdown_event.set()
if __name__ == "__main__":
app()
-230
View File
@@ -1,230 +0,0 @@
"""Bluesky Jetstream firehose to Kafka producer.
Connects to the Bluesky Jetstream WebSocket API with zstd compression,
filters for post creation events, and produces them to a Kafka topic.
Usage:
firehose-producer
firehose-producer --kafka-servers kafka:9092 --topic bluesky.firehose.posts
"""
from __future__ import annotations
import json
import logging
import signal
from os import getenv
from threading import Event
from typing import Annotated
import typer
from compression import zstd
from confluent_kafka import KafkaError, KafkaException, Producer
from websockets.exceptions import ConnectionClosed
from websockets.sync.client import connect
logger = logging.getLogger(__name__)
JETSTREAM_URL = "wss://jetstream2.us-east.bsky.network/subscribe"
DEFAULT_TOPIC = "bluesky.firehose.posts"
DEFAULT_KAFKA_SERVERS = "localhost:9092"
POLL_INTERVAL = 100
POST_COLLECTION = "app.bsky.feed.post"
shutdown_event = Event()
app = typer.Typer(help="Stream Bluesky firehose posts into Kafka.")
@app.command()
def main(
kafka_servers: Annotated[str, typer.Option(help="Kafka bootstrap servers")] = "",
topic: Annotated[str, typer.Option(help="Kafka topic to produce to")] = "",
collections: Annotated[str, typer.Option(help="Comma-separated collections to subscribe to")] = POST_COLLECTION,
) -> None:
"""Connect to Bluesky Jetstream and produce post events to Kafka."""
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
datefmt="%H:%M:%S",
)
servers = kafka_servers or getenv("KAFKA_BOOTSTRAP_SERVERS", DEFAULT_KAFKA_SERVERS)
topic_name = topic or getenv("BLUESKY_FIREHOSE_TOPIC", DEFAULT_TOPIC)
signal.signal(signal.SIGTERM, _handle_shutdown)
signal.signal(signal.SIGINT, _handle_shutdown)
producer = _create_producer(servers)
cursor: int | None = None
logger.info("Starting firehose producer → %s on %s", topic_name, servers)
while not shutdown_event.is_set():
try:
cursor = _stream_loop(producer, topic_name, collections, cursor)
except (ConnectionClosed, OSError):
logger.exception("WebSocket disconnected, reconnecting")
except KafkaException:
logger.exception("Kafka error, reconnecting")
if not shutdown_event.is_set():
logger.info("Reconnecting in 5 seconds (cursor=%s)", cursor)
shutdown_event.wait(timeout=5)
logger.info("Shutting down, flushing producer")
producer.flush(timeout=30)
logger.info("Producer shutdown complete")
def _stream_loop(
producer: Producer,
topic: str,
collections: str,
cursor: int | None,
) -> int | None:
"""Connect to Jetstream and stream messages to Kafka until disconnected.
Args:
producer: The Kafka producer instance.
topic: Kafka topic name.
collections: Comma-separated AT Protocol collections to subscribe to.
cursor: Optional microsecond timestamp to resume from.
Returns:
The last processed time_us cursor value.
"""
url = _build_jetstream_url(collections, cursor)
logger.info("Connecting to %s", url)
message_count = 0
last_cursor = cursor
with connect(url, additional_headers={"Accept-Encoding": "zstd"}) as websocket:
logger.info("Connected to Jetstream")
while not shutdown_event.is_set():
try:
raw_frame = websocket.recv(timeout=10)
except TimeoutError:
producer.poll(0)
continue
text = _decode_frame(raw_frame)
message = json.loads(text)
time_us = message.get("time_us")
if time_us is not None:
last_cursor = time_us
if not _is_post_create(message):
continue
did = message.get("did", "")
try:
producer.produce(
topic,
key=did.encode(),
value=text.encode() if isinstance(text, str) else text,
callback=_delivery_callback,
)
except BufferError:
logger.warning("Producer buffer full, flushing")
producer.flush(timeout=10)
producer.produce(
topic,
key=did.encode(),
value=text.encode() if isinstance(text, str) else text,
callback=_delivery_callback,
)
message_count += 1
if message_count % POLL_INTERVAL == 0:
producer.poll(0)
if message_count % 10000 == 0:
logger.info("Produced %d messages (cursor=%s)", message_count, last_cursor)
return last_cursor
def _build_jetstream_url(collections: str, cursor: int | None) -> str:
"""Build the Jetstream WebSocket URL with query parameters.
Args:
collections: Comma-separated collection names.
cursor: Optional microsecond timestamp for resumption.
Returns:
The full WebSocket URL.
"""
params = ["compress=true"]
for raw_collection in collections.split(","):
cleaned = raw_collection.strip()
if cleaned:
params.append(f"wantedCollections={cleaned}")
if cursor is not None:
params.append(f"cursor={cursor}")
return f"{JETSTREAM_URL}?{'&'.join(params)}"
def _decode_frame(frame: str | bytes) -> str:
"""Decode a WebSocket frame, decompressing zstd if binary.
Jetstream with compress=true sends zstd-compressed binary frames.
Args:
frame: Raw WebSocket frame data.
Returns:
The decoded JSON string.
"""
if isinstance(frame, bytes):
return zstd.decompress(frame).decode()
return frame
def _is_post_create(message: dict) -> bool:
"""Check if a Jetstream message is a post creation commit."""
if message.get("kind") != "commit":
return False
commit = message.get("commit", {})
return commit.get("operation") == "create" and commit.get("collection") == POST_COLLECTION
def _create_producer(servers: str) -> Producer:
"""Create a configured Kafka producer.
Args:
servers: Kafka bootstrap servers string.
Returns:
A configured confluent_kafka.Producer.
"""
config = {
"bootstrap.servers": servers,
"linger.ms": 50,
"batch.size": 65536,
"compression.type": "zstd",
"acks": "all",
"retries": 5,
"retry.backoff.ms": 500,
}
return Producer(config)
def _delivery_callback(error: KafkaError | None, _message: object) -> None:
"""Log delivery failures from the Kafka producer."""
if error is not None:
logger.error("Kafka delivery failed: %s", error)
def _handle_shutdown(_signum: int, _frame: object) -> None:
"""Signal handler to trigger graceful shutdown."""
logger.info("Shutdown signal received")
shutdown_event.set()
if __name__ == "__main__":
app()
-247
View File
@@ -1,247 +0,0 @@
"""Ingestion pipeline for loading JSONL post files into the weekly-partitioned posts table.
Usage:
ingest-posts /path/to/files/
ingest-posts /path/to/single_file.jsonl
ingest-posts /data/dir/ --workers 4 --batch-size 5000
"""
from __future__ import annotations
import logging
from datetime import UTC, datetime
from pathlib import Path # noqa: TC003 this is needed for typer
from typing import TYPE_CHECKING, Annotated
import orjson
import psycopg
import typer
from python.common import configure_logger
from python.orm.common import get_connection_info
from python.parallelize import parallelize_process
if TYPE_CHECKING:
from collections.abc import Iterator
logger = logging.getLogger(__name__)
app = typer.Typer(help="Ingest JSONL post files into the partitioned posts table.")
@app.command()
def main(
path: Annotated[Path, typer.Argument(help="Directory containing JSONL files, or a single JSONL file")],
batch_size: Annotated[int, typer.Option(help="Rows per INSERT batch")] = 10000,
workers: Annotated[int, typer.Option(help="Parallel workers for multi-file ingestion")] = 4,
pattern: Annotated[str, typer.Option(help="Glob pattern for JSONL files")] = "*.jsonl",
) -> None:
"""Ingest JSONL post files into the weekly-partitioned posts table."""
configure_logger(level="INFO")
logger.info("starting ingest-posts")
logger.info("path=%s batch_size=%d workers=%d pattern=%s", path, batch_size, workers, pattern)
if path.is_file():
ingest_file(path, batch_size=batch_size)
elif path.is_dir():
ingest_directory(path, batch_size=batch_size, max_workers=workers, pattern=pattern)
else:
typer.echo(f"Path does not exist: {path}", err=True)
raise typer.Exit(code=1)
logger.info("ingest-posts done")
def ingest_directory(
directory: Path,
*,
batch_size: int,
max_workers: int,
pattern: str = "*.jsonl",
) -> None:
"""Ingest all JSONL files in a directory using parallel workers."""
files = sorted(directory.glob(pattern))
if not files:
logger.warning("No JSONL files found in %s", directory)
return
logger.info("Found %d JSONL files to ingest", len(files))
kwargs_list = [{"path": fp, "batch_size": batch_size} for fp in files]
parallelize_process(ingest_file, kwargs_list, max_workers=max_workers)
SCHEMA = "main"
COLUMNS = (
"post_id",
"user_id",
"instance",
"date",
"text",
"langs",
"like_count",
"reply_count",
"repost_count",
"reply_to",
"replied_author",
"thread_root",
"thread_root_author",
"repost_from",
"reposted_author",
"quotes",
"quoted_author",
"labels",
"sent_label",
"sent_score",
)
INSERT_FROM_STAGING = f"""
INSERT INTO {SCHEMA}.posts ({", ".join(COLUMNS)})
SELECT {", ".join(COLUMNS)} FROM pg_temp.staging
ON CONFLICT (post_id, date) DO NOTHING
""" # noqa: S608
FAILED_INSERT = f"""
INSERT INTO {SCHEMA}.failed_ingestion (raw_line, error)
VALUES (%(raw_line)s, %(error)s)
""" # noqa: S608
def get_psycopg_connection() -> psycopg.Connection:
"""Create a raw psycopg3 connection from environment variables."""
database, host, port, username, password = get_connection_info("DATA_SCIENCE_DEV")
return psycopg.connect(
dbname=database,
host=host,
port=int(port),
user=username,
password=password,
autocommit=False,
)
def ingest_file(path: Path, *, batch_size: int) -> None:
"""Ingest a single JSONL file into the posts table."""
log_trigger = max(100_000 // batch_size, 1)
failed_lines: list[dict] = []
try:
with get_psycopg_connection() as connection:
for index, batch in enumerate(read_jsonl_batches(path, batch_size, failed_lines), 1):
ingest_batch(connection, batch)
if index % log_trigger == 0:
logger.info("Ingested %d batches (%d rows) from %s", index, index * batch_size, path)
if failed_lines:
logger.warning("Recording %d malformed lines from %s", len(failed_lines), path.name)
with connection.cursor() as cursor:
cursor.executemany(FAILED_INSERT, failed_lines)
connection.commit()
except Exception:
logger.exception("Failed to ingest file: %s", path)
raise
def ingest_batch(connection: psycopg.Connection, batch: list[dict]) -> None:
"""COPY batch into a temp staging table, then INSERT ... ON CONFLICT into posts."""
if not batch:
return
try:
with connection.cursor() as cursor:
cursor.execute(f"""
CREATE TEMP TABLE IF NOT EXISTS staging
(LIKE {SCHEMA}.posts INCLUDING DEFAULTS)
ON COMMIT DELETE ROWS
""")
cursor.execute("TRUNCATE pg_temp.staging")
with cursor.copy(f"COPY pg_temp.staging ({', '.join(COLUMNS)}) FROM STDIN") as copy:
for row in batch:
copy.write_row(tuple(row.get(column) for column in COLUMNS))
cursor.execute(INSERT_FROM_STAGING)
connection.commit()
except Exception as error:
connection.rollback()
if len(batch) == 1:
logger.exception("Skipping bad row post_id=%s", batch[0].get("post_id"))
with connection.cursor() as cursor:
cursor.execute(
FAILED_INSERT,
{
"raw_line": orjson.dumps(batch[0], default=str).decode(),
"error": str(error),
},
)
connection.commit()
return
midpoint = len(batch) // 2
ingest_batch(connection, batch[:midpoint])
ingest_batch(connection, batch[midpoint:])
def read_jsonl_batches(file_path: Path, batch_size: int, failed_lines: list[dict]) -> Iterator[list[dict]]:
"""Stream a JSONL file and yield batches of transformed rows."""
batch: list[dict] = []
with file_path.open("r", encoding="utf-8") as handle:
for raw_line in handle:
line = raw_line.strip()
if not line:
continue
batch.extend(parse_line(line, file_path, failed_lines))
if len(batch) >= batch_size:
yield batch
batch = []
if batch:
yield batch
def parse_line(line: str, file_path: Path, failed_lines: list[dict]) -> Iterator[dict]:
"""Parse a JSONL line, handling concatenated JSON objects."""
try:
yield transform_row(orjson.loads(line))
except orjson.JSONDecodeError:
if "}{" not in line:
logger.warning("Skipping malformed line in %s: %s", file_path.name, line[:120])
failed_lines.append({"raw_line": line, "error": "malformed JSON"})
return
fragments = line.replace("}{", "}\n{").split("\n")
for fragment in fragments:
try:
yield transform_row(orjson.loads(fragment))
except (orjson.JSONDecodeError, KeyError, ValueError) as error:
logger.warning("Skipping malformed fragment in %s: %s", file_path.name, fragment[:120])
failed_lines.append({"raw_line": fragment, "error": str(error)})
except Exception as error:
logger.exception("Skipping bad row in %s: %s", file_path.name, line[:120])
failed_lines.append({"raw_line": line, "error": str(error)})
def transform_row(raw: dict) -> dict:
"""Transform a raw JSONL row into a dict matching the Posts table columns."""
raw["date"] = parse_date(raw["date"])
if raw.get("langs") is not None:
raw["langs"] = orjson.dumps(raw["langs"])
if raw.get("text") is not None:
raw["text"] = raw["text"].replace("\x00", "")
return raw
def parse_date(raw_date: int) -> datetime:
"""Parse compact YYYYMMDDHHmm integer into a naive datetime (input is UTC by spec)."""
return datetime(
raw_date // 100000000,
(raw_date // 1000000) % 100,
(raw_date // 10000) % 100,
(raw_date // 100) % 100,
raw_date % 100,
tzinfo=UTC,
)
if __name__ == "__main__":
app()
+3 -29
View File
@@ -4,12 +4,10 @@ Usage:
database <db_name> <command> [args...]
Examples:
database van_inventory upgrade head
database van_inventory downgrade head-1
database van_inventory revision --autogenerate -m "add meals table"
database van_inventory check
database richie check
database richie upgrade head
database richie downgrade head-1
database richie revision --autogenerate -m "add meals table"
"""
from __future__ import annotations
@@ -48,10 +46,7 @@ class DatabaseConfig:
def alembic_config(self) -> Config:
"""Build an alembic Config for this database."""
# Runtime import needed — Config is in TYPE_CHECKING for the return type annotation
from alembic.config import Config as AlembicConfig # noqa: PLC0415
cfg = AlembicConfig()
cfg = Config()
cfg.set_main_option("script_location", self.script_location)
cfg.set_main_option("file_template", self.file_template)
cfg.set_main_option("prepend_sys_path", ".")
@@ -76,27 +71,6 @@ DATABASES: dict[str, DatabaseConfig] = {
base_class_name="RichieBase",
models_module="python.orm.richie",
),
"van_inventory": DatabaseConfig(
env_prefix="VAN_INVENTORY",
version_location="python/alembic/van_inventory/versions",
base_module="python.orm.van_inventory.base",
base_class_name="VanInventoryBase",
models_module="python.orm.van_inventory.models",
),
"signal_bot": DatabaseConfig(
env_prefix="SIGNALBOT",
version_location="python/alembic/signal_bot/versions",
base_module="python.orm.signal_bot.base",
base_class_name="SignalBotBase",
models_module="python.orm.signal_bot.models",
),
"data_science_dev": DatabaseConfig(
env_prefix="DATA_SCIENCE_DEV",
version_location="python/alembic/data_science_dev/versions",
base_module="python.orm.data_science_dev.base",
base_class_name="DataScienceDevBase",
models_module="python.orm.data_science_dev.models",
),
}
+1
View File
@@ -0,0 +1 @@
"""EPUB search package."""
+57
View File
@@ -0,0 +1,57 @@
"""Grounded answer generation."""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from python.ebook_search.llm_interface import request_chat_completion
if TYPE_CHECKING:
from python.ebook_search.config import EbookSearchConfig
from python.ebook_search.search import SearchResult
logger = logging.getLogger(__name__)
def answer_query(query: str, results: list[SearchResult], config: EbookSearchConfig) -> str:
"""Answer a question using only retrieved chunks."""
if not config.answer_enabled:
logger.info("ebook_answer_skipped_disabled")
return "Answer generation is disabled. Source chunks are shown below."
if not results:
logger.info("ebook_answer_skipped_no_results")
return "No relevant sources were found."
logger.info(
"ebook_answer_request_start base_url=%s model=%s sources=%s query_length=%s",
config.vllm_base_url,
config.chat_model,
len(results),
len(query),
)
context = "\n\n".join(
f"[{index}] {result.source_title}{' - ' + result.chapter_title if result.chapter_title else ''}\n{result.text}"
for index, result in enumerate(results, start=1)
)
content = request_chat_completion(
config,
[
{
"role": "system",
"content": (
"Answer only from the provided context. Cite sources with bracketed numbers like [1]. "
"If the context is insufficient, say so."
),
},
{"role": "user", "content": f"Question:\n{query}\n\nContext:\n{context}"},
],
)
logger.info(
"ebook_answer_request_complete model=%s answer_length=%s",
config.chat_model,
len(content),
)
return content or "The model returned an empty answer."
+1
View File
@@ -0,0 +1 @@
"""Web and external API adapters for EPUB search."""
+60
View File
@@ -0,0 +1,60 @@
"""Background BM25 refresh tasks for the web app."""
from __future__ import annotations
import logging
from threading import Timer
from typing import TYPE_CHECKING
from sqlalchemy.orm import Session
from python.ebook_search.bm25_corpus import load_bm25_corpus, refresh_bm25_corpus
if TYPE_CHECKING:
from fastapi import FastAPI
from sqlalchemy.engine import Engine
from python.ebook_search.config import EbookSearchConfig
logger = logging.getLogger(__name__)
def schedule_bm25_refresh(app: FastAPI) -> None:
"""Schedule a delayed BM25 corpus refresh, replacing any pending refresh."""
existing_timer = getattr(app.state, "bm25_refresh_timer", None)
if existing_timer is not None:
existing_timer.cancel()
timer = Timer(app.state.config.bm25_refresh_delay_seconds, refresh_bm25_for_app, args=(app,))
timer.daemon = True
timer.start()
app.state.bm25_refresh_timer = timer
logger.info(
"ebook_bm25_refresh_scheduled delay_seconds=%s",
app.state.config.bm25_refresh_delay_seconds,
)
def cancel_bm25_refresh(app: FastAPI) -> None:
"""Cancel any pending BM25 corpus refresh."""
existing_timer = getattr(app.state, "bm25_refresh_timer", None)
if existing_timer is not None:
existing_timer.cancel()
app.state.bm25_refresh_timer = None
logger.info("ebook_bm25_refresh_cancelled")
def refresh_bm25_for_app(app: FastAPI) -> None:
"""Refresh the BM25 corpus using the app engine and config."""
try:
refresh_bm25_for_engine(app.state.engine, app.state.config)
except Exception:
logger.exception("ebook_bm25_refresh_failed")
def refresh_bm25_for_engine(engine: Engine, config: EbookSearchConfig) -> None:
"""Refresh the BM25 corpus using a SQLAlchemy engine."""
with Session(engine) as session:
refresh_bm25_corpus(session, config)
load_bm25_corpus.cache_clear()
logger.info("ebook_bm25_corpus_cache_cleared_after_refresh")
+79
View File
@@ -0,0 +1,79 @@
"""FastAPI HTMX app for EPUB search."""
from __future__ import annotations
import logging
from contextlib import asynccontextmanager
from typing import TYPE_CHECKING, Annotated
import typer
import uvicorn
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from sqlalchemy.orm import Session
from python.common import configure_logger
from python.ebook_search.api.bm25_tasks import cancel_bm25_refresh
from python.ebook_search.api.routes import admin_router, page_router, search_router
from python.ebook_search.api.web import STATIC_DIR
from python.ebook_search.bm25_corpus import ensure_bm25_corpus
from python.ebook_search.config import load_config
from python.fastapi_tools import ZstdMiddleware
from python.orm.common import get_postgres_engine
if TYPE_CHECKING:
from collections.abc import AsyncIterator
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
"""Manage application startup and shutdown resources."""
logger.info("ebook_search_startup")
app.state.engine = get_postgres_engine(name="RICHIE", vector_engine=True)
with Session(app.state.engine) as session:
ensure_bm25_corpus(session, app.state.config)
try:
yield
finally:
logger.info("ebook_search_shutdown")
cancel_bm25_refresh(app)
app.state.engine.dispose()
def create_app() -> FastAPI:
"""Create the EPUB search web app."""
app = FastAPI(title="EPUB Search", lifespan=lifespan)
app.add_middleware(ZstdMiddleware)
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
app.state.config = load_config()
logger.info(
"ebook_search_config_loaded top_k=%s embedding_model=%s rerank_enabled=%s answer_enabled=%s library_paths=%s",
app.state.config.top_k,
app.state.config.embedding_model,
app.state.config.rerank.enabled,
app.state.config.answer_enabled,
len(app.state.config.library_paths),
)
app.include_router(admin_router)
app.include_router(page_router)
app.include_router(search_router)
return app
def serve(
host: Annotated[str, typer.Option("--host", "-h", help="Host to bind to")] = "127.0.0.1",
port: Annotated[int, typer.Option("--port", "-p", help="Port to bind to")] = 8070,
log_level: Annotated[str, typer.Option("--log-level", "-l", help="Log level")] = "INFO",
) -> None:
"""Start the EPUB search server."""
configure_logger(log_level)
uvicorn.run(create_app(), host=host, port=port)
if __name__ == "__main__":
typer.run(serve)
@@ -0,0 +1,11 @@
"""EPUB search web route modules."""
from python.ebook_search.api.routes.admin import router as admin_router
from python.ebook_search.api.routes.page import router as page_router
from python.ebook_search.api.routes.search import router as search_router
__all__ = [
"admin_router",
"page_router",
"search_router",
]
+107
View File
@@ -0,0 +1,107 @@
"""Admin routes for the EPUB search web UI."""
from __future__ import annotations
import logging
from dataclasses import replace
from fastapi import APIRouter, Request
from fastapi.responses import HTMLResponse
from sqlalchemy.orm import Session
from python.ebook_search.api.bm25_tasks import schedule_bm25_refresh
from python.ebook_search.api.web import templates
from python.ebook_search.embeddings import embed_missing_chunks, embedding_model_stats
from python.ebook_search.ingest import ingest_configured_paths
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/admin")
EMBED_ALL_BATCH_SIZE = 32
@router.get("", response_class=HTMLResponse)
def admin(request: Request) -> HTMLResponse:
"""Render the admin page."""
with Session(request.app.state.engine) as session:
stats = embedding_model_stats(session)
logger.info("ebook_admin_page_loaded models=%s", len(stats))
return templates.TemplateResponse(request, "admin.html", {"config": request.app.state.config, "stats": stats})
@router.post("/scan", response_class=HTMLResponse)
def scan_library(request: Request) -> HTMLResponse:
"""Scan configured library paths for EPUB changes."""
try:
with Session(request.app.state.engine) as session:
count = ingest_configured_paths(session, request.app.state.config)
session.commit()
except Exception as error:
logger.exception("ebook_admin_scan_failed")
return templates.TemplateResponse(request, "partials/error.html", {"message": str(error)}, status_code=500)
logger.info("ebook_admin_scan_complete changed_files=%s", count)
if count > 0:
schedule_bm25_refresh(request.app)
return templates.TemplateResponse(request, "partials/admin_status.html", {"message": f"Indexed {count} EPUBs"})
@router.post("/embed-missing", response_class=HTMLResponse)
def embed_missing(request: Request) -> HTMLResponse:
"""Embed chunks missing vectors for the configured model."""
try:
with Session(request.app.state.engine) as session:
count = embed_missing_chunks(session, request.app.state.config)
session.commit()
except Exception as error:
logger.exception("ebook_admin_embed_missing_failed")
return templates.TemplateResponse(request, "partials/error.html", {"message": str(error)}, status_code=500)
logger.info("ebook_admin_embed_missing_complete chunks=%s", count)
return templates.TemplateResponse(
request,
"partials/admin_status.html",
{"message": f"Embedded {count} chunks"},
)
@router.post("/embed-all", response_class=HTMLResponse)
def embed_all(request: Request) -> HTMLResponse:
"""Embed all chunks missing vectors in fixed-size batches."""
total = 0
batches = 0
config = replace(request.app.state.config, embedding_batch_size=EMBED_ALL_BATCH_SIZE)
try:
with Session(request.app.state.engine) as session:
while True:
count = embed_missing_chunks(session, config)
if count == 0:
break
session.commit()
total += count
batches += 1
logger.info(
"ebook_admin_embed_all_batch_complete batch=%s chunks=%s total_chunks=%s",
batches,
count,
total,
)
except Exception as error:
logger.exception(
"ebook_admin_embed_all_failed batches=%s chunks=%s",
batches,
total,
)
return templates.TemplateResponse(
request,
"partials/error.html",
{"message": f"Embed all failed after {total} chunks in {batches} batches: {error}"},
status_code=500,
)
logger.info("ebook_admin_embed_all_complete batches=%s chunks=%s", batches, total)
return templates.TemplateResponse(
request,
"partials/admin_status.html",
{"message": f"Embedded {total} chunks in {batches} batches of {EMBED_ALL_BATCH_SIZE}"},
)
+57
View File
@@ -0,0 +1,57 @@
"""Page routes for the EPUB search web UI."""
from __future__ import annotations
import logging
from fastapi import APIRouter, Request
from fastapi.responses import HTMLResponse
from sqlalchemy import select
from sqlalchemy.orm import Session
from python.ebook_search.api.web import templates
from python.orm.richie import EbookSource
logger = logging.getLogger(__name__)
router = APIRouter()
@router.get("/", response_class=HTMLResponse)
def index(request: Request) -> HTMLResponse:
"""Render the search page."""
return templates.TemplateResponse(request, "search.html", {"config": request.app.state.config})
@router.get("/books", response_class=HTMLResponse)
def books(request: Request) -> HTMLResponse:
"""Render the indexed books page."""
with Session(request.app.state.engine) as session:
sources = list(session.scalars(select(EbookSource).order_by(EbookSource.title)).all())
logger.info("ebook_books_page_loaded count=%s", len(sources))
return templates.TemplateResponse(request, "books.html", {"sources": sources})
@router.get("/books/{source_id}", response_class=HTMLResponse)
def book_detail(source_id: int, request: Request) -> HTMLResponse:
"""Render details for one indexed book."""
with Session(request.app.state.engine) as session:
source = session.get(EbookSource, source_id)
if source is not None:
chapter_count = len(source.chapters)
chunk_count = len(source.chunks)
else:
chapter_count = 0
chunk_count = 0
logger.info(
"ebook_book_detail_loaded source_id=%s found=%s chapters=%s chunks=%s",
source_id,
source is not None,
chapter_count,
chunk_count,
)
return templates.TemplateResponse(
request,
"book_detail.html",
{"chapter_count": chapter_count, "chunk_count": chunk_count, "source": source},
)
+58
View File
@@ -0,0 +1,58 @@
"""Search routes for the EPUB search web UI."""
from __future__ import annotations
import logging
from dataclasses import replace
from time import perf_counter
from typing import Annotated
from fastapi import APIRouter, Form, Request
from fastapi.responses import HTMLResponse
from python.ebook_search.answer import answer_query
from python.ebook_search.api.web import templates
from python.ebook_search.search import search_ebooks
from python.ebook_search.timing import runtime_step_from_start
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post("/search", response_class=HTMLResponse)
def search(
request: Request,
query: Annotated[str, Form()],
rerank: Annotated[str | None, Form()] = None,
) -> HTMLResponse:
"""Run a search and render HTMX results."""
try:
response = search_ebooks(request.app.state.engine, query, request.app.state.config, rerank=rerank == "true")
except Exception as error:
logger.exception("ebook_search_request_failed")
return templates.TemplateResponse(request, "partials/error.html", {"message": str(error)}, status_code=500)
answer_start = perf_counter()
if request.app.state.config.answer_enabled:
try:
answer = answer_query(query, response.results, request.app.state.config)
except RuntimeError as error:
logger.warning("ebook_answer_request_failed_falling_back error=%s", error)
answer = "Answer generation failed. Source chunks are still shown below."
else:
logger.info("ebook_answer_skipped_disabled")
answer = "Answer generation is disabled. Source chunks are shown below."
answer_step_name = "Answer generation" if request.app.state.config.answer_enabled else "Answer skipped"
response = replace(
response,
timings=(*response.timings, runtime_step_from_start(answer_step_name, answer_start)),
)
logger.info(
"ebook_search_request_complete results=%s rank_label=%s runtime_ms=%.1f",
len(response.results),
response.rank_label,
response.total_runtime_ms,
)
return templates.TemplateResponse(request, "partials/results.html", {"answer": answer, "response": response})
+140
View File
@@ -0,0 +1,140 @@
body {
margin: 0;
background: #f7f7f4;
color: #202124;
font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
main {
max-width: 960px;
margin: 0 auto;
padding: 24px;
}
nav {
display: flex;
gap: 12px;
align-items: center;
margin-bottom: 20px;
}
nav form {
margin: 0;
}
.actions {
display: flex;
flex-wrap: wrap;
gap: 12px;
margin-bottom: 24px;
}
textarea {
display: block;
width: 100%;
margin: 8px 0 12px;
}
button {
padding: 8px 14px;
}
.check {
display: inline-flex;
gap: 8px;
align-items: center;
margin-right: 12px;
}
.rank-label {
margin-top: 24px;
font-weight: 700;
}
.results {
padding-left: 24px;
}
.meta,
.scores,
.status {
color: #626a73;
}
.scores {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin: 12px 0;
}
.scores div {
display: inline-flex;
gap: 4px;
align-items: baseline;
}
.scores dt {
font-weight: 700;
}
.scores dd {
margin: 0;
}
.runtime {
margin-top: 16px;
}
.timing-chart {
display: grid;
gap: 8px;
padding: 0;
list-style: none;
}
.timing-chart li {
display: grid;
grid-template-columns: minmax(150px, 1fr) minmax(160px, 2fr) auto auto;
gap: 8px;
align-items: center;
}
.timing-bar {
height: 10px;
overflow: hidden;
background: #e5e5df;
}
.timing-bar span {
display: block;
height: 100%;
background: #3767c8;
}
.timing-value,
.timing-remaining {
color: #626a73;
font-variant-numeric: tabular-nums;
}
table {
width: 100%;
border-collapse: collapse;
}
th,
td {
padding: 8px;
border-bottom: 1px solid #d8d8d2;
text-align: left;
}
th {
font-weight: 700;
}
.error {
color: #9f1d20;
font-weight: 700;
}
@@ -0,0 +1,57 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>EPUB Admin</title>
<script src="https://unpkg.com/htmx.org@2.0.4"></script>
<link rel="stylesheet" href="/static/style.css">
</head>
<body>
<main>
<nav>
<a href="/">Search</a>
<a href="/books">Books</a>
<a href="/admin">Admin</a>
</nav>
<h1>Admin</h1>
<section id="admin-status"></section>
<section class="actions">
<form hx-post="/admin/scan" hx-target="#admin-status" hx-swap="innerHTML">
<button type="submit">Scan</button>
</form>
<form hx-post="/admin/embed-missing" hx-target="#admin-status" hx-swap="innerHTML">
<button type="submit">Embed</button>
</form>
<form hx-post="/admin/embed-all" hx-target="#admin-status" hx-swap="innerHTML">
<button type="submit">Embed all</button>
</form>
</section>
<section>
<h2>Embeddings</h2>
<table>
<thead>
<tr>
<th>Model</th>
<th>Dimensions</th>
<th>Embedded</th>
<th>Missing</th>
<th>Total chunks</th>
</tr>
</thead>
<tbody>
{% for item in stats %}
<tr>
<td>{{ item.model_name }}</td>
<td>{{ item.dimension }}</td>
<td>{{ item.embedded_chunks }}</td>
<td>{{ item.missing_chunks }}</td>
<td>{{ item.total_chunks }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
</main>
</body>
</html>
@@ -0,0 +1,32 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% if source %}{{ source.title }}{% else %}Book not found{% endif %}</title>
<link rel="stylesheet" href="/static/style.css">
</head>
<body>
<main>
<nav>
<a href="/">Search</a>
<a href="/books">Books</a>
<a href="/admin">Admin</a>
</nav>
{% if source %}
<h1>{{ source.title }}</h1>
<p class="meta">{{ source.author or "Unknown author" }}</p>
<dl>
<dt>File</dt>
<dd>{{ source.file_path }}</dd>
<dt>Chapters</dt>
<dd>{{ chapter_count }}</dd>
<dt>Chunks</dt>
<dd>{{ chunk_count }}</dd>
</dl>
{% else %}
<h1>Book not found</h1>
{% endif %}
</main>
</body>
</html>
@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>EPUB Books</title>
<link rel="stylesheet" href="/static/style.css">
</head>
<body>
<main>
<nav>
<a href="/">Search</a>
<a href="/books">Books</a>
<a href="/admin">Admin</a>
</nav>
<h1>Books</h1>
{% if sources %}
<ol class="results">
{% for source in sources %}
<li>
<h2><a href="/books/{{ source.id }}">{{ source.title }}</a></h2>
<p class="meta">{{ source.author or "Unknown author" }}</p>
</li>
{% endfor %}
</ol>
{% else %}
<p>No EPUBs indexed.</p>
{% endif %}
</main>
</body>
</html>
@@ -0,0 +1 @@
<p class="status">{{ message }}</p>
@@ -0,0 +1 @@
<p class="error">{{ message }}</p>
@@ -0,0 +1,74 @@
<div class="rank-label">{{ response.rank_label }}</div>
{% if response.timings %}
<section class="runtime">
<h2>Runtime</h2>
<p class="meta">Total {{ "%.1f"|format(response.total_runtime_ms) }} ms</p>
<ol class="timing-chart">
{% set total = response.total_runtime_ms %}
{% set ns = namespace(remaining=total) %}
{% for step in response.timings %}
{% set width = (step.duration_ms / total * 100) if total else 0 %}
{% if step.counts_toward_total %}
{% set ns.remaining = ns.remaining - step.duration_ms %}
{% endif %}
<li>
<span class="timing-label">{{ step.name }}</span>
<span class="timing-bar"><span style="width: {{ "%.2f"|format(width) }}%"></span></span>
<span class="timing-value">{{ "%.1f"|format(step.duration_ms) }} ms</span>
<span class="timing-remaining">{{ "%.1f"|format([ns.remaining, 0]|max) }} ms left</span>
</li>
{% endfor %}
</ol>
</section>
{% endif %}
<section class="answer">
<h2>Answer</h2>
<p>{{ answer }}</p>
</section>
{% if response.results %}
<ol class="results">
{% for result in response.results %}
<li>
<h2>{{ result.source_title }}</h2>
<p class="meta">
{% if result.source_author %}{{ result.source_author }}{% endif %}
{% if result.chapter_title %} · {{ result.chapter_title }}{% endif %}
{% if result.page_label %} · page {{ result.page_label }}{% endif %}
</p>
<p>{{ result.text }}</p>
<dl class="scores">
<div>
<dt>final</dt>
<dd>{{ "%.3f"|format(result.score) }}</dd>
</div>
{% if result.rerank_score is not none %}
<div>
<dt>rerank</dt>
<dd>{{ "%.3f"|format(result.rerank_score) }}</dd>
</div>
{% endif %}
{% if result.vector_score is not none %}
<div>
<dt>vector cosine</dt>
<dd>{{ "%.3f"|format(result.vector_score) }}</dd>
</div>
{% endif %}
{% if result.bm25_score is not none %}
<div>
<dt>BM25</dt>
<dd>{{ "%.6f"|format(result.bm25_score) }}</dd>
</div>
{% endif %}
{% if result.fused_score is not none %}
<div>
<dt>RRF</dt>
<dd>{{ "%.3f"|format(result.fused_score) }}</dd>
</div>
{% endif %}
</dl>
</li>
{% endfor %}
</ol>
{% else %}
<p>No results.</p>
{% endif %}
@@ -0,0 +1,30 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>EPUB Search</title>
<script src="https://unpkg.com/htmx.org@2.0.4"></script>
<link rel="stylesheet" href="/static/style.css">
</head>
<body>
<main>
<nav>
<a href="/">Search</a>
<a href="/books">Books</a>
<a href="/admin">Admin</a>
</nav>
<h1>EPUB Search</h1>
<form hx-post="/search" hx-target="#results" hx-swap="innerHTML">
<label for="query">Search</label>
<textarea id="query" name="query" rows="4" required></textarea>
<label class="check">
<input type="checkbox" name="rerank" value="true" {% if config.rerank.enabled %}checked{% endif %}>
Rerank
</label>
<button type="submit">Search</button>
</form>
<section id="results"></section>
</main>
</body>
</html>
+13
View File
@@ -0,0 +1,13 @@
"""Shared web UI resources for EPUB search."""
from __future__ import annotations
from pathlib import Path
from fastapi.templating import Jinja2Templates
PACKAGE_DIR = Path(__file__).resolve().parent
TEMPLATE_DIR = PACKAGE_DIR / "templates"
STATIC_DIR = PACKAGE_DIR / "static"
templates = Jinja2Templates(directory=TEMPLATE_DIR)
+281
View File
@@ -0,0 +1,281 @@
"""Persisted BM25 corpus management."""
from __future__ import annotations
import json
import logging
import shutil
from dataclasses import dataclass
from datetime import UTC, datetime
from functools import cache
from pathlib import Path
from typing import TYPE_CHECKING
import bm25s
from sqlalchemy import func, select, union_all
from python.orm.richie import EbookChapter, EbookChunk, EbookSource
if TYPE_CHECKING:
from sqlalchemy.orm import Session
from python.ebook_search.config import EbookSearchConfig
logger = logging.getLogger(__name__)
MANIFEST_NAME = "manifest.json"
REQUIRED_INDEX_FILES = frozenset(
{
"data.csc.index.npy",
"indices.csc.index.npy",
"indptr.csc.index.npy",
"params.index.json",
"vocab.index.json",
"corpus.jsonl",
}
)
@dataclass(frozen=True)
class BM25Manifest:
"""Metadata describing a persisted BM25 corpus."""
created_at: datetime
db_updated_at: datetime | None
chunk_count: int
@dataclass(frozen=True)
class BM25Corpus:
"""Loaded persisted BM25 corpus and retriever."""
retriever: object | None
records: tuple[dict[str, object], ...]
manifest: BM25Manifest
class BM25CorpusUnavailableError(RuntimeError):
"""Raised when the persisted BM25 corpus cannot be loaded."""
def bm25_index_path(config: EbookSearchConfig) -> Path:
"""Return the configured BM25 index root path relative to the current working directory."""
path = Path(config.bm25_index_dir).expanduser()
if path.is_absolute():
return path
return Path.cwd() / path
def get_current_bm25_index(index_path: Path) -> Path:
"""Return the live BM25 index directory."""
current_path = index_path / "current"
if current_path.exists() or current_path.is_symlink():
return current_path
return index_path
def ensure_bm25_corpus(session: Session, config: EbookSearchConfig) -> None:
"""Create or refresh the persisted BM25 corpus when it is missing or stale."""
index_path = bm25_index_path(config)
manifest = read_bm25_manifest(index_path)
db_updated_at = corpus_last_updated_at(session)
if not bm25_index_exists(index_path, manifest):
logger.info("ebook_bm25_index_missing path=%s", index_path)
refresh_bm25_corpus(session, config, db_updated_at=db_updated_at)
return
if db_updated_at is not None and manifest is not None and manifest.created_at < db_updated_at:
logger.info(
"ebook_bm25_index_stale path=%s created_at=%s db_updated_at=%s",
index_path,
manifest.created_at.isoformat(),
db_updated_at.isoformat(),
)
refresh_bm25_corpus(session, config, db_updated_at=db_updated_at)
return
logger.info(
"ebook_bm25_index_current path=%s chunks=%s created_at=%s",
index_path,
manifest.chunk_count if manifest else 0,
manifest.created_at.isoformat() if manifest else None,
)
def refresh_bm25_corpus(
session: Session,
config: EbookSearchConfig,
*,
db_updated_at: datetime | None = None,
) -> BM25Manifest:
"""Rebuild and persist the BM25 corpus from the current database chunks."""
index_path = bm25_index_path(config)
records, texts = fetch_bm25_corpus_records(session)
manifest = BM25Manifest(
created_at=datetime.now(tz=UTC),
db_updated_at=db_updated_at if db_updated_at is not None else corpus_last_updated_at(session),
chunk_count=len(records),
)
write_bm25_corpus(index_path, records, texts, manifest)
logger.info(
"ebook_bm25_index_refreshed path=%s chunks=%s created_at=%s",
index_path,
manifest.chunk_count,
manifest.created_at.isoformat(),
)
return manifest
@cache
def load_bm25_corpus(config: EbookSearchConfig) -> BM25Corpus:
"""Load the BM25 corpus into memory once per process.
Background refresh tasks clear this cache after rebuilding the on-disk corpus.
"""
index_path = bm25_index_path(config)
active_index_path = get_current_bm25_index(index_path)
logger.info("ebook_bm25_corpus_cache_load path=%s active_path=%s", index_path, active_index_path)
manifest = read_bm25_manifest(index_path)
if manifest is None or not bm25_index_exists(index_path, manifest):
msg = f"BM25 corpus is not available: {index_path}"
raise BM25CorpusUnavailableError(msg)
if manifest.chunk_count == 0:
return BM25Corpus(retriever=None, records=(), manifest=manifest)
retriever = bm25s.BM25.load(active_index_path, load_corpus=True, mmap=True)
records = tuple(dict(record) for record in retriever.corpus)
return BM25Corpus(retriever=retriever, records=records, manifest=manifest)
def score_bm25_corpus(query: str, corpus: BM25Corpus, *, limit: int) -> list[tuple[dict[str, object], float]]:
"""Score a query against a loaded BM25 corpus."""
if corpus.retriever is None or not corpus.records:
return []
k = min(limit, len(corpus.records))
documents, scores = corpus.retriever.retrieve(
bm25s.tokenize(query, show_progress=False),
corpus=list(corpus.records),
k=k,
show_progress=False,
)
results: list[tuple[dict[str, object], float]] = []
for document, score in zip(documents[0], scores[0], strict=True):
score_value = float(score)
if score_value <= 0:
continue
results.append((dict(document), score_value))
return results
def fetch_bm25_corpus_records(session: Session) -> tuple[list[dict[str, object]], list[str]]:
"""Fetch persistable BM25 corpus records and their matching index texts from the database.
search_text is only needed to build the index, so it is returned separately instead of
being persisted into the corpus records, which would double the corpus size.
"""
statement = (
select(
EbookChunk.id.label("chunk_id"),
EbookChunk.text.label("text"),
EbookSource.title.label("source_title"),
EbookSource.author.label("source_author"),
EbookChapter.title.label("chapter_title"),
EbookChunk.page_label.label("page_label"),
EbookChunk.search_text.label("bm25_text"),
)
.select_from(EbookChunk)
.join(EbookSource, EbookSource.id == EbookChunk.source_id)
.outerjoin(EbookChapter, EbookChapter.id == EbookChunk.chapter_id)
.order_by(EbookChunk.id)
)
records: list[dict[str, object]] = []
texts: list[str] = []
for row in session.execute(statement).mappings():
record = dict(row)
texts.append(str(record.pop("bm25_text")))
records.append(record)
return records, texts
def corpus_last_updated_at(session: Session) -> datetime | None:
"""Return the latest source/chapter/chunk update timestamp relevant to BM25 text."""
update_times = union_all(
select(func.max(EbookSource.updated).label("updated")),
select(func.max(EbookChapter.updated).label("updated")),
select(func.max(EbookChunk.updated).label("updated")),
).subquery()
return session.scalar(select(func.max(update_times.c.updated)))
def write_bm25_corpus(
index_path: Path,
records: list[dict[str, object]],
texts: list[str],
manifest: BM25Manifest,
) -> None:
"""Write a BM25 corpus generation and publish it through the current symlink."""
index_path.mkdir(parents=True, exist_ok=True)
generations_path = index_path / "generations"
generations_path.mkdir(exist_ok=True)
generation_path = next_bm25_generation_path(generations_path, manifest.created_at)
current_path = index_path / "current"
next_current_path = index_path / f".current.{generation_path.name}.tmp"
try:
generation_path.mkdir()
# Empty corpora publish a manifest-only generation so startup succeeds before any chunks exist.
if records:
retriever = bm25s.BM25()
retriever.index(bm25s.tokenize(texts, show_progress=False), show_progress=False)
retriever.save(generation_path, corpus=records, show_progress=False)
write_bm25_manifest(generation_path, manifest)
next_current_path.unlink(missing_ok=True)
next_current_path.symlink_to(generation_path, target_is_directory=True)
next_current_path.replace(current_path)
except Exception:
next_current_path.unlink(missing_ok=True)
shutil.rmtree(generation_path, ignore_errors=True)
raise
def read_bm25_manifest(index_path: Path) -> BM25Manifest | None:
"""Read the BM25 manifest if it exists and is valid."""
manifest_path = get_current_bm25_index(index_path) / MANIFEST_NAME
if not manifest_path.exists():
return None
body = json.loads(manifest_path.read_text(encoding="utf-8"))
return BM25Manifest(
created_at=datetime.fromisoformat(str(body["created_at"])),
db_updated_at=datetime.fromisoformat(str(body["db_updated_at"])) if body.get("db_updated_at") else None,
chunk_count=int(body["chunk_count"]),
)
def write_bm25_manifest(index_path: Path, manifest: BM25Manifest) -> None:
"""Write the BM25 manifest to an index directory."""
body = {
"created_at": manifest.created_at.isoformat(),
"db_updated_at": manifest.db_updated_at.isoformat() if manifest.db_updated_at else None,
"chunk_count": manifest.chunk_count,
}
(index_path / MANIFEST_NAME).write_text(json.dumps(body, indent=2, sort_keys=True), encoding="utf-8")
def bm25_index_exists(index_path: Path, manifest: BM25Manifest | None) -> bool:
"""Return whether a usable persisted BM25 index exists."""
active_index_path = get_current_bm25_index(index_path)
if manifest is None or not active_index_path.is_dir():
return False
if manifest.chunk_count == 0:
return True
return all((active_index_path / file_name).exists() for file_name in REQUIRED_INDEX_FILES)
def next_bm25_generation_path(generations_path: Path, created_at: datetime) -> Path:
"""Return an unused dated BM25 generation path."""
base_name = created_at.astimezone(UTC).strftime("%Y%m%dT%H%M%S.%fZ")
generation_path = generations_path / base_name
suffix = 1
while generation_path.exists():
generation_path = generations_path / f"{base_name}.{suffix}"
suffix += 1
return generation_path
+117
View File
@@ -0,0 +1,117 @@
"""Configuration for the EPUB search app."""
from __future__ import annotations
from dataclasses import dataclass
from os import getenv
def getenv_bool(name: str, *, default: bool) -> bool:
"""Read a boolean environment variable with a default fallback."""
value = getenv(name)
if value is None:
return default
return value.strip().lower() in {"1", "true", "yes", "on"}
def getenv_int(name: str, *, default: int) -> int:
"""Read an integer environment variable with a default fallback."""
value = getenv(name)
if value is None or not value.strip():
return default
return int(value)
@dataclass(frozen=True)
class RerankConfig:
"""vLLM reranker settings."""
enabled: bool = False
base_url: str = "http://192.168.90.25:8001"
model: str = "qwen3-reranker-06b"
candidates: int = 24
timeout_seconds: float = 30.0
@dataclass(frozen=True)
class EbookSearchConfig:
"""Runtime settings for EPUB search."""
rerank: RerankConfig
top_k: int = 12
library_paths: tuple[str, ...] = ()
vllm_base_url: str = "https://ollama.com/v1"
vllm_api_key: str = "not-needed"
chat_model: str = "deepseek-v4-flash"
answer_enabled: bool = True
embedding_base_url: str = "http://192.168.90.25:8000/v1"
embedding_api_key: str = "not-needed"
embedding_model: str = "qwen3-embedding-0.6b"
embedding_batch_size: int = 32
bm25_index_dir: str = ".ebook_search_bm25"
bm25_refresh_delay_seconds: int = 60
def load_rerank_config() -> RerankConfig:
"""Load reranker config from environment variables."""
return RerankConfig(
enabled=getenv_bool("EBOOK_SEARCH_RERANK_ENABLED", default=False),
base_url=getenv("EBOOK_SEARCH_RERANK_BASE_URL", "http://192.168.90.25:8001"),
model=getenv("EBOOK_SEARCH_RERANK_MODEL", "qwen3-reranker-06b"),
candidates=getenv_int("EBOOK_SEARCH_RERANK_CANDIDATES", default=24),
timeout_seconds=float(getenv_int("EBOOK_SEARCH_RERANK_TIMEOUT_SECONDS", default=30)),
)
def load_config() -> EbookSearchConfig:
"""Load EPUB search config from environment variables."""
return EbookSearchConfig(
rerank=load_rerank_config(),
top_k=getenv_int("EBOOK_SEARCH_TOP_K", default=12),
library_paths=library_paths_from_env(),
vllm_base_url=getenv("EBOOK_SEARCH_VLLM_BASE_URL", "https://ollama.com/v1"),
vllm_api_key=getenv("EBOOK_SEARCH_VLLM_API_KEY") or getenv("OLLAMA_API_KEY") or "not-needed",
chat_model=getenv("EBOOK_SEARCH_CHAT_MODEL", "deepseek-v4-flash"),
answer_enabled=getenv_bool("EBOOK_SEARCH_ANSWER_ENABLED", default=True),
embedding_base_url=getenv("EBOOK_SEARCH_EMBEDDING_BASE_URL", "http://192.168.90.25:8000/v1"),
embedding_api_key=getenv("EBOOK_SEARCH_EMBEDDING_API_KEY", "not-needed"),
embedding_model=normalize_embedding_model(),
embedding_batch_size=getenv_int("EBOOK_SEARCH_EMBEDDING_BATCH_SIZE", default=32),
bm25_index_dir=getenv("EBOOK_SEARCH_BM25_INDEX_DIR", ".ebook_search_bm25"),
bm25_refresh_delay_seconds=getenv_int("EBOOK_SEARCH_BM25_REFRESH_DELAY_SECONDS", default=60),
)
def normalize_embedding_model(default: str = "qwen3-embedding-0.6b") -> str:
"""Normalize supported embedding aliases to provider model names."""
aliases = {
"Qwen3-Embedding-0.6B": "qwen3-embedding-0.6b",
"Qwen3-Embedding-4B": "qwen3-embedding-4b",
"Qwen3-Embedding-8B": "qwen3-embedding-8b",
"Qwen/Qwen3-Embedding-0.6B": "qwen3-embedding-0.6b",
"Qwen/Qwen3-Embedding-4B": "qwen3-embedding-4b",
"Qwen/Qwen3-Embedding-8B": "qwen3-embedding-8b",
"qwen3-embedding:0.6b": "qwen3-embedding-0.6b",
"qwen3-embedding:4b": "qwen3-embedding-4b",
"qwen3-embedding:8b": "qwen3-embedding-8b",
"qwen3-embedding-0.6b": "qwen3-embedding-0.6b",
"qwen3-embedding-4b": "qwen3-embedding-4b",
"qwen3-embedding-8b": "qwen3-embedding-8b",
}
model = getenv("EBOOK_SEARCH_EMBEDDING_MODEL", default)
standard_model = aliases.get(model)
if standard_model is None:
error = f"Embedding model {model} is not supported. Supported models are {aliases.keys()}"
raise ValueError(error)
return standard_model
def library_paths_from_env() -> tuple[str, ...]:
"""Read configured EPUB library paths from the environment."""
value = getenv("EBOOK_SEARCH_LIBRARY_PATHS")
if value is None:
return ()
return tuple(path for path in value.split(":") if path)
+170
View File
@@ -0,0 +1,170 @@
"""Embedding model helpers."""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING
from sqlalchemy import func, select
from sqlalchemy.dialects.postgresql import insert
from python.ebook_search.llm_interface import request_embeddings
from python.orm.richie import (
EbookChunk,
EbookChunkEmbedding1024,
EbookChunkEmbedding2560,
EbookChunkEmbedding4096,
EbookEmbeddingModel,
)
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from collections.abc import Sequence
from sqlalchemy.orm import Session
from python.ebook_search.config import EbookSearchConfig
MODEL_DIMENSIONS = {
"qwen3-embedding-0.6b": 1024,
"qwen3-embedding-4b": 2560,
"qwen3-embedding-8b": 4096,
}
def get_embedding_table(
dimension: int,
) -> type[EbookChunkEmbedding1024 | EbookChunkEmbedding2560 | EbookChunkEmbedding4096]:
"""Return the embedding table mapped to an embedding dimension."""
embedding_tables = {
1024: EbookChunkEmbedding1024,
2560: EbookChunkEmbedding2560,
4096: EbookChunkEmbedding4096,
}
table = embedding_tables.get(dimension)
if not table:
msg = f"Embedding dimension {dimension} is not supported"
raise ValueError(msg)
return table
@dataclass(frozen=True)
class EmbeddingModelStats:
"""Embedding coverage for one model."""
model_name: str
dimension: int
embedded_chunks: int
total_chunks: int
@property
def missing_chunks(self) -> int:
"""Return chunks missing this embedding model."""
return max(self.total_chunks - self.embedded_chunks, 0)
def embed_texts(texts: Sequence[str], config: EbookSearchConfig) -> list[list[float]]:
"""Embed text with the configured vLLM embedding model."""
logger.info(
"ebook_embed_request_start base_url=%s model=%s count=%s",
config.embedding_base_url,
config.embedding_model,
len(texts),
)
vectors = request_embeddings(texts, config)
expected_dimension = MODEL_DIMENSIONS[config.embedding_model]
for vector in vectors:
if len(vector) != expected_dimension:
msg = f"Expected {expected_dimension} dimensions, got {len(vector)}"
raise ValueError(msg)
logger.info(
"ebook_embed_request_complete model=%s count=%s dimension=%s",
config.embedding_model,
len(vectors),
expected_dimension,
)
return vectors
def embed_query(query: str, config: EbookSearchConfig) -> list[float]:
"""Embed a search query with the Qwen retrieval instruction."""
instructed_query = f"Instruct: Retrieve relevant passages for the query.\nQuery: {query}"
return embed_texts([instructed_query], config)[0]
def ensure_embedding_models(session: Session) -> None:
"""Ensure supported embedding model rows exist."""
for name, dimension in MODEL_DIMENSIONS.items():
existing = session.scalar(select(EbookEmbeddingModel).where(EbookEmbeddingModel.name == name))
if existing is None:
session.add(EbookEmbeddingModel(name=name, dimension=dimension, is_default=name == "qwen3-embedding-0.6b"))
logger.info("ebook_embedding_model_created model=%s dimension=%s", name, dimension)
session.flush()
def embedding_model_stats(session: Session) -> list[EmbeddingModelStats]:
"""Return embedding coverage counts for every supported model."""
total_chunks = session.scalar(select(func.count(EbookChunk.id))) or 0
models = {
model.name: model
for model in session.scalars(
select(EbookEmbeddingModel)
.where(EbookEmbeddingModel.name.in_(MODEL_DIMENSIONS))
.order_by(EbookEmbeddingModel.name)
)
}
stats: list[EmbeddingModelStats] = []
for model_name, dimension in MODEL_DIMENSIONS.items():
model = models.get(model_name)
embedded_chunks = 0
if model is not None:
table = get_embedding_table(dimension)
embedded_chunks = session.scalar(select(func.count(table.id)).where(table.model_id == model.id)) or 0
stats.append(
EmbeddingModelStats(
model_name=model_name,
dimension=dimension,
embedded_chunks=embedded_chunks,
total_chunks=total_chunks,
)
)
return stats
def embed_missing_chunks(session: Session, config: EbookSearchConfig) -> int:
"""Embed chunks missing embeddings for the configured model."""
ensure_embedding_models(session)
model = session.scalar(select(EbookEmbeddingModel).where(EbookEmbeddingModel.name == config.embedding_model))
if model is None:
supported_models = ", ".join(MODEL_DIMENSIONS)
msg = f"Unknown embedding model: {config.embedding_model}. Supported models: {supported_models}"
raise ValueError(msg)
table = get_embedding_table(model.dimension)
chunks = list(
session.scalars(
select(EbookChunk)
.outerjoin(table, (table.chunk_id == EbookChunk.id) & (table.model_id == model.id))
.where(table.id.is_(None))
.order_by(EbookChunk.id)
.limit(config.embedding_batch_size)
)
)
if not chunks:
logger.info("ebook_embed_missing_none model=%s", config.embedding_model)
return 0
logger.info("ebook_embed_missing_batch_start model=%s count=%s", config.embedding_model, len(chunks))
vectors = embed_texts([chunk.text for chunk in chunks], config)
rows = [
{"chunk_id": chunk.id, "model_id": model.id, "embedding": vector}
for chunk, vector in zip(chunks, vectors, strict=True)
]
statement = insert(table).values(rows).on_conflict_do_nothing(index_elements=["chunk_id", "model_id"])
session.execute(statement)
session.flush()
logger.info("ebook_embed_missing_batch_complete model=%s count=%s", config.embedding_model, len(rows))
return len(rows)
+95
View File
@@ -0,0 +1,95 @@
"""EPUB parsing helpers."""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import TYPE_CHECKING
from bs4 import BeautifulSoup
from ebooklib import ITEM_DOCUMENT, epub
if TYPE_CHECKING:
from pathlib import Path
WHITESPACE_RE = re.compile(r"\s+")
@dataclass(frozen=True)
class ParsedChapter:
"""Text extracted from one EPUB spine document."""
title: str | None
href: str | None
text: str
page_labels: tuple[str, ...]
@dataclass(frozen=True)
class ParsedEpub:
"""Parsed EPUB metadata and text."""
title: str
author: str | None
language: str | None
publisher: str | None
identifier: str | None
chapters: tuple[ParsedChapter, ...]
def parse_epub(path: Path) -> ParsedEpub:
"""Parse EPUB metadata and spine text."""
book = epub.read_epub(path)
chapters = []
for item in book.get_items_of_type(ITEM_DOCUMENT):
soup = BeautifulSoup(item.get_content(), "html.parser")
title = chapter_title(soup)
page_labels = tuple(extract_page_labels(soup))
text = clean_text(soup.get_text(" "))
if text:
chapters.append(ParsedChapter(title=title, href=item.get_name(), text=text, page_labels=page_labels))
return ParsedEpub(
title=metadata_value(book, "title") or path.stem,
author=metadata_value(book, "creator"),
language=metadata_value(book, "language"),
publisher=metadata_value(book, "publisher"),
identifier=metadata_value(book, "identifier"),
chapters=tuple(chapters),
)
def metadata_value(book: epub.EpubBook, name: str) -> str | None:
"""Return the first non-empty Dublin Core metadata value for a name."""
values = book.get_metadata("DC", name)
if not values:
return None
value = values[0][0]
return str(value).strip() or None
def chapter_title(soup: BeautifulSoup) -> str | None:
"""Extract the best available title from an EPUB document soup."""
heading = soup.find(["h1", "h2", "h3"])
if heading is None:
title = soup.find("title")
if title is None:
return None
return clean_text(title.get_text(" ")) or None
return clean_text(heading.get_text(" ")) or None
def extract_page_labels(soup: BeautifulSoup) -> list[str]:
"""Extract EPUB page-break labels from a document soup."""
labels: list[str] = []
for tag in soup.find_all(attrs={"epub:type": "pagebreak"}):
label = tag.get("title") or tag.get("aria-label") or tag.get_text(" ")
clean = clean_text(str(label))
if clean:
labels.append(clean)
return labels
def clean_text(text: str) -> str:
"""Normalize whitespace in extracted EPUB text."""
return WHITESPACE_RE.sub(" ", text).strip()
+190
View File
@@ -0,0 +1,190 @@
"""EPUB ingestion into Richie DB."""
from __future__ import annotations
import hashlib
import logging
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING
import tiktoken
from sqlalchemy import or_, select
from python.ebook_search.epub_parse import parse_epub
from python.orm.richie import EbookChapter, EbookChunk, EbookSource
logger = logging.getLogger(__name__)
DEFAULT_CHUNK_TOKENS = 700
DEFAULT_CHUNK_OVERLAP = 100
if TYPE_CHECKING:
from sqlalchemy.orm import Session
from python.ebook_search.config import EbookSearchConfig
from python.ebook_search.epub_parse import ParsedChapter
@dataclass(frozen=True)
class TextChunk:
"""A token-bounded chunk of text."""
text: str
token_start: int
token_count: int
def chunk_text(
text: str,
*,
chunk_tokens: int = DEFAULT_CHUNK_TOKENS,
overlap_tokens: int = DEFAULT_CHUNK_OVERLAP,
) -> list[TextChunk]:
"""Split text into overlapping token chunks."""
if chunk_tokens <= 0:
msg = "chunk_tokens must be positive"
raise ValueError(msg)
if overlap_tokens < 0 or overlap_tokens >= chunk_tokens:
msg = "overlap_tokens must be non-negative and smaller than chunk_tokens"
raise ValueError(msg)
encoding = tiktoken.get_encoding("cl100k_base")
tokens = encoding.encode(text)
if not tokens:
return []
chunks: list[TextChunk] = []
step = chunk_tokens - overlap_tokens
for start in range(0, len(tokens), step):
chunk = tokens[start : start + chunk_tokens]
if not chunk:
continue
chunks.append(
TextChunk(
text=encoding.decode(chunk).strip(),
token_start=start,
token_count=len(chunk),
)
)
if start + chunk_tokens >= len(tokens):
break
return [chunk for chunk in chunks if chunk.text]
def ingest_configured_paths(session: Session, config: EbookSearchConfig) -> int:
"""Ingest every EPUB found under configured library paths."""
count = 0
for library_path in config.library_paths:
path = Path(library_path).expanduser()
logger.info("ebook_ingest_path_start path=%s", path)
if path.is_file() and path.suffix.lower() == ".epub":
count += int(ingest_file(session, path))
elif path.is_dir():
for epub_path in sorted(path.rglob("*.epub")):
count += int(ingest_file(session, epub_path))
else:
logger.warning("ebook_ingest_path_missing path=%s", path)
logger.info("ebook_ingest_paths_complete changed_files=%s configured_paths=%s", count, len(config.library_paths))
return count
def ingest_file(session: Session, path: Path) -> bool:
"""Ingest one EPUB file. Return True when the database changed."""
resolved_path = path.expanduser().resolve()
logger.info("ebook_ingest_file_start path=%s", resolved_path)
file_hash = sha256_file(resolved_path)
existing = find_existing_source(session, resolved_path, file_hash)
if existing is not None and existing.file_sha256 == file_hash:
stat = resolved_path.stat()
existing.file_path = str(resolved_path)
existing.file_mtime = datetime.fromtimestamp(stat.st_mtime, tz=UTC)
existing.file_size = stat.st_size
session.flush()
logger.info("ebook_ingest_file_unchanged source_id=%s path=%s", existing.id, resolved_path)
return False
if existing is not None:
logger.info("ebook_ingest_file_replacing source_id=%s path=%s", existing.id, resolved_path)
session.delete(existing)
session.flush()
stat = resolved_path.stat()
parsed = parse_epub(resolved_path)
source = EbookSource(
title=parsed.title,
author=parsed.author,
language=parsed.language,
publisher=parsed.publisher,
identifier=parsed.identifier,
file_path=str(resolved_path),
file_sha256=file_hash,
file_mtime=datetime.fromtimestamp(stat.st_mtime, tz=UTC),
file_size=stat.st_size,
)
session.add(source)
session.flush()
chunk_index = 0
for spine_index, parsed_chapter in enumerate(parsed.chapters):
chapter = EbookChapter(
source_id=source.id,
spine_index=spine_index,
title=parsed_chapter.title,
href=parsed_chapter.href,
)
session.add(chapter)
session.flush()
chunk_index = add_chapter_chunks(session, source, chapter, parsed_chapter, chunk_index)
session.flush()
logger.info(
"ebook_ingest_file_complete source_id=%s path=%s chapters=%s chunks=%s",
source.id,
resolved_path,
len(parsed.chapters),
chunk_index,
)
return True
def find_existing_source(session: Session, path: Path, file_hash: str) -> EbookSource | None:
"""Find an existing source by canonical path or file hash."""
return session.scalar(
select(EbookSource).where(or_(EbookSource.file_path == str(path), EbookSource.file_sha256 == file_hash))
)
def add_chapter_chunks(
session: Session,
source: EbookSource,
chapter: EbookChapter,
parsed_chapter: ParsedChapter,
chunk_index: int,
) -> int:
"""Add chunk rows for one parsed chapter and return the next chunk index."""
page_label = parsed_chapter.page_labels[0] if parsed_chapter.page_labels else None
for text_chunk in chunk_text(parsed_chapter.text):
session.add(
EbookChunk(
source_id=source.id,
chapter_id=chapter.id,
chunk_index=chunk_index,
text=text_chunk.text,
token_start=text_chunk.token_start,
token_count=text_chunk.token_count,
page_label=page_label,
content_sha256=hashlib.sha256(text_chunk.text.encode()).hexdigest(),
search_text=f"{source.title} {source.author or ''} {chapter.title or ''} {text_chunk.text}",
)
)
chunk_index += 1
return chunk_index
def sha256_file(path: Path) -> str:
"""Calculate the SHA-256 digest for a file."""
digest = hashlib.sha256()
with path.open("rb") as file:
for block in iter(lambda: file.read(1024 * 1024), b""):
digest.update(block)
return digest.hexdigest()
+143
View File
@@ -0,0 +1,143 @@
"""LLM provider HTTP adapters."""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
import httpx
if TYPE_CHECKING:
from collections.abc import Sequence
from python.ebook_search.config import EbookSearchConfig, RerankConfig
logger = logging.getLogger(__name__)
def auth_headers(api_key: str) -> dict[str, str]:
"""Build authorization headers when an API key is configured."""
if api_key == "not-needed":
return {}
return {"Authorization": f"Bearer {api_key}"}
def request_embeddings(texts: Sequence[str], config: EbookSearchConfig) -> list[list[float]]:
"""Request embeddings from the configured OpenAI-compatible endpoint."""
try:
response = httpx.post(
f"{config.embedding_base_url.rstrip('/')}/embeddings",
headers=auth_headers(config.embedding_api_key),
json={"model": config.embedding_model, "input": list(texts)},
timeout=60,
)
response.raise_for_status()
return embedding_vectors_from_response(response.json())
except (httpx.HTTPError, ValueError, KeyError, TypeError) as error:
logger.exception(
"ebook_embed_request_failed base_url=%s model=%s count=%s",
config.embedding_base_url,
config.embedding_model,
len(texts),
)
msg = f"Embedding request failed. base_url={config.embedding_base_url} model={config.embedding_model}"
raise RuntimeError(msg) from error
def embedding_vectors_from_response(body: object) -> list[list[float]]:
"""Extract embedding vectors from an OpenAI-compatible embedding response."""
if not isinstance(body, dict):
msg = "Embedding response is not an object"
raise TypeError(msg)
data = body["data"]
if not isinstance(data, list):
msg = "Embedding response data is not a list"
raise TypeError(msg)
vectors: list[list[float]] = []
for item in data:
if not isinstance(item, dict):
msg = "Embedding item is not an object"
raise TypeError(msg)
embedding = item["embedding"]
if not isinstance(embedding, list):
msg = "Embedding value is not a list"
raise TypeError(msg)
vectors.append([float(value) for value in embedding])
return vectors
def request_rerank(
query: str,
documents: Sequence[str],
config: RerankConfig,
) -> object | None:
"""Request rerank scores from the configured vLLM endpoint."""
payload = {
"model": config.model,
"query": query,
"documents": list(documents),
}
response = httpx.post(
f"{config.base_url.rstrip('/')}/rerank",
json=payload,
timeout=config.timeout_seconds,
)
response.raise_for_status()
try:
return response.json()
except ValueError:
logger.debug("ebook_rerank_response_invalid_json", extra={"response": response.text})
return None
def request_chat_completion(
config: EbookSearchConfig,
messages: Sequence[dict[str, str]],
) -> str:
"""Request a chat completion from the configured OpenAI-compatible endpoint."""
try:
response = httpx.post(
f"{config.vllm_base_url.rstrip('/')}/chat/completions",
headers=auth_headers(config.vllm_api_key),
json={
"model": config.chat_model,
"messages": list(messages),
"temperature": 0,
},
timeout=60,
)
response.raise_for_status()
return chat_content_from_response(response.json())
except (httpx.HTTPError, ValueError, KeyError, TypeError) as error:
msg = f"Chat request failed. base_url={config.vllm_base_url} model={config.chat_model}"
raise RuntimeError(msg) from error
def chat_content_from_response(body: object) -> str:
"""Extract text content from an OpenAI-compatible chat response."""
if not isinstance(body, dict):
msg = "Chat response is not an object"
raise TypeError(msg)
choices = body["choices"]
if not isinstance(choices, list) or not choices:
msg = "Chat response has no choices"
raise ValueError(msg)
first = choices[0]
if not isinstance(first, dict):
msg = "Chat choice is not an object"
raise TypeError(msg)
message = first["message"]
if not isinstance(message, dict):
msg = "Chat message is not an object"
raise TypeError(msg)
content = message.get("content") or ""
if not isinstance(content, str):
msg = "Chat content is not text"
raise TypeError(msg)
return content
+129
View File
@@ -0,0 +1,129 @@
"""vLLM-backed optional reranking."""
from __future__ import annotations
import logging
from dataclasses import dataclass, replace
from typing import TYPE_CHECKING
from python.ebook_search.llm_interface import request_rerank
if TYPE_CHECKING:
from python.ebook_search.config import RerankConfig
from python.ebook_search.search import SearchResult
logger = logging.getLogger(__name__)
RERANK_SCORE_WEIGHT = 0.7
HYBRID_SCORE_WEIGHT = 0.3
@dataclass(frozen=True)
class RerankResult:
"""A relevance score for one candidate chunk."""
chunk_id: int
score: float
def rerank_chunks(query: str, candidates: list[SearchResult], config: RerankConfig) -> list[SearchResult]:
"""Rerank candidates with a vLLM rerank endpoint."""
if not candidates:
return []
logger.info(
"ebook_rerank_request_start base_url=%s model=%s candidates=%s",
config.base_url,
config.model,
len(candidates),
)
scores = score_candidates(query, candidates, config)
results = sorted(
(
replace(
result,
score=final_rerank_score(result, scores[result.chunk_id].score, candidates),
rerank_score=scores[result.chunk_id].score,
)
for result in candidates
),
key=lambda result: result.score,
reverse=True,
)
logger.info(
"ebook_rerank_request_complete base_url=%s model=%s candidates=%s",
config.base_url,
config.model,
len(results),
)
return results
def score_candidates(
query: str,
candidates: list[SearchResult],
config: RerankConfig,
) -> dict[int, RerankResult]:
"""Score candidate chunks with the configured rerank API."""
body = request_rerank(query, [candidate.text for candidate in candidates], config)
if body is None:
return zero_rerank_scores(candidates)
scores = parse_vllm_scores(body, candidates)
for result in scores.values():
logger.debug("ebook_rerank_candidate_scored chunk_id=%s score=%s", result.chunk_id, result.score)
return scores
def parse_vllm_scores(body: object, candidates: list[SearchResult]) -> dict[int, RerankResult]:
"""Parse vLLM rerank scores into chunk-id keyed results."""
if not isinstance(body, dict):
logger.debug("ebook_rerank_response_not_object", extra={"response": body})
return zero_rerank_scores(candidates)
results = body.get("results") or body.get("data")
if not isinstance(results, list):
logger.debug("ebook_rerank_response_missing_results", extra={"response": body})
return zero_rerank_scores(candidates)
scores = zero_rerank_scores(candidates)
for item in results:
if not isinstance(item, dict):
continue
index = item.get("index")
score = item.get("relevance_score", item.get("score"))
if not isinstance(index, int) or index < 0 or index >= len(candidates):
continue
if not isinstance(score, int | float):
continue
chunk_id = candidates[index].chunk_id
scores[chunk_id] = RerankResult(chunk_id=chunk_id, score=clamp_score(float(score)))
return scores
def zero_rerank_scores(candidates: list[SearchResult]) -> dict[int, RerankResult]:
"""Return zero relevance scores for all candidate chunks."""
return {candidate.chunk_id: RerankResult(chunk_id=candidate.chunk_id, score=0.0) for candidate in candidates}
def clamp_score(score: float) -> float:
"""Clamp a rerank score into the supported 0.0 to 1.0 range."""
return min(max(score, 0.0), 1.0)
def final_rerank_score(result: SearchResult, rerank_score: float, candidates: list[SearchResult]) -> float:
"""Combine rerank relevance with normalized hybrid retrieval evidence."""
return (RERANK_SCORE_WEIGHT * rerank_score) + (HYBRID_SCORE_WEIGHT * normalized_hybrid_score(result, candidates))
def normalized_hybrid_score(result: SearchResult, candidates: list[SearchResult]) -> float:
"""Normalize a candidate hybrid score against the rerank candidate set."""
hybrid_scores = [
candidate.fused_score if candidate.fused_score is not None else candidate.score for candidate in candidates
]
low = min(hybrid_scores)
high = max(hybrid_scores)
if high == low:
return 1.0
score = result.fused_score if result.fused_score is not None else result.score
return (score - low) / (high - low)
+383
View File
@@ -0,0 +1,383 @@
"""Hybrid search orchestration."""
from __future__ import annotations
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, replace
from typing import TYPE_CHECKING
from pgvector.sqlalchemy import Vector
from sqlalchemy import literal, select
from sqlalchemy.orm import Session
from python.ebook_search.bm25_corpus import (
BM25CorpusUnavailableError,
load_bm25_corpus,
score_bm25_corpus,
)
from python.ebook_search.embeddings import MODEL_DIMENSIONS, embed_query, get_embedding_table
from python.ebook_search.rerank import rerank_chunks
from python.ebook_search.timing import RuntimeStep, timed_result
from python.orm.richie import (
EbookChapter,
EbookChunk,
EbookEmbeddingModel,
EbookSource,
)
if TYPE_CHECKING:
from collections.abc import Mapping
from sqlalchemy.engine import Engine
from python.ebook_search.config import EbookSearchConfig
logger = logging.getLogger(__name__)
BM25_CANDIDATE_LIMIT = 120
@dataclass(frozen=True)
class SearchResult:
"""One source chunk returned by search."""
chunk_id: int
text: str
source_title: str
score: float = 0.0
vector_score: float | None = None
bm25_score: float | None = None
fused_score: float | None = None
rerank_score: float | None = None
source_author: str | None = None
chapter_title: str | None = None
page_label: str | None = None
rank_source: str = "Hybrid"
@dataclass(frozen=True)
class SearchResponse:
"""Search output for the UI."""
query: str
results: list[SearchResult]
rank_label: str
timings: tuple[RuntimeStep, ...] = ()
@property
def total_runtime_ms(self) -> float:
"""Return total measured runtime for the response."""
return sum(step.duration_ms for step in self.timings if step.counts_toward_total)
@dataclass(frozen=True)
class RetrievalResponse:
"""Parallel retrieval output for vector and BM25 candidates."""
vector_results: list[SearchResult]
lexical_results: list[SearchResult]
timings: tuple[RuntimeStep, ...]
def search_ebooks(
engine: Engine,
query: str,
config: EbookSearchConfig,
*,
rerank: bool = False,
) -> SearchResponse:
"""Run hybrid vector/BM25 search and optional reranking."""
if not query.strip():
logger.info("ebook_search_empty_query")
return SearchResponse(query=query, results=[], rank_label="Hybrid")
logger.info("ebook_search_start query_length=%s rerank=%s", len(query), rerank)
timings: list[RuntimeStep] = []
bm25_query, timing = timed_result("BM25 query preparation", retrieval_query_from_text, query)
timings.append(timing)
retrieval, timing = timed_result(
"Hybrid retrieval",
parallel_retrieval,
engine,
query,
bm25_query,
config,
)
timings.extend(retrieval.timings)
timings.append(timing)
fused, timing = timed_result(
"Reciprocal rank fusion",
reciprocal_rank_fusion,
retrieval.vector_results,
retrieval.lexical_results,
)
timings.append(timing)
if config.rerank.enabled and rerank:
response, timing = timed_result("Rerank", apply_rerank, query, fused, config)
else:
response, timing = timed_result("Rerank skipped", skip_rerank, query, fused, config)
timings.append(timing)
response = replace(response, timings=tuple(timings))
logger.info(
"ebook_search_complete vector_candidates=%s lexical_candidates=%s "
"fused_candidates=%s returned=%s rank_label=%s runtime_ms=%.1f",
len(retrieval.vector_results),
len(retrieval.lexical_results),
len(fused),
len(response.results),
response.rank_label,
response.total_runtime_ms,
)
return response
def parallel_retrieval(
engine: Engine,
vector_query: str,
bm25_query: str,
config: EbookSearchConfig,
) -> RetrievalResponse:
"""Run vector and BM25 candidate retrieval concurrently with separate database sessions."""
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="ebook-search") as executor:
vector_future = executor.submit(
timed_result,
"Embedding + vector search",
vector_candidates,
engine,
vector_query,
config,
)
bm25_future = executor.submit(
timed_result,
"BM25 search",
bm25_candidates,
bm25_query,
config,
)
vector_results, vector_timing = vector_future.result()
lexical_results, lexical_timing = bm25_future.result()
logger.info(
"ebook_parallel_retrieval_complete vector_candidates=%s lexical_candidates=%s",
len(vector_results),
len(lexical_results),
)
return RetrievalResponse(
vector_results=vector_results,
lexical_results=lexical_results,
timings=(
replace(vector_timing, counts_toward_total=False),
replace(lexical_timing, counts_toward_total=False),
),
)
def skip_rerank(
query: str,
candidates: list[SearchResult],
config: EbookSearchConfig,
) -> SearchResponse:
"""Return fused hybrid results without reranking."""
logger.info("ebook_rerank_skipped candidates=%s", len(candidates))
return SearchResponse(query=query, results=candidates[: config.top_k], rank_label="Hybrid")
def apply_rerank(
query: str,
candidates: list[SearchResult],
config: EbookSearchConfig,
) -> SearchResponse:
"""Rerank already-fused hybrid candidates."""
reranked = rerank_chunks(query, candidates[: config.rerank.candidates], config.rerank)
logger.info(
"ebook_rerank_complete input_candidates=%s returned=%s",
min(len(candidates), config.rerank.candidates),
len(reranked),
)
return SearchResponse(
query=query,
results=[replace(result, rank_source="Hybrid + rerank") for result in reranked[: config.top_k]],
rank_label="Hybrid + rerank",
)
def vector_candidates(engine: Engine, query: str, config: EbookSearchConfig) -> list[SearchResult]:
"""Return pgvector cosine candidates for a natural-language query."""
with Session(engine) as session:
model = session.scalar(select(EbookEmbeddingModel).where(EbookEmbeddingModel.name == config.embedding_model))
if model is None:
msg = f"Embedding model is not registered: {config.embedding_model}"
raise ValueError(msg)
expected_dimension = MODEL_DIMENSIONS[config.embedding_model]
if model.dimension != expected_dimension:
msg = f"Model row dimension {model.dimension} does not match configured dimension {expected_dimension}"
raise ValueError(msg)
embedding = embed_query(query, config)
limit = max(config.rerank.candidates, config.top_k) * 4
embedding_table = get_embedding_table(model.dimension)
embedding_param = literal(embedding, type_=Vector(model.dimension))
distance = embedding_table.embedding.op("<=>")(embedding_param)
score = (literal(1.0) - distance).label("score")
statement = (
select(
EbookChunk.id.label("chunk_id"),
EbookChunk.text.label("text"),
EbookSource.title.label("source_title"),
EbookSource.author.label("source_author"),
EbookChapter.title.label("chapter_title"),
EbookChunk.page_label.label("page_label"),
score,
)
.select_from(embedding_table)
.join(EbookChunk, EbookChunk.id == embedding_table.chunk_id)
.join(EbookSource, EbookSource.id == EbookChunk.source_id)
.outerjoin(EbookChapter, EbookChapter.id == EbookChunk.chapter_id)
.where(embedding_table.model_id == model.id)
.order_by(distance)
.limit(limit)
)
rows = session.execute(statement).mappings()
results = [search_result_from_row(row) for row in rows]
logger.info(
"ebook_vector_search_complete model=%s dimension=%s candidates=%s",
config.embedding_model,
model.dimension,
len(results),
)
return results
def bm25_candidates(query: str, config: EbookSearchConfig) -> list[SearchResult]:
"""Return BM25-ranked lexical candidates using the persisted corpus."""
try:
corpus = load_bm25_corpus(config)
except BM25CorpusUnavailableError as error:
logger.warning("ebook_bm25_index_unavailable_skipping error=%s", error)
return []
if not corpus.records:
logger.info("ebook_bm25_search_complete corpus=0 candidates=0")
return []
scored_records = score_bm25_corpus(query, corpus, limit=BM25_CANDIDATE_LIMIT)
results = [
replace(search_result_from_row(record), score=score, vector_score=None, bm25_score=score)
for record, score in scored_records
]
max_score = results[0].bm25_score if results else 0.0
logger.info(
"ebook_bm25_search_complete corpus=%s candidates=%s max_score=%.6f",
len(corpus.records),
len(results),
max_score,
)
return results
def reciprocal_rank_fusion(
vector_results: list[SearchResult],
lexical_results: list[SearchResult],
*,
rank_constant: int = 60,
) -> list[SearchResult]:
"""Fuse vector and lexical rankings with Reciprocal Rank Fusion."""
by_chunk: dict[int, SearchResult] = {}
scores: dict[int, float] = {}
vector_scores: dict[int, float] = {}
bm25_scores: dict[int, float] = {}
for rank, result in enumerate(vector_results, start=1):
by_chunk.setdefault(result.chunk_id, result)
vector_scores[result.chunk_id] = result.vector_score if result.vector_score is not None else result.score
scores[result.chunk_id] = scores.get(result.chunk_id, 0.0) + (1 / (rank_constant + rank))
for rank, result in enumerate(lexical_results, start=1):
by_chunk.setdefault(result.chunk_id, result)
bm25_scores[result.chunk_id] = result.bm25_score if result.bm25_score is not None else result.score
scores[result.chunk_id] = scores.get(result.chunk_id, 0.0) + (1 / (rank_constant + rank))
return sorted(
(
replace(
result,
score=scores[result.chunk_id],
vector_score=vector_scores.get(result.chunk_id),
bm25_score=bm25_scores.get(result.chunk_id),
fused_score=scores[result.chunk_id],
rank_source="Hybrid",
)
for result in by_chunk.values()
),
key=lambda result: result.score,
reverse=True,
)
def search_result_from_row(row: Mapping[str, object]) -> SearchResult:
"""Convert a database row mapping into a search result."""
return SearchResult(
chunk_id=int(row["chunk_id"]),
text=str(row["text"]),
source_title=str(row["source_title"]),
source_author=optional_str(row["source_author"]),
chapter_title=optional_str(row["chapter_title"]),
page_label=optional_str(row["page_label"]),
score=float(row["score"]) if "score" in row else 0.0,
vector_score=float(row["score"]) if "score" in row else None,
)
def optional_str(value: object) -> str | None:
"""Convert nullable database values to optional strings."""
if value is None:
return None
return str(value)
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
def tokens(text_value: str) -> list[str]:
"""Extract tokens from a text value.
This is a simple approximation of the tokenization used by PostgreSQL's full-text search,
which is sufficient for BM25 candidate retrieval. It lowercases tokens and includes alphanumeric characters and
underscores.
"""
return [match.group(0).lower() for match in TOKEN_RE.finditer(text_value)]
QUERY_STOP_WORDS = {
"a",
"an",
"and",
"are",
"as",
"at",
"does",
"for",
"in",
"is",
"of",
"the",
"to",
"what",
"when",
"where",
"which",
"who",
"why",
}
def retrieval_query_from_text(query: str) -> str:
"""Remove generic question words while preserving entity and series terms."""
keywords = [token for token in tokens(query) if token not in QUERY_STOP_WORDS]
if not keywords:
return query
return " ".join(keywords)
+36
View File
@@ -0,0 +1,36 @@
"""Runtime timing helpers for EPUB search."""
from __future__ import annotations
from dataclasses import dataclass
from time import perf_counter
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Callable
@dataclass(frozen=True)
class RuntimeStep:
"""Elapsed runtime for one named search step."""
name: str
duration_ms: float
counts_toward_total: bool = True
def runtime_step_from_start(name: str, start_seconds: float) -> RuntimeStep:
"""Create a runtime step from a prior perf_counter timestamp."""
return RuntimeStep(name=name, duration_ms=(perf_counter() - start_seconds) * 1000)
def timed_result[T, **P](
name: str,
operation: Callable[P, T],
*args: P.args,
**kwargs: P.kwargs,
) -> tuple[T, RuntimeStep]:
"""Run an operation and return its result plus elapsed runtime."""
start_seconds = perf_counter()
result = operation(*args, **kwargs)
return result, runtime_step_from_start(name, start_seconds)
+6
View File
@@ -0,0 +1,6 @@
"""Reusable FastAPI tools."""
from python.fastapi_tools.db import DbSession, get_db
from python.fastapi_tools.zstd_middleware import ZstdMiddleware
__all__ = ["DbSession", "ZstdMiddleware", "get_db"]
@@ -1,11 +1,15 @@
"""FastAPI dependencies."""
from collections.abc import Iterator
from typing import Annotated
from __future__ import annotations
from typing import TYPE_CHECKING, Annotated
from fastapi import Depends, Request
from sqlalchemy.orm import Session
if TYPE_CHECKING:
from collections.abc import Iterator
def get_db(request: Request) -> Iterator[Session]:
"""Get database session from app state."""
@@ -1,10 +1,14 @@
"""Middleware for the FastAPI application."""
"""Zstd response compression middleware."""
from compression import zstd
from typing import TYPE_CHECKING
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.requests import Request
from starlette.responses import Response
if TYPE_CHECKING:
from starlette.requests import Request
MINIMUM_RESPONSE_SIZE = 500
+347
View File
@@ -0,0 +1,347 @@
"""Small Gitea API client for repository automation."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Self
from urllib.parse import quote
import httpx
DEFAULT_PAGE_SIZE = 100
EXPECTED_NO_CONTENT = 204
EXPECTED_CREATED = 201
EXPECTED_OK = 200
@dataclass(frozen=True)
class CreatedIssue:
"""Issue data returned by Gitea."""
number: int | None
html_url: str | None
title: str
@dataclass(frozen=True)
class PullRequest:
"""Pull request data returned by Gitea."""
number: int
title: str
html_url: str | None
labels: tuple[str, ...]
head_branch: str | None
base_branch: str | None
@dataclass(frozen=True)
class WorkflowJob:
"""Workflow job data returned by Gitea Actions."""
id: int
name: str
run_id: int | None
status: str | None
conclusion: str | None
class GiteaError(RuntimeError):
"""Raised when Gitea rejects an API request."""
def split_repo_name(repo: str) -> tuple[str, str]:
"""Split an owner/repo string into its parts."""
owner, separator, repo_name = repo.partition("/")
if not separator or not owner or not repo_name:
msg = f"Invalid repository name: {repo}"
raise ValueError(msg)
return owner, repo_name
class GiteaClient:
"""HTTP client for the subset of Gitea APIs used in this repository."""
def __init__(
self,
*,
base_url: str,
token: str,
timeout: int = 30,
transport: httpx.BaseTransport | None = None,
) -> None:
"""Initialize the Gitea client."""
self._client = httpx.Client(
base_url=base_url.rstrip("/"),
timeout=timeout,
headers={"Authorization": f"token {token}"},
transport=transport,
)
def create_issue(
self,
*,
owner: str,
repo: str,
title: str,
body: str,
labels: list[int] | None = None,
) -> CreatedIssue:
"""Create a Gitea issue."""
payload: dict[str, object] = {"title": title, "body": body, "labels": labels or []}
response = self._request(
"POST",
f"/api/v1/repos/{owner}/{repo}/issues",
expected_statuses={EXPECTED_CREATED},
json=payload,
)
data = response.json()
return CreatedIssue(
number=_optional_int(data.get("number")),
html_url=_optional_str(data.get("html_url")),
title=str(data.get("title", title)),
)
def resolve_label_ids(self, *, owner: str, repo: str, labels: list[str]) -> list[int]:
"""Resolve label names to Gitea label IDs."""
if not labels:
return []
available_labels: dict[str, int] = {}
page = 1
while True:
response = self._request(
"GET",
f"/api/v1/repos/{owner}/{repo}/labels",
params={"page": page, "limit": DEFAULT_PAGE_SIZE},
)
batch = response.json()
if not batch:
break
for label in batch:
label_name = str(label.get("name", ""))
label_id = _optional_int(label.get("id"))
if label_name and label_id is not None:
available_labels[label_name] = label_id
if len(batch) < DEFAULT_PAGE_SIZE:
break
page += 1
missing = [label for label in labels if label not in available_labels]
if missing:
missing_names = ", ".join(sorted(missing))
msg = f"Missing Gitea labels: {missing_names}"
raise GiteaError(msg)
return [available_labels[label] for label in labels]
def list_open_pull_requests(
self,
*,
owner: str,
repo: str,
labels: list[str] | None = None,
head: str | None = None,
) -> list[PullRequest]:
"""List open pull requests for a repository."""
expected_labels = set(labels or [])
pull_requests: list[PullRequest] = []
page = 1
while True:
response = self._request(
"GET",
f"/api/v1/repos/{owner}/{repo}/pulls",
params={"state": "open", "page": page, "limit": DEFAULT_PAGE_SIZE},
)
batch = response.json()
if not batch:
break
for item in batch:
pull_request = _pull_request_from_api(item)
if head and pull_request.head_branch != head:
continue
if expected_labels and not expected_labels.issubset(set(pull_request.labels)):
continue
pull_requests.append(pull_request)
if len(batch) < DEFAULT_PAGE_SIZE:
break
page += 1
return pull_requests
def create_pull_request(
self,
*,
owner: str,
repo: str,
title: str,
body: str,
head: str,
base: str,
labels: list[str] | None = None,
) -> PullRequest:
"""Create a pull request."""
payload: dict[str, object] = {
"title": title,
"body": body,
"head": head,
"base": base,
}
if labels:
payload["labels"] = self.resolve_label_ids(owner=owner, repo=repo, labels=labels)
response = self._request(
"POST",
f"/api/v1/repos/{owner}/{repo}/pulls",
expected_statuses={EXPECTED_CREATED},
json=payload,
)
return _pull_request_from_api(response.json())
def merge_pull_request(
self,
*,
owner: str,
repo: str,
number: int,
merge_method: str = "rebase",
head_commit_id: str | None = None,
delete_branch_after_merge: bool = False,
) -> None:
"""Merge a pull request."""
payload: dict[str, object] = {
"Do": merge_method,
"delete_branch_after_merge": delete_branch_after_merge,
}
if head_commit_id:
payload["head_commit_id"] = head_commit_id
self._request(
"POST",
f"/api/v1/repos/{owner}/{repo}/pulls/{number}/merge",
json=payload,
)
def dispatch_workflow(self, *, owner: str, repo: str, workflow_id: str, ref: str) -> None:
"""Trigger a workflow_dispatch run."""
workflow_path = quote(workflow_id, safe="")
self._request(
"POST",
f"/api/v1/repos/{owner}/{repo}/actions/workflows/{workflow_path}/dispatches",
expected_statuses={EXPECTED_OK, EXPECTED_NO_CONTENT},
json={"ref": ref},
)
def list_run_jobs(self, *, owner: str, repo: str, run_id: str | int) -> list[WorkflowJob]:
"""List workflow jobs for a specific run."""
jobs: list[WorkflowJob] = []
page = 1
while True:
response = self._request(
"GET",
f"/api/v1/repos/{owner}/{repo}/actions/jobs",
params={"page": page, "limit": DEFAULT_PAGE_SIZE},
)
payload = response.json()
batch = payload.get("jobs", [])
if not batch:
break
for item in batch:
if str(item.get("run_id")) != str(run_id):
continue
jobs.append(_workflow_job_from_api(item))
if len(batch) < DEFAULT_PAGE_SIZE:
break
page += 1
return jobs
def download_job_logs(self, *, owner: str, repo: str, job_id: int) -> str:
"""Download logs for a workflow job."""
response = self._request(
"GET",
f"/api/v1/repos/{owner}/{repo}/actions/jobs/{job_id}/logs",
)
return response.text
def close(self) -> None:
"""Close the underlying HTTP client."""
self._client.close()
def __enter__(self) -> Self:
"""Enter the context manager."""
return self
def __exit__(self, *args: object) -> None:
"""Close the HTTP client."""
self.close()
def _request(
self,
method: str,
path: str,
*,
expected_statuses: set[int] | None = None,
**kwargs: object,
) -> httpx.Response:
"""Send an HTTP request and validate the response status."""
response = self._client.request(method, path, **kwargs)
statuses = expected_statuses or {EXPECTED_OK}
if response.status_code not in statuses:
msg = f"Gitea request failed ({response.status_code}): {response.text}"
raise GiteaError(msg)
return response
def _pull_request_from_api(data: dict[str, object]) -> PullRequest:
"""Convert Gitea API pull-request data into a dataclass."""
number = _optional_int(data.get("number")) or _optional_int(data.get("index"))
if number is None:
msg = "Gitea pull request payload is missing a number"
raise GiteaError(msg)
labels = tuple(str(label.get("name", "")) for label in data.get("labels", []))
head = data.get("head", {})
base = data.get("base", {})
return PullRequest(
number=number,
title=str(data.get("title", "")),
html_url=_optional_str(data.get("html_url")),
labels=tuple(label for label in labels if label),
head_branch=_optional_str(head.get("ref")) or _optional_str(data.get("head_branch")),
base_branch=_optional_str(base.get("ref")) or _optional_str(data.get("base_branch")),
)
def _workflow_job_from_api(data: dict[str, object]) -> WorkflowJob:
"""Convert Gitea API workflow-job data into a dataclass."""
job_id = _optional_int(data.get("id"))
if job_id is None:
msg = "Gitea workflow job payload is missing an ID"
raise GiteaError(msg)
return WorkflowJob(
id=job_id,
name=str(data.get("name", "")),
run_id=_optional_int(data.get("run_id")),
status=_optional_str(data.get("status")),
conclusion=_optional_str(data.get("conclusion")),
)
def _optional_int(value: object) -> int | None:
"""Convert an API value to an integer when present."""
if value is None:
return None
return int(value)
def _optional_str(value: object) -> str | None:
"""Convert an API value to a string when present."""
if value is None:
return None
return str(value)
+148
View File
@@ -0,0 +1,148 @@
"""Automation helpers for flake.lock pull requests on Gitea."""
from __future__ import annotations
import subprocess
from os import getenv
from typing import Annotated
import typer
from python.gitea import GiteaClient, PullRequest, split_repo_name
DEFAULT_BASE_BRANCH = "main"
DEFAULT_BRANCH = "automation/update-flake-lock"
DEFAULT_GITEA_URL = "https://gitea.tmmworkshop.com"
PR_LABELS = ["dependencies", "automated", "flake_lock_update"]
PR_CHECK_WORKFLOWS = ["build_systems.yml", "treefmt.yml", "pytest.yml"]
PR_TITLE = "Update flake.lock"
PR_BODY = "Automated flake.lock update."
app = typer.Typer(add_completion=False)
def run_cmd(cmd: list[str], *, check: bool = True) -> subprocess.CompletedProcess[str]:
"""Run a subprocess command."""
return subprocess.run(cmd, capture_output=True, text=True, check=check)
def ensure_flake_lock_pull_request(
client: GiteaClient,
*,
owner: str,
repo: str,
branch: str,
base: str,
) -> PullRequest:
"""Return an existing flake.lock PR for the branch or create one."""
pull_requests = client.list_open_pull_requests(owner=owner, repo=repo, head=branch)
if pull_requests:
return pull_requests[0]
return client.create_pull_request(
owner=owner,
repo=repo,
title=PR_TITLE,
body=PR_BODY,
head=branch,
base=base,
labels=PR_LABELS,
)
def find_flake_lock_pull_request(client: GiteaClient, *, owner: str, repo: str) -> PullRequest | None:
"""Find the first open flake.lock pull request."""
pull_requests = client.list_open_pull_requests(owner=owner, repo=repo, labels=["flake_lock_update"])
if not pull_requests:
return None
return pull_requests[0]
def dispatch_pull_request_checks(client: GiteaClient, *, owner: str, repo: str, branch: str) -> None:
"""Dispatch the workflows that normally run for pull requests."""
for workflow in PR_CHECK_WORKFLOWS:
client.dispatch_workflow(owner=owner, repo=repo, workflow_id=workflow, ref=branch)
def has_worktree_changes() -> bool:
"""Return whether `flake.lock` has worktree changes."""
result = run_cmd(["git", "diff", "--quiet", "--", "flake.lock"], check=False)
return result.returncode != 0
def commit_flake_lock_update(*, branch: str) -> None:
"""Commit the updated lock file to the automation branch."""
run_cmd(["git", "config", "user.name", "gitea-actions[bot]"])
run_cmd(["git", "config", "user.email", "gitea-actions@tmmworkshop.com"])
run_cmd(["git", "checkout", "-B", branch])
run_cmd(["git", "add", "flake.lock"])
run_cmd(["git", "commit", "-m", "chore: update flake.lock"])
def push_branch(*, branch: str) -> None:
"""Push the automation branch to origin."""
run_cmd(["git", "push", "origin", f"HEAD:{branch}", "--force"])
def _required_gitea_token() -> str:
"""Read the required Gitea token from the environment."""
token = getenv("GITEA_TOKEN")
if token:
return token
msg = "GITEA_TOKEN environment variable is required"
raise RuntimeError(msg)
@app.command()
def update(
repo: Annotated[str, typer.Option("--repo", help="Gitea repository in owner/repo form")],
base: Annotated[str, typer.Option("--base", help="Base branch")] = DEFAULT_BASE_BRANCH,
branch: Annotated[str, typer.Option("--branch", help="Automation branch")] = DEFAULT_BRANCH,
) -> None:
"""Commit flake.lock changes and ensure a pull request exists."""
if not has_worktree_changes():
typer.echo("No flake.lock changes detected")
return
commit_flake_lock_update(branch=branch)
push_branch(branch=branch)
owner, repo_name = split_repo_name(repo)
with GiteaClient(
base_url=getenv("GITEA_URL", DEFAULT_GITEA_URL),
token=_required_gitea_token(),
) as client:
pull_request = ensure_flake_lock_pull_request(
client,
owner=owner,
repo=repo_name,
branch=branch,
base=base,
)
# We can remove this if Gitea fixes the following issue:
# https://github.com/go-gitea/gitea/issues/33963
dispatch_pull_request_checks(client, owner=owner, repo=repo_name, branch=branch)
typer.echo(pull_request.html_url or f"Pull request #{pull_request.number}")
@app.command()
def merge(
repo: Annotated[str, typer.Option("--repo", help="Gitea repository in owner/repo form")],
) -> None:
"""Merge the first open flake.lock pull request."""
owner, repo_name = split_repo_name(repo)
with GiteaClient(
base_url=getenv("GITEA_URL", DEFAULT_GITEA_URL),
token=_required_gitea_token(),
) as client:
pull_request = find_flake_lock_pull_request(client, owner=owner, repo=repo_name)
if not pull_request:
typer.echo("No open PR found with label flake_lock_update")
return
client.merge_pull_request(owner=owner, repo=repo_name, number=pull_request.number, merge_method="rebase")
typer.echo(f"Merged PR #{pull_request.number}")
if __name__ == "__main__":
app()
+6 -2
View File
@@ -1,9 +1,10 @@
"""FastAPI heater control service."""
from __future__ import annotations
import logging
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import Annotated
from typing import TYPE_CHECKING, Annotated
import typer
import uvicorn
@@ -13,6 +14,9 @@ from python.common import configure_logger
from python.heater.controller import HeaterController
from python.heater.models import ActionResult, DeviceConfig, HeaterStatus
if TYPE_CHECKING:
from collections.abc import AsyncIterator
logger = logging.getLogger(__name__)
+1
View File
@@ -262,6 +262,7 @@ def installer(
):
run(command, check=True, stdin=test.stdout)
# Fixed mount point for the new system; the installer runs as root on a fresh disk
mnt_dir = "/tmp/nix_install" # noqa: S108
Path(mnt_dir).mkdir(parents=True, exist_ok=True)
@@ -16,9 +16,13 @@ from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Sequence
logger = logging.getLogger(__name__)
ESCAPE_KEY = 27
def configure_logger(level: str = "INFO") -> None:
"""Configure the logger.
Args:
level (str, optional): The logging level. Defaults to "INFO".
"""
@@ -32,15 +36,17 @@ def configure_logger(level: str = "INFO") -> None:
def bash_wrapper(command: str) -> str:
"""Execute a bash command and capture the output.
Args:
command (str): The bash command to be executed.
Returns:
Tuple[str, int]: A tuple containing the output of the command (stdout) as a string,
the error output (stderr) as a string (optional), and the return code as an integer.
"""
logging.debug(f"running {command=}")
logger.debug(f"running {command=}")
# This is a acceptable risk
process = Popen(command.split(), stdout=PIPE, stderr=PIPE) # noqa: S603
process = Popen(command.split(), stdout=PIPE, stderr=PIPE)
output, _ = process.communicate()
if process.returncode != 0:
error = f"Failed to run command {command=} return code {process.returncode=}"
@@ -51,6 +57,7 @@ def bash_wrapper(command: str) -> str:
def partition_disk(disk: str, swap_size: int, reserve: int = 0) -> None:
"""Partition a disk.
Args:
disk (str): The disk to partition.
swap_size (int): The size of the swap partition in GB.
@@ -58,7 +65,7 @@ def partition_disk(disk: str, swap_size: int, reserve: int = 0) -> None:
reserve (int, optional): The size of the reserve partition in GB. Defaults to 0.
minimum value is 0.
"""
logging.info(f"partitioning {disk=}")
logger.info(f"partitioning {disk=}")
swap_size = max(swap_size, 1)
reserve = max(reserve, 0)
@@ -66,16 +73,16 @@ def partition_disk(disk: str, swap_size: int, reserve: int = 0) -> None:
if reserve > 0:
msg = f"Creating swap partition on {disk=} with size {swap_size=}GiB and reserve {reserve=}GiB"
logging.info(msg)
logger.info(msg)
swap_start = swap_size + reserve
swap_partition = f"mkpart swap -{swap_start}GiB -{reserve}GiB "
else:
logging.info(f"Creating swap partition on {disk=} with size {swap_size=}GiB")
logger.info(f"Creating swap partition on {disk=} with size {swap_size=}GiB")
swap_start = swap_size
swap_partition = f"mkpart swap -{swap_start}GiB 100% "
logging.debug(f"{swap_partition=}")
logger.debug(f"{swap_partition=}")
create_partitions = (
f"parted --script --align=optimal {disk} -- "
@@ -87,13 +94,14 @@ def partition_disk(disk: str, swap_size: int, reserve: int = 0) -> None:
)
bash_wrapper(create_partitions)
logging.info(f"{disk=} successfully partitioned")
logger.info(f"{disk=} successfully partitioned")
def create_zfs_pool(pool_disks: Sequence[str], mnt_dir: str) -> None:
"""Create a ZFS pool.
Args:
disks (Sequence[str]): A tuple of disks to use for the pool.
pool_disks (Sequence[str]): A tuple of disks to use for the pool.
mnt_dir (str): The mount directory.
"""
if len(pool_disks) <= 0:
@@ -125,13 +133,12 @@ def create_zfs_pool(pool_disks: Sequence[str], mnt_dir: str) -> None:
bash_wrapper(zpool_create)
zpools = bash_wrapper("zpool list -o name")
if "root_pool" not in zpools.splitlines():
logging.critical("Failed to create root_pool")
logger.critical("Failed to create root_pool")
sys.exit(1)
def create_zfs_datasets() -> None:
"""Create ZFS datasets."""
bash_wrapper("zfs create -o canmount=noauto -o reservation=10G root_pool/root")
bash_wrapper("zfs create root_pool/home")
bash_wrapper("zfs create root_pool/var -o reservation=1G")
@@ -146,7 +153,7 @@ def create_zfs_datasets() -> None:
}
missing_datasets = expected_datasets.difference(datasets.splitlines())
if missing_datasets:
logging.critical(f"Failed to create pools {missing_datasets}")
logger.critical(f"Failed to create pools {missing_datasets}")
sys.exit(1)
@@ -159,6 +166,8 @@ def get_cpu_manufacturer() -> str:
for line in output.splitlines():
if "vendor_id" in line:
return id_vendor[line.split(": ")[1].strip()]
error = "Failed to get CPU manufacturer"
raise RuntimeError(error)
def get_boot_drive_id(disk: str) -> str:
@@ -167,9 +176,8 @@ def get_boot_drive_id(disk: str) -> str:
return output.splitlines()[1]
def create_nix_hardware_file(mnt_dir: str, disks: Sequence[str], encrypt: bool) -> None:
def create_nix_hardware_file(mnt_dir: str, disks: Sequence[str], *, encrypt: bool) -> None:
"""Create a NixOS hardware file."""
cpu_manufacturer = get_cpu_manufacturer()
devices = ""
@@ -193,7 +201,15 @@ def create_nix_hardware_file(mnt_dir: str, disks: Sequence[str], encrypt: bool)
' imports = [ (modulesPath + "/installer/scan/not-detected.nix") ];\n\n'
" boot = {\n"
" initrd = {\n"
' availableKernelModules = [ \n "ahci"\n "ehci_pci"\n "nvme"\n "sd_mod"\n "usb_storage"\n "usbhid"\n "xhci_pci"\n ];\n'
" availableKernelModules = [ \n"
' "ahci"\n'
' "ehci_pci"\n'
' "nvme"\n'
' "sd_mod"\n'
' "usb_storage"\n'
' "usbhid"\n'
' "xhci_pci"\n'
" ];\n"
" kernelModules = [ ];\n"
f" {devices}"
" };\n"
@@ -207,11 +223,18 @@ def create_nix_hardware_file(mnt_dir: str, disks: Sequence[str], encrypt: bool)
' "/nix" = {\n device = "root_pool/nix";\n fsType = "zfs";\n };\n\n'
' "/boot" = {\n'
f' device = "/dev/disk/by-uuid/{get_boot_drive_id(disks[0])}";\n'
' fsType = "vfat";\n options = [\n "fmask=0077"\n "dmask=0077"\n ];\n };\n };\n\n'
' fsType = "vfat";\n'
" options = [\n"
' "fmask=0077"\n'
' "dmask=0077"\n'
" ];\n"
" };\n"
" };\n\n"
" swapDevices = [ ];\n\n"
" networking.useDHCP = lib.mkDefault true;\n\n"
' nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";\n'
f" hardware.cpu.{cpu_manufacturer}.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;\n"
f" hardware.cpu.{cpu_manufacturer}.updateMicrocode = lib.mkDefault "
"config.hardware.enableRedistributableFirmware;\n"
f' networking.hostId = "{host_id}";\n'
"}\n"
)
@@ -219,7 +242,7 @@ def create_nix_hardware_file(mnt_dir: str, disks: Sequence[str], encrypt: bool)
Path(f"{mnt_dir}/etc/nixos/hardware-configuration.nix").write_text(nix_hardware)
def install_nixos(mnt_dir: str, disks: Sequence[str], encrypt: bool) -> None:
def install_nixos(mnt_dir: str, disks: Sequence[str], *, encrypt: bool) -> None:
"""Install NixOS."""
bash_wrapper(f"mount -o X-mount.mkdir -t zfs root_pool/root {mnt_dir}")
bash_wrapper(f"mount -o X-mount.mkdir -t zfs root_pool/home {mnt_dir}/home")
@@ -230,14 +253,16 @@ def install_nixos(mnt_dir: str, disks: Sequence[str], encrypt: bool) -> None:
bash_wrapper(f"mkfs.vfat -n EFI {disk}-part1")
# set up mirroring afterwards if more than one disk
boot_partition = f"mount -t vfat -o fmask=0077,dmask=0077,iocharset=iso8859-1,X-mount.mkdir {disks[0]}-part1 {mnt_dir}/boot"
boot_partition = (
f"mount -t vfat -o fmask=0077,dmask=0077,iocharset=iso8859-1,X-mount.mkdir {disks[0]}-part1 {mnt_dir}/boot"
)
bash_wrapper(boot_partition)
bash_wrapper(f"nixos-generate-config --root {mnt_dir}")
create_nix_hardware_file(mnt_dir, disks, encrypt)
create_nix_hardware_file(mnt_dir, disks, encrypt=encrypt)
run(("nixos-install", "--root", mnt_dir), check=True) # noqa: S603
run(("nixos-install", "--root", mnt_dir), check=True)
def installer(
@@ -247,27 +272,38 @@ def installer(
encrypt_key: str | None,
) -> None:
"""Main."""
logging.info("Starting installation")
logger.info("Starting installation")
for disk in disks:
partition_disk(disk, swap_size, reserve)
if encrypt_key:
sleep(1)
for command in (
f'printf "{encrypt_key}" | cryptsetup luksFormat --type luks2 {disk}-part2 -',
f'printf "{encrypt_key}" | cryptsetup luksOpen {disk}-part2 luks-root-pool-{disk.split("/")[-1]}-part2 -',
):
run(command, shell=True, check=True)
key_input = encrypt_key.encode()
run(
("cryptsetup", "luksFormat", "--type", "luks2", f"{disk}-part2", "-"),
input=key_input,
check=True,
)
run(
(
"cryptsetup",
"luksOpen",
f"{disk}-part2",
f"luks-root-pool-{disk.split('/')[-1]}-part2",
"-",
),
input=key_input,
check=True,
)
# Fixed mount point for the new system; the installer runs as root on a fresh disk
mnt_dir = "/tmp/nix_install" # noqa: S108
Path(mnt_dir).mkdir(parents=True, exist_ok=True)
if encrypt_key:
pool_disks = [
f"/dev/mapper/luks-root-pool-{disk.split('/')[-1]}-part2" for disk in disks
]
pool_disks = [f"/dev/mapper/luks-root-pool-{disk.split('/')[-1]}-part2" for disk in disks]
else:
pool_disks = [f"{disk}-part2" for disk in disks]
@@ -275,57 +311,73 @@ def installer(
create_zfs_datasets()
install_nixos(mnt_dir, disks, encrypt_key)
install_nixos(mnt_dir, disks, encrypt=bool(encrypt_key))
logging.info("Installation complete")
logger.info("Installation complete")
class Cursor:
def __init__(self):
"""Track cursor position and constrain movement to screen bounds."""
def __init__(self) -> None:
"""Initialize cursor position and screen dimensions."""
self.x_position = 0
self.y_position = 0
self.height = 0
self.width = 0
def set_height(self, height: int):
def set_height(self, height: int) -> None:
"""Set the maximum screen height."""
self.height = height
def set_width(self, width: int):
def set_width(self, width: int) -> None:
"""Set the maximum screen width."""
self.width = width
def x_bounce_check(self, cursor: int) -> int:
"""Clamp an x position to the screen width."""
cursor = max(0, cursor)
return min(self.width - 1, cursor)
def y_bounce_check(self, cursor: int) -> int:
"""Clamp a y position to the screen height."""
cursor = max(0, cursor)
return min(self.height - 1, cursor)
def set_x(self, x: int):
def set_x(self, x: int) -> None:
"""Set the cursor x position."""
self.x_position = self.x_bounce_check(x)
def set_y(self, y: int):
def set_y(self, y: int) -> None:
"""Set the cursor y position."""
self.y_position = self.y_bounce_check(y)
def get_x(self) -> int:
"""Get the cursor x position."""
return self.x_position
def get_y(self) -> int:
"""Get the cursor y position."""
return self.y_position
def move_up(self):
def move_up(self) -> None:
"""Move the cursor up one row."""
self.set_y(self.y_position - 1)
def move_down(self):
def move_down(self) -> None:
"""Move the cursor down one row."""
self.set_y(self.y_position + 1)
def move_left(self):
def move_left(self) -> None:
"""Move the cursor left one column."""
self.set_x(self.x_position - 1)
def move_right(self):
def move_right(self) -> None:
"""Move the cursor right one column."""
self.set_x(self.x_position + 1)
def navigation(self, key: int) -> None:
"""Move the cursor for a curses navigation key."""
action = {
curses.KEY_DOWN: self.move_down,
curses.KEY_UP: self.move_up,
@@ -339,7 +391,8 @@ class Cursor:
class State:
"""State class to store the state of the program."""
def __init__(self):
def __init__(self) -> None:
"""Initialize installer menu state."""
self.key = 0
self.cursor = Cursor()
@@ -357,11 +410,9 @@ class State:
def get_device(raw_device: str) -> dict[str, str]:
"""Parse an lsblk key-value device row."""
raw_device_components = raw_device.split(" ")
return {
thing.split("=")[0].lower(): thing.split("=")[1].strip('"')
for thing in raw_device_components
}
return {thing.split("=")[0].lower(): thing.split("=")[1].strip('"') for thing in raw_device_components}
def get_devices() -> list[dict[str, str]]:
@@ -373,6 +424,7 @@ def get_devices() -> list[dict[str, str]]:
def get_device_id_mapping() -> dict[str, set[str]]:
"""Get a list of device ids.
Returns:
list[str]: the list of device ids
"""
@@ -387,9 +439,8 @@ def get_device_id_mapping() -> dict[str, set[str]]:
return device_id_mapping
def calculate_device_menu_padding(
devices: list[dict[str, str]], column: str, padding: int = 0
) -> int:
def calculate_device_menu_padding(devices: list[dict[str, str]], column: str, padding: int = 0) -> int:
"""Calculate the width needed for a device menu column."""
return max(len(device[column]) for device in devices) + padding
@@ -401,6 +452,7 @@ def draw_device_ids(
menu_width: list[int],
device_ids: set[str],
) -> tuple[State, int]:
"""Draw selectable device IDs for a device row."""
for device_id in sorted(device_ids):
row_number = row_number + 1
if row_number == state.cursor.get_y() and state.cursor.get_x() in menu_width:
@@ -429,8 +481,9 @@ def draw_device_menu(
state: State,
menu_start_y: int = 0,
menu_start_x: int = 0,
) -> State:
"""draw the device menu and handle user input
) -> tuple[State, int]:
"""Draw the device menu and handle user input.
Args:
std_screen (curses.window): the curses window to draw on
devices (list[dict[str, str]]): the list of devices to draw
@@ -438,6 +491,7 @@ def draw_device_menu(
state (State): the state object to update
menu_start_y (int, optional): the y position to start drawing the menu. Defaults to 0.
menu_start_x (int, optional): the x position to start drawing the menu. Defaults to 0.
Returns:
State: the updated state object
"""
@@ -448,7 +502,9 @@ def draw_device_menu(
type_padding = calculate_device_menu_padding(devices, "type", padding)
mountpoints_padding = calculate_device_menu_padding(devices, "mountpoints", padding)
device_header = f"{'Name':{name_padding}}{'Size':{size_padding}}{'Type':{type_padding}}{'Mountpoints':{mountpoints_padding}}"
device_header = (
f"{'Name':{name_padding}}{'Size':{size_padding}}{'Type':{type_padding}}{'Mountpoints':{mountpoints_padding}}"
)
menu_width = range(menu_start_x, len(device_header) + menu_start_x)
@@ -481,8 +537,9 @@ def draw_device_menu(
def debug_menu(std_screen: curses.window, key: int) -> None:
"""Draw debug information for the current curses screen."""
height, width = std_screen.getmaxyx()
width_height = "Width: {}, Height: {}".format(width, height)
width_height = f"Width: {width}, Height: {height}"
std_screen.addstr(height - 4, 0, width_height, curses.color_pair(5))
key_pressed = f"Last key pressed: {key}"[: width - 1]
@@ -490,7 +547,7 @@ def debug_menu(std_screen: curses.window, key: int) -> None:
key_pressed = "No key press detected..."[: width - 1]
std_screen.addstr(height - 3, 0, key_pressed)
for i in range(0, 8):
for i in range(8):
std_screen.addstr(height - 2, i * 3, f"{i}██", curses.color_pair(i))
@@ -500,12 +557,11 @@ def status_bar(
width: int,
height: int,
) -> None:
"""Draw the footer status bar."""
std_screen.attron(curses.A_REVERSE)
std_screen.attron(curses.color_pair(3))
status_bar = (
f"Press 'q' to exit | STATUS BAR | Pos: {cursor.get_x()}, {cursor.get_y()}"
)
status_bar = f"Press 'q' to exit | STATUS BAR | Pos: {cursor.get_x()}, {cursor.get_y()}"
std_screen.addstr(height - 1, 0, status_bar)
std_screen.addstr(height - 1, len(status_bar), " " * (width - len(status_bar) - 1))
@@ -514,13 +570,15 @@ def status_bar(
def set_color() -> None:
"""Initialize curses color pairs."""
curses.start_color()
curses.use_default_colors()
for i in range(0, curses.COLORS):
for i in range(curses.COLORS):
curses.init_pair(i + 1, i, -1)
def get_text_input(std_screen: curses.window, prompt: str, y: int, x: int) -> str:
"""Read text input from a curses screen."""
curses.echo()
std_screen.addstr(y, x, prompt)
input_str = ""
@@ -528,10 +586,10 @@ def get_text_input(std_screen: curses.window, prompt: str, y: int, x: int) -> st
key = std_screen.getch()
if key == ord("\n"):
break
elif key == 27: # ESC key
if key == ESCAPE_KEY:
input_str = ""
break
elif key in (curses.KEY_BACKSPACE, ord("\b"), 127):
if key in (curses.KEY_BACKSPACE, ord("\b"), 127):
input_str = input_str[:-1]
std_screen.addstr(y, x + len(prompt), input_str + " ")
else:
@@ -546,6 +604,7 @@ def swap_size_input(
state: State,
swap_offset: int,
) -> State:
"""Handle swap size input."""
swap_size_text = "Swap size (GB): "
std_screen.addstr(swap_offset, 0, f"{swap_size_text}{state.swap_size}")
if state.key == ord("\n") and state.cursor.get_y() == swap_offset:
@@ -557,9 +616,7 @@ def swap_size_input(
state.swap_size = int(swap_size_str)
state.show_swap_input = False
except ValueError:
std_screen.addstr(
swap_offset, 0, "Invalid input. Press any key to continue."
)
std_screen.addstr(swap_offset, 0, "Invalid input. Press any key to continue.")
std_screen.getch()
state.show_swap_input = False
@@ -571,22 +628,19 @@ def reserve_size_input(
state: State,
reserve_offset: int,
) -> State:
"""Handle reserve size input."""
reserve_size_text = "reserve size (GB): "
std_screen.addstr(reserve_offset, 0, f"{reserve_size_text}{state.reserve_size}")
if state.key == ord("\n") and state.cursor.get_y() == reserve_offset:
state.show_reserve_input = True
if state.show_reserve_input:
reserve_size_str = get_text_input(
std_screen, reserve_size_text, reserve_offset, 0
)
reserve_size_str = get_text_input(std_screen, reserve_size_text, reserve_offset, 0)
try:
state.reserve_size = int(reserve_size_str)
state.show_reserve_input = False
except ValueError:
std_screen.addstr(
reserve_offset, 0, "Invalid input. Press any key to continue."
)
std_screen.addstr(reserve_offset, 0, "Invalid input. Press any key to continue.")
std_screen.getch()
state.show_reserve_input = False
@@ -594,9 +648,11 @@ def reserve_size_input(
def draw_menu(std_screen: curses.window) -> State:
"""draw the menu and handle user input
"""Draw the menu and handle user input.
Args:
std_screen (curses.window): the curses window to draw on
Returns:
State: the state object
"""
@@ -656,17 +712,18 @@ def draw_menu(std_screen: curses.window) -> State:
def main() -> None:
"""Run the installer menu and start installation."""
configure_logger("DEBUG")
state = curses.wrapper(draw_menu)
encrypt_key = getenv("ENCRYPT_KEY")
logging.info("installing_nixos")
logging.info(f"disks: {state.selected_device_ids}")
logging.info(f"swap_size: {state.swap_size}")
logging.info(f"reserve: {state.reserve_size}")
logging.info(f"encrypted: {bool(encrypt_key)}")
logger.info("installing_nixos")
logger.info(f"disks: {state.selected_device_ids}")
logger.info(f"swap_size: {state.swap_size}")
logger.info(f"reserve: {state.reserve_size}")
logger.info(f"encrypted: {bool(encrypt_key)}")
sleep(3)
-6
View File
@@ -1,13 +1,7 @@
"""ORM package exports."""
from python.orm.data_science_dev.base import DataScienceDevBase
from python.orm.richie.base import RichieBase
from python.orm.signal_bot.base import SignalBotBase
from python.orm.van_inventory.base import VanInventoryBase
__all__ = [
"DataScienceDevBase",
"RichieBase",
"SignalBotBase",
"VanInventoryBase",
]
+24 -2
View File
@@ -31,8 +31,24 @@ def get_connection_info(name: str) -> tuple[str, str, str, str, str | None]:
return cast("tuple[str, str, str, str, str | None]", (database, host, port, username, password))
def get_postgres_engine(*, name: str = "POSTGRES", pool_pre_ping: bool = True) -> Engine:
"""Create a SQLAlchemy engine from environment variables."""
def get_postgres_engine(
*,
name: str = "POSTGRES",
pool_pre_ping: bool = True,
vector_engine: bool = False,
) -> Engine:
"""Create a SQLAlchemy engine from environment variables.
Args:
name (str, optional): The name of the environment variable prefix. Defaults to "POSTGRES".
pool_pre_ping (bool, optional): Whether to ping the database before each connection. Defaults to True.
This fixes the issue of trying to use a conection that has timed out on the database side.
vector_engine (bool, optional): Whether to use the vector search schema. Defaults to False.
This updates the search path the incldued the vecore types and operators.
Returns:
Engine: The SQLAlchemy engine.
"""
database, host, port, username, password = get_connection_info(name)
url = URL.create(
@@ -44,8 +60,14 @@ def get_postgres_engine(*, name: str = "POSTGRES", pool_pre_ping: bool = True) -
database=database,
)
connect_args = {}
# There more better way to do this is with separate PG account and a dedicated vector schema for the vector types
if vector_engine:
connect_args["options"] = "-csearch_path=main,public"
return create_engine(
url=url,
pool_pre_ping=pool_pre_ping,
pool_recycle=1800,
connect_args=connect_args,
)
-11
View File
@@ -1,11 +0,0 @@
"""Data science dev database ORM exports."""
from __future__ import annotations
from python.orm.data_science_dev.base import DataScienceDevBase, DataScienceDevTableBase, DataScienceDevTableBaseBig
__all__ = [
"DataScienceDevBase",
"DataScienceDevTableBase",
"DataScienceDevTableBaseBig",
]
-52
View File
@@ -1,52 +0,0 @@
"""Data science dev database ORM base."""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import BigInteger, DateTime, MetaData, func
from sqlalchemy.ext.declarative import AbstractConcreteBase
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from python.orm.common import NAMING_CONVENTION
class DataScienceDevBase(DeclarativeBase):
"""Base class for data_science_dev database ORM models."""
schema_name = "main"
metadata = MetaData(
schema=schema_name,
naming_convention=NAMING_CONVENTION,
)
class _TableMixin:
"""Shared timestamp columns for all table bases."""
created: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
server_default=func.now(),
)
updated: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
server_default=func.now(),
onupdate=func.now(),
)
class DataScienceDevTableBase(_TableMixin, AbstractConcreteBase, DataScienceDevBase):
"""Table with Integer primary key."""
__abstract__ = True
id: Mapped[int] = mapped_column(primary_key=True)
class DataScienceDevTableBaseBig(_TableMixin, AbstractConcreteBase, DataScienceDevBase):
"""Table with BigInteger primary key."""
__abstract__ = True
id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
-10
View File
@@ -1,10 +0,0 @@
"""Data science dev database ORM models."""
from __future__ import annotations
from python.orm.data_science_dev.posts import partitions # noqa: F401 — registers partition classes in metadata
from python.orm.data_science_dev.posts.tables import Posts
__all__ = [
"Posts",
]
@@ -1,11 +0,0 @@
"""Posts module — weekly-partitioned posts table and partition ORM models."""
from __future__ import annotations
from python.orm.data_science_dev.posts.failed_ingestion import FailedIngestion
from python.orm.data_science_dev.posts.tables import Posts
__all__ = [
"FailedIngestion",
"Posts",
]
@@ -1,33 +0,0 @@
"""Shared column definitions for the posts partitioned table family."""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import BigInteger, SmallInteger, Text
from sqlalchemy.orm import Mapped, mapped_column
class PostsColumns:
"""Mixin providing all posts columns. Used by both the parent table and partitions."""
post_id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
user_id: Mapped[int] = mapped_column(BigInteger)
instance: Mapped[str]
date: Mapped[datetime] = mapped_column(primary_key=True)
text: Mapped[str] = mapped_column(Text)
langs: Mapped[str | None]
like_count: Mapped[int]
reply_count: Mapped[int]
repost_count: Mapped[int]
reply_to: Mapped[int | None] = mapped_column(BigInteger)
replied_author: Mapped[int | None] = mapped_column(BigInteger)
thread_root: Mapped[int | None] = mapped_column(BigInteger)
thread_root_author: Mapped[int | None] = mapped_column(BigInteger)
repost_from: Mapped[int | None] = mapped_column(BigInteger)
reposted_author: Mapped[int | None] = mapped_column(BigInteger)
quotes: Mapped[int | None] = mapped_column(BigInteger)
quoted_author: Mapped[int | None] = mapped_column(BigInteger)
labels: Mapped[str | None]
sent_label: Mapped[int | None] = mapped_column(SmallInteger)
sent_score: Mapped[float | None]
@@ -1,17 +0,0 @@
"""Table for storing JSONL lines that failed during post ingestion."""
from __future__ import annotations
from sqlalchemy import Text
from sqlalchemy.orm import Mapped, mapped_column
from python.orm.data_science_dev.base import DataScienceDevTableBase
class FailedIngestion(DataScienceDevTableBase):
"""Stores raw JSONL lines and their error messages when ingestion fails."""
__tablename__ = "failed_ingestion"
raw_line: Mapped[str] = mapped_column(Text)
error: Mapped[str] = mapped_column(Text)
@@ -1,71 +0,0 @@
"""Dynamically generated ORM classes for each weekly partition of the posts table.
Each class maps to a PostgreSQL partition table (e.g. posts_2024_01).
These are real ORM models tracked by Alembic autogenerate.
Uses ISO week numbering (datetime.isocalendar().week). ISO years can have
52 or 53 weeks, and week boundaries are always Monday to Monday.
"""
from __future__ import annotations
import sys
from datetime import UTC, datetime
from python.orm.data_science_dev.base import DataScienceDevBase
from python.orm.data_science_dev.posts.columns import PostsColumns
PARTITION_START_YEAR = 2023
PARTITION_END_YEAR = 2026
_current_module = sys.modules[__name__]
def iso_weeks_in_year(year: int) -> int:
"""Return the number of ISO weeks in a given year (52 or 53)."""
dec_28 = datetime(year, 12, 28, tzinfo=UTC)
return dec_28.isocalendar().week
def week_bounds(year: int, week: int) -> tuple[datetime, datetime]:
"""Return (start, end) datetimes for an ISO week.
Start = Monday 00:00:00 UTC of the given ISO week.
End = Monday 00:00:00 UTC of the following ISO week.
"""
start = datetime.fromisocalendar(year, week, 1).replace(tzinfo=UTC)
if week < iso_weeks_in_year(year):
end = datetime.fromisocalendar(year, week + 1, 1).replace(tzinfo=UTC)
else:
end = datetime.fromisocalendar(year + 1, 1, 1).replace(tzinfo=UTC)
return start, end
def _build_partition_classes() -> dict[str, type]:
"""Generate one ORM class per ISO week partition."""
classes: dict[str, type] = {}
for year in range(PARTITION_START_YEAR, PARTITION_END_YEAR + 1):
for week in range(1, iso_weeks_in_year(year) + 1):
class_name = f"PostsWeek{year}W{week:02d}"
table_name = f"posts_{year}_{week:02d}"
partition_class = type(
class_name,
(PostsColumns, DataScienceDevBase),
{
"__tablename__": table_name,
"__table_args__": ({"implicit_returning": False},),
},
)
classes[class_name] = partition_class
return classes
# Generate all partition classes and register them on this module
_partition_classes = _build_partition_classes()
for _name, _cls in _partition_classes.items():
setattr(_current_module, _name, _cls)
__all__ = list(_partition_classes.keys())
@@ -1,13 +0,0 @@
"""Posts parent table with PostgreSQL weekly range partitioning on date column."""
from __future__ import annotations
from python.orm.data_science_dev.base import DataScienceDevBase
from python.orm.data_science_dev.posts.columns import PostsColumns
class Posts(PostsColumns, DataScienceDevBase):
"""Parent partitioned table for posts, partitioned by week on `date`."""
__tablename__ = "posts"
__table_args__ = ({"postgresql_partition_by": "RANGE (date)"},)
+20 -5
View File
@@ -2,8 +2,8 @@
from __future__ import annotations
from python.orm.richie.audiobook import Audiobook, AudiobookAuthor, AudiobookSeries
from python.orm.richie.base import RichieBase, TableBase, TableBaseBig, TableBaseSmall
from python.orm.richie.congress import Bill, Legislator, Vote, VoteRecord
from python.orm.richie.contact import (
Contact,
ContactNeed,
@@ -11,19 +11,34 @@ from python.orm.richie.contact import (
Need,
RelationshipType,
)
from python.orm.richie.ebook import (
EbookChapter,
EbookChunk,
EbookChunkEmbedding1024,
EbookChunkEmbedding2560,
EbookChunkEmbedding4096,
EbookEmbeddingModel,
EbookSource,
)
__all__ = [
"Bill",
"Audiobook",
"AudiobookAuthor",
"AudiobookSeries",
"Contact",
"ContactNeed",
"ContactRelationship",
"Legislator",
"EbookChapter",
"EbookChunk",
"EbookChunkEmbedding1024",
"EbookChunkEmbedding2560",
"EbookChunkEmbedding4096",
"EbookEmbeddingModel",
"EbookSource",
"Need",
"RelationshipType",
"RichieBase",
"TableBase",
"TableBaseBig",
"TableBaseSmall",
"Vote",
"VoteRecord",
]
+55
View File
@@ -0,0 +1,55 @@
"""Audiobook catalog models."""
from __future__ import annotations
from sqlalchemy import ForeignKey, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.richie.base import TableBase
class AudiobookAuthor(TableBase):
"""Canonical audiobook author."""
__tablename__ = "audiobook_author"
__table_args__ = (UniqueConstraint("name"),)
name: Mapped[str]
books: Mapped[list[Audiobook]] = relationship("Audiobook", back_populates="author")
series: Mapped[list[AudiobookSeries]] = relationship("AudiobookSeries", back_populates="author")
class AudiobookSeries(TableBase):
"""Canonical audiobook series."""
__tablename__ = "audiobook_series"
__table_args__ = (UniqueConstraint("author_id", "name"),)
name: Mapped[str]
author_id: Mapped[int] = mapped_column(ForeignKey("main.audiobook_author.id", ondelete="CASCADE"))
author: Mapped[AudiobookAuthor] = relationship("AudiobookAuthor", back_populates="series")
books: Mapped[list[Audiobook]] = relationship("Audiobook", back_populates="series")
class Audiobook(TableBase):
"""Canonical audiobook title."""
__tablename__ = "audiobook"
__table_args__ = (
UniqueConstraint(
"author_id",
"series_id",
"title",
postgresql_nulls_not_distinct=True,
),
)
title: Mapped[str]
author_id: Mapped[int] = mapped_column(ForeignKey("main.audiobook_author.id", ondelete="CASCADE"))
series_id: Mapped[int | None] = mapped_column(ForeignKey("main.audiobook_series.id", ondelete="SET NULL"))
series_index: Mapped[float] = mapped_column(default=0.0)
author: Mapped[AudiobookAuthor] = relationship("AudiobookAuthor", back_populates="books")
series: Mapped[AudiobookSeries | None] = relationship("AudiobookSeries", back_populates="books")
-150
View File
@@ -1,150 +0,0 @@
"""Congress Tracker database models."""
from __future__ import annotations
from datetime import date
from sqlalchemy import ForeignKey, Index, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.richie.base import RichieBase, TableBase
class Legislator(TableBase):
"""Legislator model - members of Congress."""
__tablename__ = "legislator"
# Natural key - bioguide ID is the authoritative identifier
bioguide_id: Mapped[str] = mapped_column(Text, unique=True, index=True)
# Other IDs for cross-referencing
thomas_id: Mapped[str | None]
lis_id: Mapped[str | None]
govtrack_id: Mapped[int | None]
opensecrets_id: Mapped[str | None]
fec_ids: Mapped[str | None] # JSON array stored as string
# Name info
first_name: Mapped[str]
last_name: Mapped[str]
official_full_name: Mapped[str | None]
nickname: Mapped[str | None]
# Bio
birthday: Mapped[date | None]
gender: Mapped[str | None] # M/F
# Current term info (denormalized for query efficiency)
current_party: Mapped[str | None]
current_state: Mapped[str | None]
current_district: Mapped[int | None] # House only
current_chamber: Mapped[str | None] # rep/sen
# Relationships
vote_records: Mapped[list[VoteRecord]] = relationship(
"VoteRecord",
back_populates="legislator",
cascade="all, delete-orphan",
)
class Bill(TableBase):
"""Bill model - legislation introduced in Congress."""
__tablename__ = "bill"
# Composite natural key: congress + bill_type + number
congress: Mapped[int]
bill_type: Mapped[str] # hr, s, hres, sres, hjres, sjres
number: Mapped[int]
# Bill info
title: Mapped[str | None]
title_short: Mapped[str | None]
official_title: Mapped[str | None]
# Status
status: Mapped[str | None]
status_at: Mapped[date | None]
# Sponsor
sponsor_bioguide_id: Mapped[str | None]
# Subjects
subjects_top_term: Mapped[str | None]
# Relationships
votes: Mapped[list[Vote]] = relationship(
"Vote",
back_populates="bill",
)
__table_args__ = (
UniqueConstraint("congress", "bill_type", "number", name="uq_bill_congress_type_number"),
Index("ix_bill_congress", "congress"),
)
class Vote(TableBase):
"""Vote model - roll call votes in Congress."""
__tablename__ = "vote"
# Composite natural key: congress + chamber + session + number
congress: Mapped[int]
chamber: Mapped[str] # house/senate
session: Mapped[int]
number: Mapped[int]
# Vote details
vote_type: Mapped[str | None]
question: Mapped[str | None]
result: Mapped[str | None]
result_text: Mapped[str | None]
# Timing
vote_date: Mapped[date]
# Vote counts (denormalized for efficiency)
yea_count: Mapped[int | None]
nay_count: Mapped[int | None]
not_voting_count: Mapped[int | None]
present_count: Mapped[int | None]
# Related bill (optional - not all votes are on bills)
bill_id: Mapped[int | None] = mapped_column(ForeignKey("main.bill.id"))
# Relationships
bill: Mapped[Bill | None] = relationship("Bill", back_populates="votes")
vote_records: Mapped[list[VoteRecord]] = relationship(
"VoteRecord",
back_populates="vote",
cascade="all, delete-orphan",
)
__table_args__ = (
UniqueConstraint("congress", "chamber", "session", "number", name="uq_vote_congress_chamber_session_number"),
Index("ix_vote_date", "vote_date"),
Index("ix_vote_congress_chamber", "congress", "chamber"),
)
class VoteRecord(RichieBase):
"""Association table: Vote <-> Legislator with position."""
__tablename__ = "vote_record"
vote_id: Mapped[int] = mapped_column(
ForeignKey("main.vote.id", ondelete="CASCADE"),
primary_key=True,
)
legislator_id: Mapped[int] = mapped_column(
ForeignKey("main.legislator.id", ondelete="CASCADE"),
primary_key=True,
)
position: Mapped[str] # Yea, Nay, Not Voting, Present
# Relationships
vote: Mapped[Vote] = relationship("Vote", back_populates="vote_records")
legislator: Mapped[Legislator] = relationship("Legislator", back_populates="vote_records")
+138
View File
@@ -0,0 +1,138 @@
"""EPUB search models."""
from __future__ import annotations
from datetime import datetime
from pgvector.sqlalchemy import Vector
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, Index, String, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.richie.base import TableBase, TableBaseBig
class EbookSource(TableBase):
"""One indexed EPUB file."""
__tablename__ = "ebook_source"
__table_args__ = (
UniqueConstraint("file_path"),
UniqueConstraint("file_sha256"),
)
title: Mapped[str]
author: Mapped[str | None]
language: Mapped[str | None]
publisher: Mapped[str | None]
identifier: Mapped[str | None]
file_path: Mapped[str]
file_sha256: Mapped[str] = mapped_column(String(64))
file_mtime: Mapped[datetime] = mapped_column(DateTime(timezone=True))
file_size: Mapped[int] = mapped_column(BigInteger)
chapters: Mapped[list[EbookChapter]] = relationship(
"EbookChapter",
back_populates="source",
cascade="all, delete-orphan",
passive_deletes=True,
)
chunks: Mapped[list[EbookChunk]] = relationship(
"EbookChunk",
back_populates="source",
cascade="all, delete-orphan",
passive_deletes=True,
)
class EbookChapter(TableBase):
"""A chapter or spine document inside an EPUB."""
__tablename__ = "ebook_chapter"
__table_args__ = (UniqueConstraint("source_id", "spine_index"),)
source_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_source.id", ondelete="CASCADE"))
spine_index: Mapped[int]
title: Mapped[str | None]
href: Mapped[str | None]
source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chapters")
chunks: Mapped[list[EbookChunk]] = relationship(
"EbookChunk",
back_populates="chapter",
cascade="all, delete-orphan",
passive_deletes=True,
)
class EbookChunk(TableBaseBig):
"""A searchable text chunk."""
__tablename__ = "ebook_chunk"
__table_args__ = (
UniqueConstraint("source_id", "chunk_index", name="uq_ebook_chunk_source_id_chunk_index"),
UniqueConstraint("source_id", "content_sha256", name="uq_ebook_chunk_source_id_content_sha256"),
)
source_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_source.id", ondelete="CASCADE"))
chapter_id: Mapped[int | None] = mapped_column(ForeignKey("main.ebook_chapter.id", ondelete="SET NULL"))
chunk_index: Mapped[int]
text: Mapped[str]
token_start: Mapped[int]
token_count: Mapped[int]
page_label: Mapped[str | None]
content_sha256: Mapped[str] = mapped_column(String(64))
search_text: Mapped[str]
source: Mapped[EbookSource] = relationship("EbookSource", back_populates="chunks")
chapter: Mapped[EbookChapter | None] = relationship("EbookChapter", back_populates="chunks")
class EbookEmbeddingModel(TableBase):
"""A supported embedding model."""
__tablename__ = "ebook_embedding_model"
name: Mapped[str] = mapped_column(String, unique=True)
dimension: Mapped[int]
is_default: Mapped[bool] = mapped_column(Boolean, default=False)
class EbookChunkEmbedding1024(TableBaseBig):
"""1024-dimensional chunk embedding."""
__tablename__ = "ebook_chunk_embedding_1024"
__table_args__ = (
UniqueConstraint("chunk_id", "model_id"),
Index(
"ix_ebook_chunk_embedding_1024_embedding_cosine",
"embedding",
postgresql_using="hnsw",
postgresql_ops={"embedding": "vector_cosine_ops"},
),
)
chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"))
model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"))
embedding: Mapped[list[float]] = mapped_column(Vector(1024))
class EbookChunkEmbedding2560(TableBaseBig):
"""2560-dimensional chunk embedding."""
__tablename__ = "ebook_chunk_embedding_2560"
__table_args__ = (UniqueConstraint("chunk_id", "model_id"),)
chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"))
model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"))
embedding: Mapped[list[float]] = mapped_column(Vector(2560))
class EbookChunkEmbedding4096(TableBaseBig):
"""4096-dimensional chunk embedding."""
__tablename__ = "ebook_chunk_embedding_4096"
__table_args__ = (UniqueConstraint("chunk_id", "model_id"),)
chunk_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_chunk.id", ondelete="CASCADE"))
model_id: Mapped[int] = mapped_column(ForeignKey("main.ebook_embedding_model.id", ondelete="CASCADE"))
embedding: Mapped[list[float]] = mapped_column(Vector(4096))
-16
View File
@@ -1,16 +0,0 @@
"""Signal bot database ORM exports."""
from __future__ import annotations
from python.orm.signal_bot.base import SignalBotBase, SignalBotTableBase, SignalBotTableBaseSmall
from python.orm.signal_bot.models import DeadLetterMessage, DeviceRole, RoleRecord, SignalDevice
__all__ = [
"DeadLetterMessage",
"DeviceRole",
"RoleRecord",
"SignalBotBase",
"SignalBotTableBase",
"SignalBotTableBaseSmall",
"SignalDevice",
]
-52
View File
@@ -1,52 +0,0 @@
"""Signal bot database ORM base."""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import DateTime, MetaData, SmallInteger, func
from sqlalchemy.ext.declarative import AbstractConcreteBase
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from python.orm.common import NAMING_CONVENTION
class SignalBotBase(DeclarativeBase):
"""Base class for signal_bot database ORM models."""
schema_name = "main"
metadata = MetaData(
schema=schema_name,
naming_convention=NAMING_CONVENTION,
)
class _TableMixin:
"""Shared timestamp columns for all table bases."""
created: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
server_default=func.now(),
)
updated: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
server_default=func.now(),
onupdate=func.now(),
)
class SignalBotTableBaseSmall(_TableMixin, AbstractConcreteBase, SignalBotBase):
"""Table with SmallInteger primary key."""
__abstract__ = True
id: Mapped[int] = mapped_column(SmallInteger, primary_key=True)
class SignalBotTableBase(_TableMixin, AbstractConcreteBase, SignalBotBase):
"""Table with Integer primary key."""
__abstract__ = True
id: Mapped[int] = mapped_column(primary_key=True)
-62
View File
@@ -1,62 +0,0 @@
"""Signal bot device, role, and dead letter ORM models."""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import DateTime, Enum, ForeignKey, SmallInteger, String, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.signal_bot.base import SignalBotTableBase, SignalBotTableBaseSmall
from python.signal_bot.models import MessageStatus, TrustLevel
class RoleRecord(SignalBotTableBaseSmall):
"""Lookup table for RBAC roles, keyed by smallint."""
__tablename__ = "role"
name: Mapped[str] = mapped_column(String(50), unique=True)
class DeviceRole(SignalBotTableBase):
"""Association between a device and a role."""
__tablename__ = "device_role"
__table_args__ = (
UniqueConstraint("device_id", "role_id", name="uq_device_role_device_role"),
{"schema": "main"},
)
device_id: Mapped[int] = mapped_column(ForeignKey("main.signal_device.id"))
role_id: Mapped[int] = mapped_column(SmallInteger, ForeignKey("main.role.id"))
class SignalDevice(SignalBotTableBase):
"""A Signal device tracked by phone number and safety number."""
__tablename__ = "signal_device"
phone_number: Mapped[str] = mapped_column(String(50), unique=True)
safety_number: Mapped[str | None]
trust_level: Mapped[TrustLevel] = mapped_column(
Enum(TrustLevel, name="trust_level", create_constraint=False, native_enum=False),
default=TrustLevel.UNVERIFIED,
)
last_seen: Mapped[datetime] = mapped_column(DateTime(timezone=True))
roles: Mapped[list[RoleRecord]] = relationship(secondary=DeviceRole.__table__)
class DeadLetterMessage(SignalBotTableBase):
"""A Signal message that failed processing and was sent to the dead letter queue."""
__tablename__ = "dead_letter_message"
source: Mapped[str]
message: Mapped[str] = mapped_column(Text)
received_at: Mapped[datetime] = mapped_column(DateTime(timezone=True))
status: Mapped[MessageStatus] = mapped_column(
Enum(MessageStatus, name="message_status", create_constraint=False, native_enum=False),
default=MessageStatus.UNPROCESSED,
)
-1
View File
@@ -1 +0,0 @@
"""Van inventory database ORM exports."""
-39
View File
@@ -1,39 +0,0 @@
"""Van inventory database ORM base."""
from __future__ import annotations
from datetime import datetime
from sqlalchemy import DateTime, MetaData, func
from sqlalchemy.ext.declarative import AbstractConcreteBase
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from python.orm.common import NAMING_CONVENTION
class VanInventoryBase(DeclarativeBase):
"""Base class for van_inventory database ORM models."""
schema_name = "main"
metadata = MetaData(
schema=schema_name,
naming_convention=NAMING_CONVENTION,
)
class VanTableBase(AbstractConcreteBase, VanInventoryBase):
"""Abstract concrete base for van_inventory tables with IDs and timestamps."""
__abstract__ = True
id: Mapped[int] = mapped_column(primary_key=True)
created: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
server_default=func.now(),
)
updated: Mapped[datetime] = mapped_column(
DateTime(timezone=True),
server_default=func.now(),
onupdate=func.now(),
)
-46
View File
@@ -1,46 +0,0 @@
"""Van inventory ORM models."""
from __future__ import annotations
from sqlalchemy import ForeignKey, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from python.orm.van_inventory.base import VanTableBase
class Item(VanTableBase):
"""A food item in the van."""
__tablename__ = "items"
name: Mapped[str] = mapped_column(unique=True)
quantity: Mapped[float] = mapped_column(default=0)
unit: Mapped[str]
category: Mapped[str | None]
meal_ingredients: Mapped[list[MealIngredient]] = relationship(back_populates="item")
class Meal(VanTableBase):
"""A meal that can be made from items in the van."""
__tablename__ = "meals"
name: Mapped[str] = mapped_column(unique=True)
instructions: Mapped[str | None]
ingredients: Mapped[list[MealIngredient]] = relationship(back_populates="meal")
class MealIngredient(VanTableBase):
"""Links a meal to the items it requires, with quantities."""
__tablename__ = "meal_ingredients"
__table_args__ = (UniqueConstraint("meal_id", "item_id"),)
meal_id: Mapped[int] = mapped_column(ForeignKey("meals.id"))
item_id: Mapped[int] = mapped_column(ForeignKey("items.id"))
quantity_needed: Mapped[float]
meal: Mapped[Meal] = relationship(back_populates="ingredients")
item: Mapped[Item] = relationship(back_populates="meal_ingredients")
-1
View File
@@ -1 +0,0 @@
"""Signal command and control bot."""
-1
View File
@@ -1 +0,0 @@
"""Signal bot commands."""
-137
View File
@@ -1,137 +0,0 @@
"""Van inventory command — parse receipts and item lists via LLM, push to API."""
from __future__ import annotations
import json
import logging
from typing import TYPE_CHECKING, Any
import httpx
from python.signal_bot.models import InventoryItem, InventoryUpdate
if TYPE_CHECKING:
from python.signal_bot.llm_client import LLMClient
from python.signal_bot.models import SignalMessage
from python.signal_bot.signal_client import SignalClient
logger = logging.getLogger(__name__)
SYSTEM_PROMPT = """\
You are an inventory assistant. Extract items from the input and return ONLY
a JSON array. Each element must have these fields:
- "name": item name (string)
- "quantity": numeric count or amount (default 1)
- "unit": unit of measure (e.g. "each", "lb", "oz", "gallon", "bag", "box")
- "category": category like "food", "tools", "supplies", etc.
- "notes": any extra detail (empty string if none)
Example output:
[{"name": "water bottles", "quantity": 6, "unit": "gallon", "category": "supplies", "notes": "1 gallon each"}]
Return ONLY the JSON array, no other text.\
"""
IMAGE_PROMPT = "Extract all items from this receipt or inventory photo."
TEXT_PROMPT = "Extract all items from this inventory list."
def parse_llm_response(raw: str) -> list[InventoryItem]:
"""Parse the LLM JSON response into InventoryItem list."""
text = raw.strip()
# Strip markdown code fences if present
if text.startswith("```"):
lines = text.split("\n")
lines = [line for line in lines if not line.startswith("```")]
text = "\n".join(lines)
items_data: list[dict[str, Any]] = json.loads(text)
return [InventoryItem.model_validate(item) for item in items_data]
def _upsert_item(api_url: str, item: InventoryItem) -> None:
"""Create or update an item via the van_inventory API.
Fetches existing items, and if one with the same name exists,
patches its quantity (summing). Otherwise creates a new item.
"""
base = api_url.rstrip("/")
response = httpx.get(f"{base}/api/items", timeout=10)
response.raise_for_status()
existing: list[dict[str, Any]] = response.json()
match = next((e for e in existing if e["name"].lower() == item.name.lower()), None)
if match:
new_qty = match["quantity"] + item.quantity
patch = {"quantity": new_qty}
if item.category:
patch["category"] = item.category
response = httpx.patch(f"{base}/api/items/{match['id']}", json=patch, timeout=10)
response.raise_for_status()
return
payload = {
"name": item.name,
"quantity": item.quantity,
"unit": item.unit,
"category": item.category or None,
}
response = httpx.post(f"{base}/api/items", json=payload, timeout=10)
response.raise_for_status()
def handle_inventory_update(
message: SignalMessage,
signal: SignalClient,
llm: LLMClient,
api_url: str,
) -> InventoryUpdate:
"""Process an inventory update from a Signal message.
Accepts either an image (receipt photo) or text list.
Uses the LLM to extract structured items, then pushes to the van_inventory API.
"""
try:
logger.info(f"Processing inventory update from {message.source}")
if message.attachments:
image_data = signal.get_attachment(message.attachments[0])
raw_response = llm.chat(
IMAGE_PROMPT,
image_data=image_data,
system=SYSTEM_PROMPT,
)
source_type = "receipt_photo"
elif message.message.strip():
raw_response = llm.chat(
f"{TEXT_PROMPT}\n\n{message.message}",
system=SYSTEM_PROMPT,
)
source_type = "text_list"
else:
signal.reply(message, "Send a photo of a receipt or a text list of items to update inventory.")
return InventoryUpdate()
logger.info(f"{raw_response=}")
new_items = parse_llm_response(raw_response)
logger.info(f"{new_items=}")
for item in new_items:
_upsert_item(api_url, item)
summary = _format_summary(new_items)
signal.reply(message, f"Inventory updated with {len(new_items)} item(s):\n{summary}")
return InventoryUpdate(items=new_items, raw_response=raw_response, source_type=source_type)
except Exception:
logger.exception("Failed to process inventory update")
signal.reply(message, "Failed to process inventory update. Check logs for details.")
return InventoryUpdate()
def _format_summary(items: list[InventoryItem]) -> str:
"""Format items into a readable summary."""
lines = [f" - {item.name} x{item.quantity} {item.unit} [{item.category}]" for item in items]
return "\n".join(lines)

Some files were not shown because too many files have changed in this diff Show More