setting up bluesky firehose

This commit is contained in:
2026-03-26 08:09:32 -04:00
parent e4d5f342be
commit 0ed7be036e
6 changed files with 777 additions and 0 deletions

View File

@@ -0,0 +1,104 @@
"""Utilities for converting Bluesky identifiers to numeric database IDs.
Handles DID-to-user_id hashing, TID-to-post_id decoding, and AT-URI parsing.
"""
from __future__ import annotations
import hashlib
TID_CHARSET = "234567abcdefghijklmnopqrstuvwxyz"
_TID_LENGTH = 13
_BIGINT_MASK = 0x7FFFFFFFFFFFFFFF
_AT_URI_SEGMENT_COUNT = 3
def did_to_user_id(did: str) -> int:
"""Convert a DID string to a deterministic 63-bit integer for user_id.
Uses SHA-256, truncated to 63 bits (positive signed BigInteger range).
Collision probability is negligible at Bluesky's scale (~tens of millions of users).
Args:
did: A Bluesky DID string, e.g. "did:plc:abc123".
Returns:
A positive 63-bit integer suitable for BigInteger storage.
"""
digest = hashlib.sha256(did.encode()).digest()
return int.from_bytes(digest[:8], "big") & _BIGINT_MASK
def tid_to_integer(tid: str) -> int:
"""Decode a Bluesky TID (base32-sortbase) into a 64-bit integer for post_id.
TIDs are 13-character, base32-sortbase encoded identifiers that encode a
microsecond timestamp plus a clock ID. They are globally unique by construction.
Args:
tid: A 13-character TID string, e.g. "3abc2defghijk".
Returns:
A positive integer suitable for BigInteger storage.
Raises:
ValueError: If the TID is malformed (wrong length or invalid characters).
"""
if len(tid) != _TID_LENGTH:
message = f"TID must be {_TID_LENGTH} characters, got {len(tid)}: {tid!r}"
raise ValueError(message)
result = 0
for char in tid:
index = TID_CHARSET.find(char)
if index == -1:
message = f"Invalid character {char!r} in TID {tid!r}"
raise ValueError(message)
result = result * 32 + index
return result
def parse_at_uri(uri: str) -> tuple[str, str, str]:
"""Parse an AT-URI into its components.
Args:
uri: An AT-URI string, e.g. "at://did:plc:abc123/app.bsky.feed.post/3abc2defghijk".
Returns:
A tuple of (did, collection, rkey).
Raises:
ValueError: If the URI doesn't have the expected format.
"""
stripped = uri.removeprefix("at://")
parts = stripped.split("/", maxsplit=2)
if len(parts) != _AT_URI_SEGMENT_COUNT:
message = f"Expected {_AT_URI_SEGMENT_COUNT} path segments in AT-URI, got {len(parts)}: {uri!r}"
raise ValueError(message)
return parts[0], parts[1], parts[2]
def post_id_from_uri(uri: str) -> int:
"""Extract and decode the post_id (TID) from an AT-URI.
Args:
uri: An AT-URI pointing to a post.
Returns:
The post_id as an integer.
"""
_did, _collection, rkey = parse_at_uri(uri)
return tid_to_integer(rkey)
def user_id_from_uri(uri: str) -> int:
"""Extract and hash the user_id (DID) from an AT-URI.
Args:
uri: An AT-URI pointing to a post.
Returns:
The user_id as an integer.
"""
did, _collection, _rkey = parse_at_uri(uri)
return did_to_user_id(did)