# dotfiles/python/data_science/bluesky_transform.py

"""Transform Bluesky Jetstream messages into rows matching the Posts table schema."""

from __future__ import annotations

import json
import logging
from datetime import datetime

from python.data_science.bluesky_ids import (
    did_to_user_id,
    post_id_from_uri,
    tid_to_integer,
    user_id_from_uri,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Value written to the "instance" column of every row built by this module.
INSTANCE = "bsky"
# Collection name identifying post records; compared against the commit's
# "collection" field and looked for inside quoted-record URIs.
POST_COLLECTION = "app.bsky.feed.post"
# Embed "$type" values that are inspected for a quoted-record reference.
EMBED_RECORD_TYPE = "app.bsky.embed.record"
EMBED_RECORD_WITH_MEDIA_TYPE = "app.bsky.embed.recordWithMedia"


def transform_jetstream_post(message: dict) -> dict:
    """Build a Posts-table row from a Jetstream post-creation message.

    The caller is expected to pass a message with kind=commit,
    operation=create and collection=app.bsky.feed.post; this function does
    not re-check that.

    Args:
        message: The full Jetstream JSON message.

    Returns:
        A dict with keys matching the Posts table columns. Engagement
        counters start at 0; reply/quote reference columns are None unless
        the record carries the corresponding reference.

    Raises:
        KeyError: If "did", "commit", "record", "rkey" or "createdAt" is
            missing from the message.
    """
    author_did = message["did"]
    commit = message["commit"]
    record = commit["record"]

    # Resolve every derived value up front, in column order.
    post_id = tid_to_integer(commit["rkey"])
    user_id = did_to_user_id(author_did)
    created_at = datetime.fromisoformat(record["createdAt"])
    body = record.get("text", "")
    langs = _extract_langs(record)
    labels = _extract_labels(record)

    row: dict = {
        "post_id": post_id,
        "user_id": user_id,
        "instance": INSTANCE,
        "date": created_at,
        "text": body,
        "langs": langs,
    }
    # Engagement counters all start at zero here.
    row.update(dict.fromkeys(("like_count", "reply_count", "repost_count"), 0))
    # Reference columns default to None; the helpers below fill in the ones
    # the record actually carries.
    row.update(
        dict.fromkeys(
            (
                "reply_to",
                "replied_author",
                "thread_root",
                "thread_root_author",
                "repost_from",
                "reposted_author",
                "quotes",
                "quoted_author",
            ),
            None,
        )
    )
    row["labels"] = labels
    row["sent_label"] = None
    row["sent_score"] = None

    # Both helpers mutate ``row`` in place.
    _extract_reply_refs(record, row)
    _extract_quote_refs(record, row)

    return row


def is_post_create(message: dict) -> bool:
    """Tell whether a Jetstream message announces a newly created post.

    Args:
        message: The full Jetstream JSON message.

    Returns:
        True only when the message kind is "commit", the commit operation is
        "create", and the commit collection equals POST_COLLECTION.
    """
    if message.get("kind") != "commit":
        return False

    payload = message.get("commit", {})
    if payload.get("operation") != "create":
        return False
    return payload.get("collection") == POST_COLLECTION


def _extract_langs(record: dict) -> str | None:
    """Extract langs array as a JSON string, or None if absent."""
    langs = record.get("langs")
    if langs is None:
        return None
    return json.dumps(langs)


def _extract_labels(record: dict) -> str | None:
    """Extract self-labels as a JSON string, or None if absent."""
    labels_obj = record.get("labels")
    if labels_obj is None:
        return None
    values = labels_obj.get("values", [])
    if not values:
        return None
    label_strings = [label.get("val", "") for label in values]
    return json.dumps(label_strings)


def _extract_reply_refs(record: dict, row: dict) -> None:
    """Fill the reply/thread columns of ``row`` from ``record["reply"]``.

    For the reply's "parent" reference this sets reply_to / replied_author;
    for its "root" reference it sets thread_root / thread_root_author. A
    reference with no (or an empty) "uri" leaves its columns untouched, as
    does a record with no "reply" at all. ``row`` is modified in place.
    """
    reply = record.get("reply")
    if reply is None:
        return

    # (reply field, post-id column, author column); parent is handled first.
    targets = (
        ("parent", "reply_to", "replied_author"),
        ("root", "thread_root", "thread_root_author"),
    )
    for field, post_column, author_column in targets:
        uri = reply.get(field, {}).get("uri")
        if not uri:
            continue
        row[post_column] = post_id_from_uri(uri)
        row[author_column] = user_id_from_uri(uri)


def _extract_quote_refs(record: dict, row: dict) -> None:
    """Fill the quote columns of ``row`` from the record's embed, if any.

    Only two embed "$type" values are considered: EMBED_RECORD_TYPE, where
    the reference is ``embed["record"]``, and EMBED_RECORD_WITH_MEDIA_TYPE,
    where it is nested one level deeper at ``embed["record"]["record"]``.
    Any other embed type, or no embed at all, leaves ``row`` unchanged.
    """
    embed = record.get("embed")
    if embed is None:
        return

    kind = embed.get("$type", "")
    if kind == EMBED_RECORD_TYPE:
        reference = embed.get("record", {})
    elif kind == EMBED_RECORD_WITH_MEDIA_TYPE:
        # The quoted record sits inside the record-with-media wrapper.
        reference = embed.get("record", {}).get("record", {})
    else:
        return

    _set_quote_from_record(reference, row)


def _set_quote_from_record(record_ref: dict, row: dict) -> None:
    """Set quotes and quoted_author on ``row`` from a record reference object.

    The reference is used only when it has a non-empty "uri" that names the
    post collection as a complete path segment (``/<POST_COLLECTION>/``).
    References to anything else leave ``row`` untouched.

    Args:
        record_ref: Reference object expected to carry a "uri" key.
        row: Row dict to update in place.
    """
    uri = record_ref.get("uri")
    if not uri:
        return

    # Match the collection as a whole path segment. A bare substring test
    # would also accept other collections whose name merely starts with the
    # post collection name (e.g. "app.bsky.feed.postgate").
    # NOTE(review): assumes URIs shaped like at://<authority>/<collection>/<rkey>.
    if f"/{POST_COLLECTION}/" not in uri:
        return

    row["quotes"] = post_id_from_uri(uri)
    row["quoted_author"] = user_id_from_uri(uri)