"""SQLite database for podcast-mindmap: paragraphs, quotes, embeddings.""" import json import sqlite3 import os import numpy as np from typing import Optional DB_PATH = os.environ.get("DB_PATH", "/data/db.sqlite") def get_db(): db = sqlite3.connect(DB_PATH) db.row_factory = sqlite3.Row db.execute("PRAGMA journal_mode=WAL") return db def init_db(): db = get_db() db.executescript(""" CREATE TABLE IF NOT EXISTS podcasts ( id TEXT PRIMARY KEY, name TEXT NOT NULL, host TEXT, description TEXT, config_json TEXT ); CREATE TABLE IF NOT EXISTS episodes ( id TEXT NOT NULL, podcast_id TEXT NOT NULL, title TEXT NOT NULL, guest TEXT, staffel INTEGER, youtube_id TEXT, audio_file TEXT, PRIMARY KEY (podcast_id, id), FOREIGN KEY (podcast_id) REFERENCES podcasts(id) ); CREATE TABLE IF NOT EXISTS paragraphs ( id INTEGER PRIMARY KEY AUTOINCREMENT, podcast_id TEXT NOT NULL, episode_id TEXT NOT NULL, idx INTEGER NOT NULL, start_time REAL, end_time REAL, text TEXT NOT NULL, embedding BLOB, UNIQUE(podcast_id, episode_id, idx), FOREIGN KEY (podcast_id) REFERENCES podcasts(id) ); CREATE TABLE IF NOT EXISTS topics ( paragraph_id INTEGER NOT NULL, tag TEXT NOT NULL, score REAL DEFAULT 1.0, PRIMARY KEY (paragraph_id, tag), FOREIGN KEY (paragraph_id) REFERENCES paragraphs(id) ); CREATE TABLE IF NOT EXISTS quotes ( id TEXT NOT NULL, podcast_id TEXT NOT NULL, episode_id TEXT NOT NULL, text TEXT NOT NULL, verbatim TEXT, speaker TEXT, start_time REAL, end_time REAL, is_top_quote BOOLEAN DEFAULT 0, themes_json TEXT DEFAULT '[]', PRIMARY KEY (podcast_id, id), FOREIGN KEY (podcast_id) REFERENCES podcasts(id) ); CREATE TABLE IF NOT EXISTS staffeln ( id INTEGER NOT NULL, podcast_id TEXT NOT NULL, name TEXT NOT NULL, color TEXT DEFAULT '#666', PRIMARY KEY (podcast_id, id), FOREIGN KEY (podcast_id) REFERENCES podcasts(id) ); CREATE TABLE IF NOT EXISTS themes ( id TEXT NOT NULL, podcast_id TEXT NOT NULL, label TEXT NOT NULL, description TEXT, color TEXT DEFAULT '#666', episodes_json TEXT DEFAULT '[]', PRIMARY KEY (podcast_id, id), FOREIGN KEY (podcast_id) REFERENCES podcasts(id) ); CREATE INDEX IF NOT EXISTS idx_paragraphs_podcast ON paragraphs(podcast_id, episode_id); CREATE INDEX IF NOT EXISTS idx_quotes_podcast ON quotes(podcast_id, episode_id); CREATE INDEX IF NOT EXISTS idx_topics_tag ON topics(tag); """) db.commit() db.close() def import_podcast(podcast_id: str, mindmap_data: dict, srt_index: dict): """Import a podcast's data from mindmap_data.json + srt_index.json into the DB.""" db = get_db() # Podcast db.execute( "INSERT OR REPLACE INTO podcasts (id, name, host, description) VALUES (?, ?, ?, ?)", (podcast_id, mindmap_data.get("name", ""), mindmap_data.get("host", ""), mindmap_data.get("description", "")) ) # Staffeln for s in mindmap_data.get("staffeln", []): db.execute( "INSERT OR REPLACE INTO staffeln (id, podcast_id, name, color) VALUES (?, ?, ?, ?)", (s["id"], podcast_id, s["name"], s.get("color", "#666")) ) # Themes for t in mindmap_data.get("themes", []): db.execute( "INSERT OR REPLACE INTO themes (id, podcast_id, label, description, color, episodes_json) VALUES (?, ?, ?, ?, ?, ?)", (t["id"], podcast_id, t["label"], t.get("description", ""), t.get("color", "#666"), json.dumps(t.get("episodes", []))) ) # Episodes for ep in mindmap_data.get("episodes", []): db.execute( "INSERT OR REPLACE INTO episodes (id, podcast_id, title, guest, staffel, audio_file) VALUES (?, ?, ?, ?, ?, ?)", (ep["id"], podcast_id, ep["title"], ep.get("guest", ""), ep.get("staffel"), ep.get("audioFile")) ) # Quotes for q in mindmap_data.get("quotes", []): db.execute( "INSERT OR REPLACE INTO quotes (id, podcast_id, episode_id, text, verbatim, speaker, start_time, end_time, is_top_quote, themes_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", (q["id"], podcast_id, q["episode"], q["text"], q.get("verbatim"), q.get("speaker", ""), q.get("startTime"), q.get("endTime"), q.get("isTopQuote", False), json.dumps(q.get("themes", []))) ) # Paragraphs from srt_index for ep_key, ep_data in srt_index.items(): ep_id = ep_key.split("-")[0] # S1E1-Wachstum → S1E1 for i, p in enumerate(ep_data.get("paragraphs", [])): db.execute( "INSERT OR REPLACE INTO paragraphs (podcast_id, episode_id, idx, start_time, end_time, text) VALUES (?, ?, ?, ?, ?, ?)", (podcast_id, ep_id, i, p["start"], p["end"], p["text"]) ) db.commit() db.close() def get_all_embeddings(podcast_id: Optional[str] = None): """Load all embeddings as numpy array + metadata.""" db = get_db() if podcast_id: rows = db.execute( "SELECT id, podcast_id, episode_id, idx, embedding FROM paragraphs WHERE podcast_id = ? AND embedding IS NOT NULL", (podcast_id,) ).fetchall() else: rows = db.execute( "SELECT id, podcast_id, episode_id, idx, embedding FROM paragraphs WHERE embedding IS NOT NULL" ).fetchall() db.close() if not rows: return None, [] meta = [{"id": r["id"], "podcast_id": r["podcast_id"], "episode_id": r["episode_id"], "idx": r["idx"]} for r in rows] vectors = np.array([np.frombuffer(r["embedding"], dtype=np.float32) for r in rows]) # Normalize for cosine similarity norms = np.linalg.norm(vectors, axis=1, keepdims=True) norms[norms == 0] = 1 vectors = vectors / norms return vectors, meta def store_embedding(paragraph_id: int, embedding: list[float]): """Store embedding as binary blob.""" db = get_db() blob = np.array(embedding, dtype=np.float32).tobytes() db.execute("UPDATE paragraphs SET embedding = ? WHERE id = ?", (blob, paragraph_id)) db.commit() db.close()