Phase 2: Precomputed semantic similarity + API
- precompute.py: computes the pairwise cosine similarity of all paragraphs
and stores the top 10 neighbors per paragraph in the semantic_links table
- API endpoint /api/similar-precomputed/{podcast_id}/{episode_id}/{para_idx}
returns precomputed similar passages in <1 ms
- Tested: 728 paragraphs, 7,144 links (threshold 0.55)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent b0649cea49, commit cb5978132c
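
For orientation, a minimal client call against the new endpoint (the base URL and the IDs are placeholders, not part of this commit):

# Client sketch; base URL and IDs are placeholders, not part of the commit.
import requests

BASE = "http://localhost:8000"  # assumption: local dev server

resp = requests.get(
    f"{BASE}/api/similar-precomputed/demo-podcast/demo-episode/17",
    params={"limit": 5},
)
resp.raise_for_status()
for hit in resp.json():
    print(f'{hit["score"]:.2f}  {hit["episode_title"]}  {hit["text_preview"]}')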
@@ -185,6 +185,34 @@ def find_similar(podcast_id: str, episode_id: str, para_idx: int,
    return results


@app.get("/api/similar-precomputed/{podcast_id}/{episode_id}/{para_idx}")
def get_precomputed_similar(podcast_id: str, episode_id: str, para_idx: int, limit: int = 10):
    """Get precomputed similar paragraphs (fast, no embedding computation)."""
    db = get_db()
    rows = db.execute(
        "SELECT sl.target_podcast, sl.target_episode, sl.target_idx, sl.score, "
        "p.text, p.start_time, e.title, e.guest "
        "FROM semantic_links sl "
        "JOIN paragraphs p ON sl.target_podcast = p.podcast_id AND sl.target_episode = p.episode_id AND sl.target_idx = p.idx "
        "JOIN episodes e ON sl.target_podcast = e.podcast_id AND sl.target_episode = e.id "
        "WHERE sl.podcast_id = ? AND sl.source_episode = ? AND sl.source_idx = ? "
        "ORDER BY sl.score DESC LIMIT ?",
        (podcast_id, episode_id, para_idx, limit)
    ).fetchall()
    db.close()

    return [{
        "podcast_id": r["target_podcast"],
        "episode_id": r["target_episode"],
        "paragraph_idx": r["target_idx"],
        "score": r["score"],
        "text_preview": r["text"][:150],
        "start_time": r["start_time"],
        "episode_title": r["title"],
        "guest": r["guest"],
    } for r in rows]


@app.get("/api/semantic-search")
def semantic_search(q: str = Query(..., min_length=3), podcast_id: Optional[str] = None, limit: int = 20):
    """Semantic search using query embedding."""
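
A quick way to exercise the new endpoint is FastAPI's TestClient; a sketch, assuming the FastAPI instance app is importable from an app module (the import path is a guess):

# Test sketch for /api/similar-precomputed (import path is an assumption).
from fastapi.testclient import TestClient
from app import app

client = TestClient(app)

def test_precomputed_similar_shape():
    # Placeholder IDs; use real ones from the episodes table.
    r = client.get("/api/similar-precomputed/demo-podcast/demo-episode/0", params={"limit": 3})
    assert r.status_code == 200
    hits = r.json()
    assert len(hits) <= 3
    for hit in hits:
        assert {"podcast_id", "episode_id", "score", "text_preview"} <= hit.keys()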
backend/precompute.py (new file, 91 lines)
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""Precompute similarity links between paragraphs and store in DB."""

import os
import sys

import numpy as np

from database import get_db, get_all_embeddings, init_db

MIN_SCORE = float(os.environ.get("SIMILARITY_THRESHOLD", "0.55"))
MAX_LINKS_PER_PARA = 10


def precompute_similarities(podcast_id=None):
    """Compute the top-N similar paragraphs for each paragraph and store them in semantic_links."""
    vectors, meta = get_all_embeddings(podcast_id)
    if vectors is None or len(meta) == 0:
        print("No embeddings found.")
        return

    # Zero out NaN vectors so they cannot poison the similarity scores
    nan_mask = np.isnan(vectors).any(axis=1)
    if nan_mask.any():
        print(f"Warning: {nan_mask.sum()} NaN vectors found, zeroing them out")
        vectors[nan_mask] = 0

    n = len(meta)
    print(f"Computing similarity matrix for {n} paragraphs...")

    # Full n x n dot-product matrix; this equals cosine similarity
    # when the embeddings are unit-normalized
    sim_matrix = vectors @ vectors.T
    np.fill_diagonal(sim_matrix, 0)  # no self-links

    # For each paragraph, find the top-N similar paragraphs from OTHER episodes
    db = get_db()

    # Create table if needed
    db.execute("""
        CREATE TABLE IF NOT EXISTS semantic_links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            podcast_id TEXT NOT NULL,
            source_episode TEXT NOT NULL,
            source_idx INTEGER NOT NULL,
            target_podcast TEXT NOT NULL,
            target_episode TEXT NOT NULL,
            target_idx INTEGER NOT NULL,
            score REAL NOT NULL
        )
    """)
    db.execute("CREATE INDEX IF NOT EXISTS idx_semantic_source ON semantic_links(podcast_id, source_episode, source_idx)")

    # Clear existing links
    if podcast_id:
        db.execute("DELETE FROM semantic_links WHERE podcast_id = ?", (podcast_id,))
    else:
        db.execute("DELETE FROM semantic_links")

    total_links = 0
    for i in range(n):
        scores = sim_matrix[i]
        top_indices = np.argsort(scores)[::-1]

        links_added = 0
        for j in top_indices:
            if links_added >= MAX_LINKS_PER_PARA:
                break
            if scores[j] < MIN_SCORE:
                break
            # Skip targets from the same episode
            if meta[i]["episode_id"] == meta[j]["episode_id"] and meta[i]["podcast_id"] == meta[j]["podcast_id"]:
                continue

            db.execute(
                "INSERT INTO semantic_links (podcast_id, source_episode, source_idx, target_podcast, target_episode, target_idx, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (meta[i]["podcast_id"], meta[i]["episode_id"], meta[i]["idx"],
                 meta[j]["podcast_id"], meta[j]["episode_id"], meta[j]["idx"],
                 float(scores[j]))
            )
            links_added += 1
            total_links += 1

    db.commit()
    db.close()

    print(f"Stored {total_links} semantic links (threshold: {MIN_SCORE})")


if __name__ == "__main__":
    init_db()
    podcast_id = sys.argv[1] if len(sys.argv) > 1 else None
    precompute_similarities(podcast_id)
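
At 728 paragraphs the full matrix is harmless, but it grows as n squared. A chunked sketch of the same top-N selection, reusing MIN_SCORE and MAX_LINKS_PER_PARA from above (chunk size and over-fetch factor are arbitrary choices, not part of the commit):

# Sketch: chunked top-N similarity without materializing the full n x n matrix.
import numpy as np

def top_links_chunked(vectors, top_n=MAX_LINKS_PER_PARA, chunk_size=512):
    """Yield (source_index, [(target_index, score), ...]) chunk by chunk."""
    n = len(vectors)
    # Over-fetch so the same-episode filter (applied by the caller, as in
    # the main loop above) can still fill top_n cross-episode links
    k = min(n - 1, top_n * 3)
    for start in range(0, n, chunk_size):
        block = vectors[start:start + chunk_size] @ vectors.T  # (chunk, n) scores
        for row, scores in enumerate(block):
            i = start + row
            scores[i] = 0.0  # no self-link
            # argpartition is O(n) selection instead of a full O(n log n) sort
            cand = np.argpartition(scores, -k)[-k:]
            cand = cand[np.argsort(scores[cand])[::-1]]
            yield i, [(int(j), float(scores[j])) for j in cand if scores[j] >= MIN_SCORE]

The per-link INSERTs could then be batched per source paragraph with db.executemany instead of one execute per link.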