Phase 2: Precomputed semantic similarity + API
- precompute.py: computes the pairwise cosine similarity of all paragraphs
and stores the top 10 neighbors per paragraph in the semantic_links table
- API endpoint /api/similar-precomputed/{podcast_id}/{episode_id}/{para_idx}
returns precomputed similar passages in <1 ms
- Tested: 728 paragraphs, 7,144 links (threshold 0.55)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent b0649cea49, commit cb5978132c
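
For orientation, a minimal client call against the new endpoint (the base URL and the IDs are placeholders, not part of this commit):

# Client sketch; base URL and IDs are placeholders, not part of the commit.
import requests

BASE = "http://localhost:8000"  # assumption: local dev server

resp = requests.get(
    f"{BASE}/api/similar-precomputed/demo-podcast/demo-episode/17",
    params={"limit": 5},
)
resp.raise_for_status()
for hit in resp.json():
    print(f'{hit["score"]:.2f}  {hit["episode_title"]}  {hit["text_preview"]}')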
@@ -185,6 +185,34 @@ def find_similar(podcast_id: str, episode_id: str, para_idx: int,
    return results


@app.get("/api/similar-precomputed/{podcast_id}/{episode_id}/{para_idx}")
def get_precomputed_similar(podcast_id: str, episode_id: str, para_idx: int, limit: int = 10):
    """Get precomputed similar paragraphs (fast, no embedding computation)."""
    db = get_db()
    rows = db.execute(
        "SELECT sl.target_podcast, sl.target_episode, sl.target_idx, sl.score, "
        "p.text, p.start_time, e.title, e.guest "
        "FROM semantic_links sl "
        "JOIN paragraphs p ON sl.target_podcast = p.podcast_id AND sl.target_episode = p.episode_id AND sl.target_idx = p.idx "
        "JOIN episodes e ON sl.target_podcast = e.podcast_id AND sl.target_episode = e.id "
        "WHERE sl.podcast_id = ? AND sl.source_episode = ? AND sl.source_idx = ? "
        "ORDER BY sl.score DESC LIMIT ?",
        (podcast_id, episode_id, para_idx, limit)
    ).fetchall()
    db.close()

    return [{
        "podcast_id": r["target_podcast"],
        "episode_id": r["target_episode"],
        "paragraph_idx": r["target_idx"],
        "score": r["score"],
        "text_preview": r["text"][:150],
        "start_time": r["start_time"],
        "episode_title": r["title"],
        "guest": r["guest"],
    } for r in rows]


@app.get("/api/semantic-search")
def semantic_search(q: str = Query(..., min_length=3), podcast_id: Optional[str] = None, limit: int = 20):
    """Semantic search using query embedding."""
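
A quick way to exercise the new endpoint is FastAPI's TestClient; a sketch, assuming the FastAPI instance app is importable from an app module (the import path is a guess):

# Test sketch for /api/similar-precomputed (import path is an assumption).
from fastapi.testclient import TestClient
from app import app

client = TestClient(app)

def test_precomputed_similar_shape():
    # Placeholder IDs; use real ones from the episodes table.
    r = client.get("/api/similar-precomputed/demo-podcast/demo-episode/0", params={"limit": 3})
    assert r.status_code == 200
    hits = r.json()
    assert len(hits) <= 3
    for hit in hits:
        assert {"podcast_id", "episode_id", "score", "text_preview"} <= hit.keys()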
backend/precompute.py (new file, 91 lines)
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""Precompute similarity links between paragraphs and store in DB."""

import os
import sys

import numpy as np

from database import get_db, get_all_embeddings, init_db

MIN_SCORE = float(os.environ.get("SIMILARITY_THRESHOLD", "0.55"))
MAX_LINKS_PER_PARA = 10


def precompute_similarities(podcast_id=None):
    """Compute the top-N similar paragraphs for each paragraph and store them in semantic_links."""
    vectors, meta = get_all_embeddings(podcast_id)
    if vectors is None or len(meta) == 0:
        print("No embeddings found.")
        return

    # Zero out NaN vectors so they cannot poison the similarity scores
    nan_mask = np.isnan(vectors).any(axis=1)
    if nan_mask.any():
        print(f"Warning: {nan_mask.sum()} NaN vectors found, zeroing them out")
        vectors[nan_mask] = 0

    n = len(meta)
    print(f"Computing similarity matrix for {n} paragraphs...")

    # Full n x n dot-product matrix; this equals cosine similarity
    # when the embeddings are unit-normalized
    sim_matrix = vectors @ vectors.T
    np.fill_diagonal(sim_matrix, 0)  # no self-links

    # For each paragraph, find the top-N similar paragraphs from OTHER episodes
    db = get_db()

    # Create table if needed
    db.execute("""
        CREATE TABLE IF NOT EXISTS semantic_links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            podcast_id TEXT NOT NULL,
            source_episode TEXT NOT NULL,
            source_idx INTEGER NOT NULL,
            target_podcast TEXT NOT NULL,
            target_episode TEXT NOT NULL,
            target_idx INTEGER NOT NULL,
            score REAL NOT NULL
        )
    """)
    db.execute("CREATE INDEX IF NOT EXISTS idx_semantic_source ON semantic_links(podcast_id, source_episode, source_idx)")

    # Clear existing links
    if podcast_id:
        db.execute("DELETE FROM semantic_links WHERE podcast_id = ?", (podcast_id,))
    else:
        db.execute("DELETE FROM semantic_links")

    total_links = 0
    for i in range(n):
        scores = sim_matrix[i]
        top_indices = np.argsort(scores)[::-1]

        links_added = 0
        for j in top_indices:
            if links_added >= MAX_LINKS_PER_PARA:
                break
            if scores[j] < MIN_SCORE:
                break
            # Skip targets from the same episode
            if meta[i]["episode_id"] == meta[j]["episode_id"] and meta[i]["podcast_id"] == meta[j]["podcast_id"]:
                continue

            db.execute(
                "INSERT INTO semantic_links (podcast_id, source_episode, source_idx, target_podcast, target_episode, target_idx, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (meta[i]["podcast_id"], meta[i]["episode_id"], meta[i]["idx"],
                 meta[j]["podcast_id"], meta[j]["episode_id"], meta[j]["idx"],
                 float(scores[j]))
            )
            links_added += 1
            total_links += 1

    db.commit()
    db.close()

    print(f"Stored {total_links} semantic links (threshold: {MIN_SCORE})")


if __name__ == "__main__":
    init_db()
    podcast_id = sys.argv[1] if len(sys.argv) > 1 else None
    precompute_similarities(podcast_id)
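
At 728 paragraphs the full matrix is harmless, but it grows as n squared. A chunked sketch of the same top-N selection, reusing MIN_SCORE and MAX_LINKS_PER_PARA from above (chunk size and over-fetch factor are arbitrary choices, not part of the commit):

# Sketch: chunked top-N similarity without materializing the full n x n matrix.
import numpy as np

def top_links_chunked(vectors, top_n=MAX_LINKS_PER_PARA, chunk_size=512):
    """Yield (source_index, [(target_index, score), ...]) chunk by chunk."""
    n = len(vectors)
    # Over-fetch so the same-episode filter (applied by the caller, as in
    # the main loop above) can still fill top_n cross-episode links
    k = min(n - 1, top_n * 3)
    for start in range(0, n, chunk_size):
        block = vectors[start:start + chunk_size] @ vectors.T  # (chunk, n) scores
        for row, scores in enumerate(block):
            i = start + row
            scores[i] = 0.0  # no self-link
            # argpartition is O(n) selection instead of a full O(n log n) sort
            cand = np.argpartition(scores, -k)[-k:]
            cand = cand[np.argsort(scores[cand])[::-1]]
            yield i, [(int(j), float(scores[j])) for j in cand if scores[j] >= MIN_SCORE]

The per-link INSERTs could then be batched per source paragraph with db.executemany instead of one execute per link.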