# podcast-mindmap/backend/embeddings.py
"""Generate embeddings via DashScope (Qwen text-embedding-v3)."""
import os
import time
from openai import OpenAI
from database import get_db, store_embedding
# DashScope credentials and model selection, injected via environment variables.
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-v3")
# Texts per embeddings request. The API accepts up to 10 per call, but we stay
# lower so batches of long (truncated-to-2000-char) paragraphs fit comfortably.
BATCH_SIZE = 6
def get_client() -> OpenAI:
    """Build an OpenAI-compatible client pointed at DashScope's intl endpoint."""
    client = OpenAI(
        base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
        api_key=DASHSCOPE_API_KEY,
    )
    return client
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Return one 1024-dimensional embedding vector per input text."""
    response = get_client().embeddings.create(
        model=EMBEDDING_MODEL,
        input=texts,
        dimensions=1024,
    )
    return [entry.embedding for entry in response.data]
def embed_all_paragraphs(podcast_id: str | None = None) -> None:
    """Embed all paragraphs that don't have embeddings yet.

    Fetches un-embedded paragraph rows (optionally scoped to one podcast),
    embeds them in batches of BATCH_SIZE, and persists each vector via
    store_embedding. Failed batches are logged and skipped (best-effort).

    Args:
        podcast_id: Restrict work to this podcast's paragraphs; None means all.
    """
    db = get_db()
    try:
        if podcast_id:
            rows = db.execute(
                "SELECT id, text FROM paragraphs WHERE podcast_id = ? AND embedding IS NULL",
                (podcast_id,),
            ).fetchall()
        else:
            rows = db.execute(
                "SELECT id, text FROM paragraphs WHERE embedding IS NULL"
            ).fetchall()
    finally:
        # Close even if the query raises — the original leaked the handle on error.
        db.close()
    if not rows:
        print("No paragraphs to embed.")
        return
    # Hoist the invariant batch count out of the loop.
    total_batches = (len(rows) + BATCH_SIZE - 1) // BATCH_SIZE
    print(f"Embedding {len(rows)} paragraphs...")
    for i in range(0, len(rows), BATCH_SIZE):
        batch = rows[i:i + BATCH_SIZE]
        # Truncate very long paragraphs so the request stays within API limits.
        texts = [r["text"][:2000] for r in batch]
        try:
            embeddings = embed_texts(texts)
            for row, emb in zip(batch, embeddings):
                store_embedding(row["id"], emb)
            print(f" Batch {i // BATCH_SIZE + 1}/{total_batches}: {len(batch)} paragraphs")
        except Exception as e:
            # Deliberate best-effort: log, back off briefly, move to next batch.
            print(f" Error at batch {i // BATCH_SIZE + 1}: {e}")
            time.sleep(2)
            continue
    print("Done.")
if __name__ == "__main__":
    import sys

    # Optional CLI arg: limit embedding to a single podcast.
    target = sys.argv[1] if len(sys.argv) > 1 else None
    embed_all_paragraphs(target)