#13/#16/#17/#18 Qwen analysis scripts

- Frontend: word-level highlighting in the transcript: each word rendered as a <span> with a timestamp, karaoke-style sync during playback, CSS classes word-active/word-spoken (see the sketch after this list)
- API: /api/.../words endpoint returns the word timestamps
- #14 detect_gaps.py: k-means clustering over 3727 embeddings, identifies gaps (topics missing from one podcast). Output: gaps_analysis.json
- #15 detect_narrative_shift.py: embedding drift per topic across the episode sequence, detects framing shifts. Output: narrative_shifts.json
- #13 analyse_arguments.py: Qwen classifies logical relations (extends, contradicts, supports, qualifies) between semantically similar paragraphs
- #16 extract_claims.py: Qwen extracts verifiable claims (numbers, statistics)
- #17 extract_questions.py: Qwen extracts and classifies questions
- #18 curate_debates.py: Qwen curates cross-podcast juxtapositions
- run_all_qwen.sh: sequential pipeline for all Qwen tasks (avoids DB locks)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
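A minimal sketch (in Python, for illustration only) of the lookup behind the karaoke-style sync: given the word timestamps from the words endpoint, a binary search over the start times finds the word that should currently carry the word-active class. The field names text/start/end and the function name are assumptions, not the endpoint's documented schema, and the real client code is JavaScript.

# Illustrative only: assumes each word is {"text": ..., "start": ..., "end": ...}.
from bisect import bisect_right

def active_word_index(words, t):
    """Index of the word spoken at playback time t (seconds), or -1 if none."""
    starts = [w["start"] for w in words]
    i = bisect_right(starts, t) - 1       # last word whose start is <= t
    if i >= 0 and t < words[i]["end"]:    # still inside that word?
        return i                          # -> this word gets class word-active
    return -1                             # no word active at time t

words = [{"text": "Hallo", "start": 0.0, "end": 0.4},
         {"text": "Welt", "start": 0.45, "end": 0.8}]
print(active_word_index(words, 0.5))      # 1 -> "Welt" is highlighted
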
185 lines · 6.1 KiB · Python
#!/usr/bin/env python3
"""#14 Gap detector: embedding cluster analysis to identify discourse gaps.

Clusters all paragraphs, measures density per podcast,
and identifies asymmetric and empty clusters.

Usage:
    python3 detect_gaps.py [db-path] [output-json]
"""

import json
import sys
import sqlite3
import numpy as np

DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "data/gaps_analysis.json"

N_CLUSTERS = 30  # finer resolution


def load_embeddings(db_path):
    db = sqlite3.connect(db_path)
    db.row_factory = sqlite3.Row
    rows = db.execute(
        "SELECT p.id, p.podcast_id, p.episode_id, p.idx, p.text, p.embedding, e.title, e.guest "
        "FROM paragraphs p JOIN episodes e ON p.podcast_id = e.podcast_id AND p.episode_id = e.id "
        "WHERE p.embedding IS NOT NULL"
    ).fetchall()
    db.close()

    meta = []
    vectors = []
    for r in rows:
        meta.append({
            "id": r["id"], "podcast_id": r["podcast_id"],
            "episode_id": r["episode_id"], "idx": r["idx"],
            "text": r["text"][:200], "title": r["title"], "guest": r["guest"]
        })
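        # The embedding column stores the raw float32 bytes of each vector;
        # np.frombuffer reinterprets the BLOB directly, without copying.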
        vectors.append(np.frombuffer(r["embedding"], dtype=np.float32))

    return np.array(vectors), meta


def kmeans_simple(vectors, k, max_iter=50):
    """Simple k-means without an sklearn dependency."""
    n = len(vectors)
    # Init: random selection
    rng = np.random.default_rng(42)
    indices = rng.choice(n, k, replace=False)
    centroids = vectors[indices].copy()

    labels = np.zeros(n, dtype=int)

    for _ in range(max_iter):
        # Assign
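        # Broadcasting builds an (n, k, dim) difference array; the norm over
        # axis 2 collapses it to an (n, k) distance matrix.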
        dists = np.linalg.norm(vectors[:, None] - centroids[None, :], axis=2)
        new_labels = np.argmin(dists, axis=1)

        if np.all(new_labels == labels):
            break
        labels = new_labels

        # Update centroids
        for j in range(k):
            mask = labels == j
            if mask.sum() > 0:
                centroids[j] = vectors[mask].mean(axis=0)

    return labels, centroids


def main():
    print("Loading embeddings…")
    vectors, meta = load_embeddings(DB_PATH)
    print(f"  {len(vectors)} paragraphs loaded.")

    # Normalize
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
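    # Guard: an all-zero embedding would otherwise cause a division by zero.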
    norms[norms == 0] = 1
    vectors_norm = vectors / norms

    print(f"Clustering into {N_CLUSTERS} groups…")
    labels, centroids = kmeans_simple(vectors_norm, N_CLUSTERS)

    # Analyze clusters
    podcasts = sorted(set(m["podcast_id"] for m in meta))
    clusters = []

    for c in range(N_CLUSTERS):
        mask = labels == c
        indices = np.where(mask)[0]
        members = [meta[i] for i in indices]

        # Count per podcast
        per_podcast = {p: 0 for p in podcasts}
        for m in members:
            per_podcast[m["podcast_id"]] += 1

        # Representative texts (closest to centroid)
        if len(indices) > 0:
            dists = np.linalg.norm(vectors_norm[indices] - centroids[c], axis=1)
            top_indices = indices[np.argsort(dists)[:5]]
            representative = [{"text": meta[i]["text"], "episode": meta[i]["episode_id"],
                               "podcast": meta[i]["podcast_id"], "guest": meta[i]["guest"]}
                              for i in top_indices]
        else:
            representative = []

        # Derive a topic label from member texts
        words = " ".join(m["text"][:100] for m in members[:20]).lower().split()
        # Simple word frequency (excluding common German stopwords)
        stop = {"der", "die", "das", "und", "in", "von", "zu", "den", "ist", "ein", "eine", "es", "mit",
                "auf", "für", "an", "sich", "nicht", "auch", "dass", "wir", "man", "aber", "des", "dem",
                "werden", "oder", "als", "wie", "hat", "ich", "sind", "was", "so", "haben", "dann",
                "wenn", "noch", "schon", "kann", "wird", "hier", "über", "nach", "nur", "bei", "da",
                "diese", "dieser", "dieses", "einem", "einer", "also", "ja", "mal", "war", "sehr",
                "gibt", "aus", "zum", "zur", "mehr", "immer", "weil", "uns", "sie", "er", "vom"}
        word_freq = {}
        for w in words:
            w = w.strip(".,;:!?\"'()[]")
            if len(w) > 3 and w not in stop:
                word_freq[w] = word_freq.get(w, 0) + 1
        top_words = sorted(word_freq.items(), key=lambda x: -x[1])[:5]
        label = ", ".join(w for w, _ in top_words) if top_words else f"Cluster {c}"

        clusters.append({
            "id": c,
            "label": label,
            "size": int(mask.sum()),
            "per_podcast": per_podcast,
            "representative": representative,
        })

    # Sort by size
    clusters.sort(key=lambda x: -x["size"])

    # Identify gaps
    gaps = []
    for cl in clusters:
        total = cl["size"]
        if total < 3:
            continue

        for p in podcasts:
            count = cl["per_podcast"].get(p, 0)
            other_total = total - count
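            # A "gap": the other podcasts contribute more than 10 paragraphs to
            # this cluster while this one contributes at most 2.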
            if other_total > 10 and count <= 2:
                gaps.append({
                    "cluster_label": cl["label"],
                    "cluster_size": total,
                    "missing_in": p,
                    "present_in_count": other_total,
                    "representative": cl["representative"][:3],
                })

    # Sort gaps by how asymmetric they are
    gaps.sort(key=lambda x: -x["present_in_count"])

    result = {
        "total_paragraphs": len(meta),
        "podcasts": podcasts,
        "n_clusters": N_CLUSTERS,
        "clusters": clusters,
        "gaps": gaps[:30],
    }

    with open(OUTPUT, "w") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{len(clusters)} clusters, {len(gaps)} gaps identified.")
    print("\nTop gaps:")
    for g in gaps[:10]:
        print(f"  [{g['missing_in']}] missing: \"{g['cluster_label']}\" ({g['present_in_count']} paragraphs in the other podcast)")

    print("\nCluster sizes:")
    for cl in clusters[:15]:
        bar = " | ".join(f"{p}:{cl['per_podcast'][p]}" for p in podcasts)
        print(f"  {cl['label'][:40]:40s} ({cl['size']:4d}) — {bar}")

    print(f"\nResult: {OUTPUT}")


if __name__ == "__main__":
    main()