#13/#16/#17/#18 Qwen analysis scripts

- Frontend: word-level highlighting in the transcript: each word rendered as a <span> with a timestamp, karaoke-style sync during playback, CSS classes word-active/word-spoken (see the sketch after this list)
- API: /api/.../words endpoint returns the word timestamps
- #14 detect_gaps.py: k-means clustering over 3727 embeddings, identifies gaps (topics missing from one podcast). Output: gaps_analysis.json
- #15 detect_narrative_shift.py: embedding drift per topic across the episode sequence, detects framing shifts. Output: narrative_shifts.json
- #13 analyse_arguments.py: Qwen classifies logical relations (extends, contradicts, supports, qualifies) between semantically similar paragraphs
- #16 extract_claims.py: Qwen extracts verifiable claims (numbers, statistics)
- #17 extract_questions.py: Qwen extracts and classifies questions
- #18 curate_debates.py: Qwen curates cross-podcast juxtapositions
- run_all_qwen.sh: sequential pipeline for all Qwen tasks (avoids DB locks)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
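A minimal sketch (in Python, for illustration only) of the lookup behind the karaoke-style sync: given the word timestamps from the words endpoint, a binary search over the start times finds the word that should currently carry the word-active class. The field names text/start/end and the function name are assumptions, not the endpoint's documented schema, and the real client code is JavaScript.

# Illustrative only: assumes each word is {"text": ..., "start": ..., "end": ...}.
from bisect import bisect_right

def active_word_index(words, t):
    """Index of the word spoken at playback time t (seconds), or -1 if none."""
    starts = [w["start"] for w in words]
    i = bisect_right(starts, t) - 1       # last word whose start is <= t
    if i >= 0 and t < words[i]["end"]:    # still inside that word?
        return i                          # -> this word gets class word-active
    return -1                             # no word active at time t

words = [{"text": "Hallo", "start": 0.0, "end": 0.4},
         {"text": "Welt", "start": 0.45, "end": 0.8}]
print(active_word_index(words, 0.5))      # 1 -> "Welt" is highlighted
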
185 lines · 6.1 KiB · Python
#!/usr/bin/env python3
"""#14 Gap detector: embedding cluster analysis to identify discourse gaps.

Clusters all paragraphs, measures density per podcast,
and identifies asymmetric and empty clusters.

Usage:
    python3 detect_gaps.py [db-path] [output-json]
"""

import json
import sys
import sqlite3
import numpy as np

DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "data/gaps_analysis.json"

N_CLUSTERS = 30  # finer resolution


def load_embeddings(db_path):
    db = sqlite3.connect(db_path)
    db.row_factory = sqlite3.Row
    rows = db.execute(
        "SELECT p.id, p.podcast_id, p.episode_id, p.idx, p.text, p.embedding, e.title, e.guest "
        "FROM paragraphs p JOIN episodes e ON p.podcast_id = e.podcast_id AND p.episode_id = e.id "
        "WHERE p.embedding IS NOT NULL"
    ).fetchall()
    db.close()

    meta = []
    vectors = []
    for r in rows:
        meta.append({
            "id": r["id"], "podcast_id": r["podcast_id"],
            "episode_id": r["episode_id"], "idx": r["idx"],
            "text": r["text"][:200], "title": r["title"], "guest": r["guest"]
        })
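        # The embedding column stores the raw float32 bytes of each vector;
        # np.frombuffer reinterprets the BLOB directly, without copying.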
        vectors.append(np.frombuffer(r["embedding"], dtype=np.float32))

    return np.array(vectors), meta


def kmeans_simple(vectors, k, max_iter=50):
    """Simple k-means without an sklearn dependency."""
    n = len(vectors)
    # Init: random selection
    rng = np.random.default_rng(42)
    indices = rng.choice(n, k, replace=False)
    centroids = vectors[indices].copy()

    labels = np.zeros(n, dtype=int)

    for _ in range(max_iter):
        # Assign
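        # Broadcasting builds an (n, k, dim) difference array; the norm over
        # axis 2 collapses it to an (n, k) distance matrix.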
        dists = np.linalg.norm(vectors[:, None] - centroids[None, :], axis=2)
        new_labels = np.argmin(dists, axis=1)

        if np.all(new_labels == labels):
            break
        labels = new_labels

        # Update centroids
        for j in range(k):
            mask = labels == j
            if mask.sum() > 0:
                centroids[j] = vectors[mask].mean(axis=0)

    return labels, centroids


def main():
    print("Loading embeddings…")
    vectors, meta = load_embeddings(DB_PATH)
    print(f"  {len(vectors)} paragraphs loaded.")

    # Normalize
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
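    # Guard: an all-zero embedding would otherwise cause a division by zero.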
    norms[norms == 0] = 1
    vectors_norm = vectors / norms

    print(f"Clustering into {N_CLUSTERS} groups…")
    labels, centroids = kmeans_simple(vectors_norm, N_CLUSTERS)

    # Analyze clusters
    podcasts = sorted(set(m["podcast_id"] for m in meta))
    clusters = []

    for c in range(N_CLUSTERS):
        mask = labels == c
        indices = np.where(mask)[0]
        members = [meta[i] for i in indices]

        # Count per podcast
        per_podcast = {p: 0 for p in podcasts}
        for m in members:
            per_podcast[m["podcast_id"]] += 1

        # Representative texts (closest to centroid)
        if len(indices) > 0:
            dists = np.linalg.norm(vectors_norm[indices] - centroids[c], axis=1)
            top_indices = indices[np.argsort(dists)[:5]]
            representative = [{"text": meta[i]["text"], "episode": meta[i]["episode_id"],
                               "podcast": meta[i]["podcast_id"], "guest": meta[i]["guest"]}
                              for i in top_indices]
        else:
            representative = []

        # Derive a topic label from member texts
        words = " ".join(m["text"][:100] for m in members[:20]).lower().split()
        # Simple word frequency (excluding common German stopwords)
        stop = {"der", "die", "das", "und", "in", "von", "zu", "den", "ist", "ein", "eine", "es", "mit",
                "auf", "für", "an", "sich", "nicht", "auch", "dass", "wir", "man", "aber", "des", "dem",
                "werden", "oder", "als", "wie", "hat", "ich", "sind", "was", "so", "haben", "dann",
                "wenn", "noch", "schon", "kann", "wird", "hier", "über", "nach", "nur", "bei", "da",
                "diese", "dieser", "dieses", "einem", "einer", "also", "ja", "mal", "war", "sehr",
                "gibt", "aus", "zum", "zur", "mehr", "immer", "weil", "uns", "sie", "er", "vom"}
        word_freq = {}
        for w in words:
            w = w.strip(".,;:!?\"'()[]")
            if len(w) > 3 and w not in stop:
                word_freq[w] = word_freq.get(w, 0) + 1
        top_words = sorted(word_freq.items(), key=lambda x: -x[1])[:5]
        label = ", ".join(w for w, _ in top_words) if top_words else f"Cluster {c}"

        clusters.append({
            "id": c,
            "label": label,
            "size": int(mask.sum()),
            "per_podcast": per_podcast,
            "representative": representative,
        })

    # Sort by size
    clusters.sort(key=lambda x: -x["size"])

    # Identify gaps
    gaps = []
    for cl in clusters:
        total = cl["size"]
        if total < 3:
            continue

        for p in podcasts:
            count = cl["per_podcast"].get(p, 0)
            other_total = total - count
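            # A "gap": the other podcasts contribute more than 10 paragraphs to
            # this cluster while this one contributes at most 2.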
            if other_total > 10 and count <= 2:
                gaps.append({
                    "cluster_label": cl["label"],
                    "cluster_size": total,
                    "missing_in": p,
                    "present_in_count": other_total,
                    "representative": cl["representative"][:3],
                })

    # Sort gaps by how asymmetric they are
    gaps.sort(key=lambda x: -x["present_in_count"])

    result = {
        "total_paragraphs": len(meta),
        "podcasts": podcasts,
        "n_clusters": N_CLUSTERS,
        "clusters": clusters,
        "gaps": gaps[:30],
    }

    with open(OUTPUT, "w") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{len(clusters)} clusters, {len(gaps)} gaps identified.")
    print("\nTop gaps:")
    for g in gaps[:10]:
        print(f"  [{g['missing_in']}] missing: \"{g['cluster_label']}\" ({g['present_in_count']} paragraphs in the other podcast)")

    print("\nCluster sizes:")
    for cl in clusters[:15]:
        bar = " | ".join(f"{p}:{cl['per_podcast'][p]}" for p in podcasts)
        print(f"  {cl['label'][:40]:40s} ({cl['size']:4d}) — {bar}")

    print(f"\nResult: {OUTPUT}")


if __name__ == "__main__":
    main()