From 78d66bef21352d3fa904adbf8271e8fefb29c4ff Mon Sep 17 00:00:00 2001
From: Dotty Dotter
Date: Thu, 23 Apr 2026 22:29:41 +0200
Subject: [PATCH] #12 Wort-Highlighting Frontend, #14 Leerstellen-Detektor,
 #15 Narrative Shift, #13/#16/#17/#18 Qwen-Analyse-Scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Frontend: Wort-Level-Highlighting im Transkript — jedes Wort als eigenes
  <span> mit Timestamp, Karaoke-Style Sync bei Wiedergabe, CSS word-active/word-spoken
- API: /api/.../words Endpoint liefert Wort-Timestamps
- #14 detect_gaps.py: K-Means-Clustering über 3727 Embeddings, identifiziert
  Leerstellen (Themen, die in einem Podcast fehlen). Ergebnis: gaps_analysis.json
- #15 detect_narrative_shift.py: Embedding-Drift pro Thema über Episodenfolge,
  erkennt Framing-Wechsel. Ergebnis: narrative_shifts.json
- #13 analyse_arguments.py: Qwen klassifiziert logische Relationen (erweitert,
  widerspricht, belegt, relativiert) zwischen semantisch ähnlichen Absätzen
- #16 extract_claims.py: Qwen extrahiert prüfbare Behauptungen (Zahlen, Statistiken)
- #17 extract_questions.py: Qwen extrahiert und klassifiziert Fragen
- #18 curate_debates.py: Qwen kuratiert Cross-Podcast-Gegenüberstellungen
- run_all_qwen.sh: Sequentielle Pipeline für alle Qwen-Tasks (vermeidet DB-Locks)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 scripts/analyse_arguments.py      | 166 ++++++++++++++++++++++
 scripts/curate_debates.py         | 154 +++++++++++++++++++
 scripts/detect_gaps.py            | 184 ++++++++++++++++++++++++
 scripts/detect_narrative_shift.py | 167 +++++++++++++++++++++
 scripts/extract_claims.py         | 140 ++++++++++++++++
 scripts/extract_questions.py      | 143 +++++++++++++++++
 scripts/run_all_qwen.sh           |  35 +++++
 webapp/index.html                 |  86 +++++++++--
 8 files changed, 1065 insertions(+), 10 deletions(-)
 create mode 100644 scripts/analyse_arguments.py
 create mode 100644 scripts/curate_debates.py
 create mode 100644 scripts/detect_gaps.py
 create mode 100644 scripts/detect_narrative_shift.py
 create mode 100644 scripts/extract_claims.py
 create mode 100644 scripts/extract_questions.py
 create mode 100755 scripts/run_all_qwen.sh

diff --git a/scripts/analyse_arguments.py b/scripts/analyse_arguments.py
new file mode 100644
index 0000000..6c7c941
--- /dev/null
+++ b/scripts/analyse_arguments.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""#13 Argumentketten-Tracker: Klassifiziere logische Relationen zwischen semantisch ähnlichen Absätzen.
+
+Nimmt die Top-N semantic_links und lässt Qwen die Relation klassifizieren:
+erweitert, widerspricht, belegt, relativiert, gleicher_punkt, kein_bezug.
+
+Nutzung:
+    DASHSCOPE_API_KEY=... python3 analyse_arguments.py [db-pfad] [limit]
+"""
+
+import json
+import os
+import sys
+import time
+import sqlite3
+
+from openai import OpenAI
+
+DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
+LIMIT = int(sys.argv[2]) if len(sys.argv) > 2 else 500
+
+API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
+BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
+MODEL = "qwen-plus"
+
+SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst zwei Textabschnitte aus Podcast-Transkripten.
+Klassifiziere die logische Relation zwischen ihnen.
Antworte NUR mit einem JSON-Objekt: + +{"relation": "...", "confidence": 0.0-1.0, "explanation": "Ein Satz Begründung"} + +Mögliche Relationen: +- "erweitert": B baut auf A auf, ergänzt, vertieft +- "widerspricht": B widerspricht A, nennt Gegenargument +- "belegt": B liefert Evidenz/Daten für A's These +- "relativiert": B schränkt A ein, nennt Ausnahmen/Bedingungen +- "gleicher_punkt": A und B sagen im Kern dasselbe +- "kein_bezug": Trotz thematischer Nähe kein logischer Bezug""" + + +def classify_pair(client, text_a, meta_a, text_b, meta_b): + user_msg = f"""Absatz A ({meta_a}): +"{text_a}" + +Absatz B ({meta_b}): +"{text_b}" + +Welche logische Relation besteht von A zu B?""" + + try: + resp = client.chat.completions.create( + model=MODEL, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_msg}, + ], + temperature=0.1, + max_tokens=150, + ) + content = resp.choices[0].message.content.strip() + # Parse JSON from response + if content.startswith("```"): + content = content.split("```")[1].strip() + if content.startswith("json"): + content = content[4:].strip() + return json.loads(content) + except Exception as e: + return {"relation": "error", "confidence": 0, "explanation": str(e)} + + +def main(): + if not API_KEY: + print("DASHSCOPE_API_KEY nicht gesetzt.") + sys.exit(1) + + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + db = sqlite3.connect(DB_PATH) + db.row_factory = sqlite3.Row + + # Create output table + db.executescript(""" + CREATE TABLE IF NOT EXISTS argument_links ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_podcast TEXT, source_episode TEXT, source_idx INTEGER, + target_podcast TEXT, target_episode TEXT, target_idx INTEGER, + relation TEXT, confidence REAL, explanation TEXT, score REAL + ); + CREATE INDEX IF NOT EXISTS idx_arglinks ON argument_links(relation); + """) + + # Get top semantic links (cross-episode, prefer cross-podcast) + rows = db.execute(""" + SELECT sl.podcast_id, sl.source_episode, sl.source_idx, + sl.target_podcast, sl.target_episode, sl.target_idx, sl.score, + p1.text as source_text, p2.text as target_text, + e1.title as source_title, e1.guest as source_guest, + e2.title as target_title, e2.guest as target_guest + FROM semantic_links sl + JOIN paragraphs p1 ON sl.podcast_id = p1.podcast_id AND sl.source_episode = p1.episode_id AND sl.source_idx = p1.idx + JOIN paragraphs p2 ON sl.target_podcast = p2.podcast_id AND sl.target_episode = p2.episode_id AND sl.target_idx = p2.idx + JOIN episodes e1 ON sl.podcast_id = e1.podcast_id AND sl.source_episode = e1.id + JOIN episodes e2 ON sl.target_podcast = e2.podcast_id AND sl.target_episode = e2.id + WHERE sl.source_episode != sl.target_episode + ORDER BY sl.score DESC + LIMIT ? 
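+        -- Kandidaten: nur episodenübergreifende Paare; ORDER BY score holt die
+        -- semantisch engsten Paare nach oben, bevor Qwen klassifiziert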
+ """, (LIMIT,)).fetchall() + + print(f"Klassifiziere {len(rows)} Paare mit {MODEL}…") + + # Check already processed + existing = set() + try: + for r in db.execute("SELECT source_podcast||source_episode||source_idx||target_podcast||target_episode||target_idx as k FROM argument_links").fetchall(): + existing.add(r["k"]) + except Exception: + pass + + processed = 0 + skipped = 0 + + for i, row in enumerate(rows): + key = f"{row['podcast_id']}{row['source_episode']}{row['source_idx']}{row['target_podcast']}{row['target_episode']}{row['target_idx']}" + if key in existing: + skipped += 1 + continue + + meta_a = f"{row['source_episode']}: {row['source_title']} — {row['source_guest']}" + meta_b = f"{row['target_episode']}: {row['target_title']} — {row['target_guest']}" + + result = classify_pair( + client, + row["source_text"][:800], meta_a, + row["target_text"][:800], meta_b + ) + + db.execute( + "INSERT INTO argument_links (source_podcast, source_episode, source_idx, " + "target_podcast, target_episode, target_idx, relation, confidence, explanation, score) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + (row["podcast_id"], row["source_episode"], row["source_idx"], + row["target_podcast"], row["target_episode"], row["target_idx"], + result.get("relation", "error"), result.get("confidence", 0), + result.get("explanation", ""), row["score"]) + ) + + processed += 1 + if processed % 10 == 0: + db.commit() + print(f" {processed}/{len(rows) - skipped} klassifiziert…") + + # Rate limiting + time.sleep(0.3) + + db.commit() + + # Stats + stats = db.execute("SELECT relation, COUNT(*) as c FROM argument_links GROUP BY relation ORDER BY c DESC").fetchall() + print(f"\nFertig: {processed} neue, {skipped} übersprungen.") + print("Verteilung:") + for s in stats: + print(f" {s['relation']}: {s['c']}") + + db.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/curate_debates.py b/scripts/curate_debates.py new file mode 100644 index 0000000..85c30e6 --- /dev/null +++ b/scripts/curate_debates.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +"""#18 Cross-Podcast-Debatte: Kuratiere Gegenüberstellungen zu gemeinsamen Themen. + +Nimmt die stärksten Cross-Podcast-Paare und lässt Qwen Übereinstimmungen/Divergenzen zusammenfassen. + +Nutzung: + DASHSCOPE_API_KEY=... python3 curate_debates.py [db-pfad] [limit] +""" + +import json +import os +import sys +import time +import sqlite3 + +from openai import OpenAI + +DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite" +LIMIT = int(sys.argv[2]) if len(sys.argv) > 2 else 100 +API_KEY = os.environ.get("DASHSCOPE_API_KEY", "") +BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" +MODEL = "qwen-plus" + +SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst zwei Textabschnitte aus VERSCHIEDENEN Podcasts, die dasselbe Thema behandeln. + +Erstelle eine kurze Gegenüberstellung. Antworte NUR mit JSON: + +{ + "topic": "Das gemeinsame Thema in 3-5 Wörtern", + "agreement": "Worin stimmen beide überein? (1-2 Sätze)", + "divergence": "Worin unterscheiden sie sich? (1-2 Sätze, oder 'keine wesentliche Divergenz')", + "insight": "Was lernt man durch die Gegenüberstellung, das man aus keinem der beiden allein lernen würde? 
(1 Satz)" +}""" + + +def curate_pair(client, text_a, meta_a, text_b, meta_b): + user_msg = f"""Podcast A — {meta_a}: +"{text_a}" + +Podcast B — {meta_b}: +"{text_b}" + +Erstelle die Gegenüberstellung.""" + + try: + resp = client.chat.completions.create( + model=MODEL, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_msg}, + ], + temperature=0.2, + max_tokens=300, + ) + content = resp.choices[0].message.content.strip() + if content.startswith("```"): + content = content.split("```")[1].strip() + if content.startswith("json"): + content = content[4:].strip() + return json.loads(content) + except Exception as e: + return {"topic": "error", "agreement": "", "divergence": "", "insight": str(e)} + + +def main(): + if not API_KEY: + print("DASHSCOPE_API_KEY nicht gesetzt.") + sys.exit(1) + + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + db = sqlite3.connect(DB_PATH) + db.row_factory = sqlite3.Row + + db.executescript(""" + CREATE TABLE IF NOT EXISTS debates ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + topic TEXT, + source_podcast TEXT, source_episode TEXT, source_idx INTEGER, + target_podcast TEXT, target_episode TEXT, target_idx INTEGER, + agreement TEXT, divergence TEXT, insight TEXT, score REAL + ); + CREATE INDEX IF NOT EXISTS idx_debates_topic ON debates(topic); + """) + + # Get strongest cross-podcast links + rows = db.execute(""" + SELECT sl.podcast_id, sl.source_episode, sl.source_idx, + sl.target_podcast, sl.target_episode, sl.target_idx, sl.score, + p1.text as source_text, p2.text as target_text, + pc1.name as source_podcast_name, pc2.name as target_podcast_name, + e1.title as source_title, e1.guest as source_guest, + e2.title as target_title, e2.guest as target_guest + FROM semantic_links sl + JOIN paragraphs p1 ON sl.podcast_id = p1.podcast_id AND sl.source_episode = p1.episode_id AND sl.source_idx = p1.idx + JOIN paragraphs p2 ON sl.target_podcast = p2.podcast_id AND sl.target_episode = p2.episode_id AND sl.target_idx = p2.idx + JOIN episodes e1 ON sl.podcast_id = e1.podcast_id AND sl.source_episode = e1.id + JOIN episodes e2 ON sl.target_podcast = e2.podcast_id AND sl.target_episode = e2.id + JOIN podcasts pc1 ON sl.podcast_id = pc1.id + JOIN podcasts pc2 ON sl.target_podcast = pc2.id + WHERE sl.podcast_id != sl.target_podcast + ORDER BY sl.score DESC + LIMIT ? 
+ """, (LIMIT,)).fetchall() + + print(f"Kuratiere {len(rows)} Cross-Podcast-Debatten mit {MODEL}…") + + existing = set() + try: + for r in db.execute("SELECT source_podcast||source_episode||source_idx||target_podcast||target_episode||target_idx as k FROM debates").fetchall(): + existing.add(r["k"]) + except Exception: + pass + + processed = 0 + for i, row in enumerate(rows): + key = f"{row['podcast_id']}{row['source_episode']}{row['source_idx']}{row['target_podcast']}{row['target_episode']}{row['target_idx']}" + if key in existing: + continue + + meta_a = f"{row['source_podcast_name']} / {row['source_episode']}: {row['source_title']} ({row['source_guest']})" + meta_b = f"{row['target_podcast_name']} / {row['target_episode']}: {row['target_title']} ({row['target_guest']})" + + result = curate_pair(client, row["source_text"][:800], meta_a, row["target_text"][:800], meta_b) + + db.execute( + "INSERT INTO debates (topic, source_podcast, source_episode, source_idx, " + "target_podcast, target_episode, target_idx, agreement, divergence, insight, score) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + (result.get("topic", ""), row["podcast_id"], row["source_episode"], row["source_idx"], + row["target_podcast"], row["target_episode"], row["target_idx"], + result.get("agreement", ""), result.get("divergence", ""), + result.get("insight", ""), row["score"]) + ) + + processed += 1 + if processed % 10 == 0: + db.commit() + print(f" {processed} kuratiert…") + + time.sleep(0.3) + + db.commit() + + topics = db.execute("SELECT topic, COUNT(*) as c FROM debates GROUP BY topic ORDER BY c DESC LIMIT 20").fetchall() + print(f"\nFertig: {processed} Debatten kuratiert.") + print("Top-Themen:") + for t in topics: + print(f" {t['topic']}: {t['c']}") + + db.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/detect_gaps.py b/scripts/detect_gaps.py new file mode 100644 index 0000000..b7f8279 --- /dev/null +++ b/scripts/detect_gaps.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +"""#14 Leerstellen-Detektor: Embedding-Cluster-Analyse zur Identifikation von Diskurslücken. + +Bildet Cluster über alle Paragraphen, misst Dichte pro Podcast, +identifiziert asymmetrische und leere Cluster. 
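+Leerstelle heißt hier: ein Cluster stellt in den übrigen Podcasts zusammen mehr
+als zehn Absätze, im betroffenen Podcast aber höchstens zwei (Kriterium in main()).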
+ +Nutzung: + python3 detect_gaps.py [db-pfad] [output-json] +""" + +import json +import sys +import sqlite3 +import numpy as np + +DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite" +OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "data/gaps_analysis.json" + +N_CLUSTERS = 30 # Feinere Auflösung + + +def load_embeddings(db_path): + db = sqlite3.connect(db_path) + db.row_factory = sqlite3.Row + rows = db.execute( + "SELECT p.id, p.podcast_id, p.episode_id, p.idx, p.text, p.embedding, e.title, e.guest " + "FROM paragraphs p JOIN episodes e ON p.podcast_id = e.podcast_id AND p.episode_id = e.id " + "WHERE p.embedding IS NOT NULL" + ).fetchall() + db.close() + + meta = [] + vectors = [] + for r in rows: + meta.append({ + "id": r["id"], "podcast_id": r["podcast_id"], + "episode_id": r["episode_id"], "idx": r["idx"], + "text": r["text"][:200], "title": r["title"], "guest": r["guest"] + }) + vectors.append(np.frombuffer(r["embedding"], dtype=np.float32)) + + return np.array(vectors), meta + + +def kmeans_simple(vectors, k, max_iter=50): + """Simple K-Means ohne sklearn-Dependency.""" + n = len(vectors) + # Init: random selection + rng = np.random.default_rng(42) + indices = rng.choice(n, k, replace=False) + centroids = vectors[indices].copy() + + labels = np.zeros(n, dtype=int) + + for _ in range(max_iter): + # Assign + dists = np.linalg.norm(vectors[:, None] - centroids[None, :], axis=2) + new_labels = np.argmin(dists, axis=1) + + if np.all(new_labels == labels): + break + labels = new_labels + + # Update centroids + for j in range(k): + mask = labels == j + if mask.sum() > 0: + centroids[j] = vectors[mask].mean(axis=0) + + return labels, centroids + + +def main(): + print("Lade Embeddings…") + vectors, meta = load_embeddings(DB_PATH) + print(f" {len(vectors)} Absätze geladen.") + + # Normalize + norms = np.linalg.norm(vectors, axis=1, keepdims=True) + norms[norms == 0] = 1 + vectors_norm = vectors / norms + + print(f"Clustere in {N_CLUSTERS} Gruppen…") + labels, centroids = kmeans_simple(vectors_norm, N_CLUSTERS) + + # Analyze clusters + podcasts = sorted(set(m["podcast_id"] for m in meta)) + clusters = [] + + for c in range(N_CLUSTERS): + mask = labels == c + indices = np.where(mask)[0] + members = [meta[i] for i in indices] + + # Count per podcast + per_podcast = {p: 0 for p in podcasts} + for m in members: + per_podcast[m["podcast_id"]] += 1 + + # Representative texts (closest to centroid) + if len(indices) > 0: + dists = np.linalg.norm(vectors_norm[indices] - centroids[c], axis=1) + top_indices = indices[np.argsort(dists)[:5]] + representative = [{"text": meta[i]["text"], "episode": meta[i]["episode_id"], + "podcast": meta[i]["podcast_id"], "guest": meta[i]["guest"]} for i in top_indices] + else: + representative = [] + + # Derive topic label from representative texts + words = " ".join(m["text"][:100] for m in members[:20]).lower().split() + # Simple word frequency (exclude common words) + stop = {"der", "die", "das", "und", "in", "von", "zu", "den", "ist", "ein", "eine", "es", "mit", + "auf", "für", "an", "sich", "nicht", "auch", "dass", "wir", "man", "aber", "des", "dem", + "werden", "oder", "als", "wie", "hat", "ich", "sind", "was", "so", "haben", "dann", + "wenn", "noch", "schon", "kann", "wird", "hier", "über", "nach", "nur", "bei", "da", + "diese", "dieser", "dieses", "einem", "einer", "also", "ja", "mal", "war", "sehr", + "gibt", "aus", "zum", "zur", "mehr", "immer", "weil", "uns", "sie", "er", "vom"} + word_freq = {} + for w in words: + w = w.strip(".,;:!?\"'()[]") + 
if len(w) > 3 and w not in stop: + word_freq[w] = word_freq.get(w, 0) + 1 + top_words = sorted(word_freq.items(), key=lambda x: -x[1])[:5] + label = ", ".join(w for w, _ in top_words) if top_words else f"Cluster {c}" + + clusters.append({ + "id": c, + "label": label, + "size": int(mask.sum()), + "per_podcast": per_podcast, + "representative": representative, + }) + + # Sort by size + clusters.sort(key=lambda x: -x["size"]) + + # Identify gaps + gaps = [] + for cl in clusters: + total = cl["size"] + if total < 3: + continue + + for p in podcasts: + count = cl["per_podcast"].get(p, 0) + other_total = total - count + if other_total > 10 and count <= 2: + gaps.append({ + "cluster_label": cl["label"], + "cluster_size": total, + "missing_in": p, + "present_in_count": other_total, + "representative": cl["representative"][:3], + }) + + # Sort gaps by how asymmetric they are + gaps.sort(key=lambda x: -x["present_in_count"]) + + result = { + "total_paragraphs": len(meta), + "podcasts": podcasts, + "n_clusters": N_CLUSTERS, + "clusters": clusters, + "gaps": gaps[:30], + } + + with open(OUTPUT, "w") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + print(f"\n{len(clusters)} Cluster, {len(gaps)} Leerstellen identifiziert.") + print(f"\nTop-Leerstellen:") + for g in gaps[:10]: + print(f" [{g['missing_in']}] fehlt: \"{g['cluster_label']}\" ({g['present_in_count']} Absätze im anderen Podcast)") + + print(f"\nCluster-Größen:") + for cl in clusters[:15]: + bar = " | ".join(f"{p}:{cl['per_podcast'][p]}" for p in podcasts) + print(f" {cl['label'][:40]:40s} ({cl['size']:4d}) — {bar}") + + print(f"\nErgebnis: {OUTPUT}") + + +if __name__ == "__main__": + main() diff --git a/scripts/detect_narrative_shift.py b/scripts/detect_narrative_shift.py new file mode 100644 index 0000000..064b147 --- /dev/null +++ b/scripts/detect_narrative_shift.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +"""#15 Narrative Shift Detection: Wie verschiebt sich das Framing über die Zeit? + +Berechnet Embedding-Drift pro Themen-Cluster über die Episodenreihenfolge. +Spitzen = Framing-Wechsel. 
+ +Nutzung: + python3 detect_narrative_shift.py [db-pfad] [output-json] +""" + +import json +import sys +import sqlite3 +import numpy as np + +DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite" +OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "data/narrative_shifts.json" + +# Themen-Keywords für Cluster-Zuordnung +THEMES = { + "klimaschutz": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "klimaschutz", "temperatur", "paris"], + "sicherheit": ["sicherheit", "verteidigung", "militär", "nato", "krieg", "frieden", "abschreckung", "bundeswehr"], + "demokratie": ["demokratie", "demokratisch", "wahl", "parlament", "abstimmung", "beteiligung", "grundgesetz"], + "ungleichheit": ["ungleichheit", "armut", "vermögen", "reichtum", "einkommen", "verteilung", "gini"], + "digitalisierung": ["digital", "plattform", "algorithmus", "google", "meta", "tiktok", "internet", "daten"], + "bildung": ["bildung", "schule", "universität", "lernen", "ausbildung", "studier", "lehre"], + "gesundheit": ["gesundheit", "krankheit", "allergie", "medizin", "prävention", "gesundheitssystem"], + "migration": ["migration", "flucht", "integration", "zuwanderung", "fachkräfte", "asyl"], + "wirtschaft": ["wirtschaft", "wachstum", "bip", "konjunktur", "inflation", "arbeitsmarkt", "produktivität"], + "freiheit": ["freiheit", "grundrecht", "diskriminierung", "gleichstellung", "meinungsfreiheit"], +} + + +def load_data(db_path): + db = sqlite3.connect(db_path) + db.row_factory = sqlite3.Row + rows = db.execute( + "SELECT p.podcast_id, p.episode_id, p.idx, p.text, p.embedding, e.staffel " + "FROM paragraphs p JOIN episodes e ON p.podcast_id = e.podcast_id AND p.episode_id = e.id " + "WHERE p.embedding IS NOT NULL " + "ORDER BY p.podcast_id, e.staffel, p.episode_id, p.idx" + ).fetchall() + db.close() + return rows + + +def classify_theme(text): + """Ordne einen Absatz einem Thema zu (Keyword-Match).""" + text_lower = text.lower() + scores = {} + for theme, keywords in THEMES.items(): + score = sum(1 for kw in keywords if kw in text_lower) + if score > 0: + scores[theme] = score + if not scores: + return None + return max(scores, key=scores.get) + + +def cosine_distance(a, b): + na, nb = np.linalg.norm(a), np.linalg.norm(b) + if na == 0 or nb == 0: + return 1.0 + return 1.0 - np.dot(a, b) / (na * nb) + + +def main(): + print("Lade Daten…") + rows = load_data(DB_PATH) + print(f" {len(rows)} Absätze geladen.") + + # Group by podcast → episode → theme + podcasts = {} + for r in rows: + pid = r["podcast_id"] + eid = r["episode_id"] + text = r["text"] + vec = np.frombuffer(r["embedding"], dtype=np.float32) + theme = classify_theme(text) + + if theme is None: + continue + + if pid not in podcasts: + podcasts[pid] = {} + if eid not in podcasts[pid]: + podcasts[pid][eid] = {"staffel": r["staffel"], "themes": {}} + if theme not in podcasts[pid][eid]["themes"]: + podcasts[pid][eid]["themes"][theme] = [] + podcasts[pid][eid]["themes"][theme].append(vec) + + # Compute centroid per (podcast, episode, theme) + shifts = {} + + for pid, episodes in podcasts.items(): + ep_list = sorted(episodes.keys()) # Lexicographic = chronological for SxEy format + + for theme in THEMES: + centroids = [] + ep_labels = [] + + for eid in ep_list: + if theme not in episodes[eid]["themes"]: + continue + vecs = np.array(episodes[eid]["themes"][theme]) + centroid = vecs.mean(axis=0) + centroids.append(centroid) + ep_labels.append(eid) + + if len(centroids) < 3: + continue + + # Compute drift between consecutive episodes + drifts = [] + for i in 
range(1, len(centroids)): + drift = cosine_distance(centroids[i - 1], centroids[i]) + drifts.append({ + "from": ep_labels[i - 1], + "to": ep_labels[i], + "drift": round(float(drift), 4), + }) + + # Find spikes (> 1.5 * median) + drift_vals = [d["drift"] for d in drifts] + median = float(np.median(drift_vals)) + mean = float(np.mean(drift_vals)) + + spikes = [d for d in drifts if d["drift"] > median * 1.5] + + key = f"{pid}/{theme}" + shifts[key] = { + "podcast": pid, + "theme": theme, + "episodes": ep_labels, + "n_episodes": len(ep_labels), + "mean_drift": round(mean, 4), + "median_drift": round(median, 4), + "max_drift": round(max(drift_vals), 4), + "drifts": drifts, + "spikes": spikes, + } + + # Sort by max drift + sorted_shifts = sorted(shifts.values(), key=lambda x: -x["max_drift"]) + + result = { + "total_themes_tracked": len(sorted_shifts), + "themes": list(THEMES.keys()), + "shifts": sorted_shifts, + } + + with open(OUTPUT, "w") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + print(f"\n{len(sorted_shifts)} Themen-Verläufe berechnet.") + print(f"\nGrößte Framing-Shifts:") + for s in sorted_shifts[:10]: + spike_info = "" + if s["spikes"]: + spike_info = " | Spikes: " + ", ".join(f"{sp['from']}→{sp['to']}({sp['drift']:.3f})" for sp in s["spikes"][:3]) + print(f" {s['podcast']}/{s['theme']:15s} — max_drift={s['max_drift']:.4f}, mean={s['mean_drift']:.4f}{spike_info}") + + print(f"\nErgebnis: {OUTPUT}") + + +if __name__ == "__main__": + main() diff --git a/scripts/extract_claims.py b/scripts/extract_claims.py new file mode 100644 index 0000000..2f6269a --- /dev/null +++ b/scripts/extract_claims.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +"""#16 Claim-Verification-Layer: Extrahiere prüfbare Behauptungen aus Transkripten. + +Nutzung: + DASHSCOPE_API_KEY=... python3 extract_claims.py [db-pfad] +""" + +import json +import os +import sys +import time +import sqlite3 + +from openai import OpenAI + +DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite" +API_KEY = os.environ.get("DASHSCOPE_API_KEY", "") +BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" +MODEL = "qwen-turbo" # Günstiger für Massenverarbeitung +BATCH_SIZE = 3 # Absätze pro API-Call + +SYSTEM_PROMPT = """Du bist ein Faktenprüfer. Du erhältst Podcast-Transkript-Absätze. +Extrahiere ALLE prüfbaren faktischen Behauptungen (Zahlen, Statistiken, kausale Aussagen, Verweise auf Studien/Gesetze). +KEINE Meinungen, Bewertungen oder rhetorische Fragen. + +Antworte NUR mit einem JSON-Array. 
Für jeden Absatz ein Objekt: +[{"paragraph_idx": 0, "claims": [{"text": "Die Behauptung", "type": "statistic|causal|reference|number", "verifiable": true}]}] + +Wenn ein Absatz keine prüfbaren Claims enthält: {"paragraph_idx": 0, "claims": []}""" + + +def extract_batch(client, paragraphs): + """Extrahiere Claims aus einem Batch von Absätzen.""" + user_msg = "" + for i, p in enumerate(paragraphs): + user_msg += f"\n--- Absatz {i} ({p['episode_id']}, {p['start_time']:.0f}s) ---\n{p['text'][:600]}\n" + + try: + resp = client.chat.completions.create( + model=MODEL, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_msg}, + ], + temperature=0.1, + max_tokens=1000, + ) + content = resp.choices[0].message.content.strip() + if content.startswith("```"): + content = content.split("```")[1].strip() + if content.startswith("json"): + content = content[4:].strip() + return json.loads(content) + except Exception as e: + return [{"paragraph_idx": i, "claims": [], "error": str(e)} for i in range(len(paragraphs))] + + +def main(): + if not API_KEY: + print("DASHSCOPE_API_KEY nicht gesetzt.") + sys.exit(1) + + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + db = sqlite3.connect(DB_PATH) + db.row_factory = sqlite3.Row + + # Create output table + db.executescript(""" + CREATE TABLE IF NOT EXISTS claims ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + podcast_id TEXT, episode_id TEXT, paragraph_idx INTEGER, + claim_text TEXT, claim_type TEXT, verifiable BOOLEAN, + start_time REAL + ); + CREATE INDEX IF NOT EXISTS idx_claims_episode ON claims(podcast_id, episode_id); + CREATE INDEX IF NOT EXISTS idx_claims_type ON claims(claim_type); + """) + + # Check what's already processed + processed_keys = set() + try: + for r in db.execute("SELECT DISTINCT podcast_id||'/'||episode_id||'/'||paragraph_idx as k FROM claims").fetchall(): + processed_keys.add(r["k"]) + except Exception: + pass + + # Get all paragraphs + rows = db.execute( + "SELECT id, podcast_id, episode_id, idx, start_time, text FROM paragraphs ORDER BY podcast_id, episode_id, idx" + ).fetchall() + + # Filter already processed + todo = [r for r in rows if f"{r['podcast_id']}/{r['episode_id']}/{r['idx']}" not in processed_keys] + print(f"Extrahiere Claims: {len(todo)} Absätze zu verarbeiten ({len(rows) - len(todo)} bereits fertig)") + + total_claims = 0 + for i in range(0, len(todo), BATCH_SIZE): + batch = todo[i:i + BATCH_SIZE] + paras = [{"episode_id": r["episode_id"], "start_time": r["start_time"] or 0, "text": r["text"]} for r in batch] + + results = extract_batch(client, paras) + + for j, result in enumerate(results): + if j >= len(batch): + break + row = batch[j] + for claim in result.get("claims", []): + db.execute( + "INSERT INTO claims (podcast_id, episode_id, paragraph_idx, claim_text, claim_type, verifiable, start_time) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + (row["podcast_id"], row["episode_id"], row["idx"], + claim.get("text", ""), claim.get("type", "unknown"), + claim.get("verifiable", True), row["start_time"]) + ) + total_claims += 1 + + if (i // BATCH_SIZE) % 20 == 0: + db.commit() + print(f" {min(i + BATCH_SIZE, len(todo))}/{len(todo)} Absätze, {total_claims} Claims bisher") + + time.sleep(0.2) + + db.commit() + + # Stats + stats = db.execute("SELECT claim_type, COUNT(*) as c FROM claims GROUP BY claim_type ORDER BY c DESC").fetchall() + podcast_stats = db.execute("SELECT podcast_id, COUNT(*) as c FROM claims GROUP BY podcast_id").fetchall() + print(f"\nFertig: {total_claims} Claims extrahiert.") 
+ print("Nach Typ:") + for s in stats: + print(f" {s['claim_type']}: {s['c']}") + print("Nach Podcast:") + for s in podcast_stats: + print(f" {s['podcast_id']}: {s['c']}") + + db.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/extract_questions.py b/scripts/extract_questions.py new file mode 100644 index 0000000..0868418 --- /dev/null +++ b/scripts/extract_questions.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +"""#17 Frage-Antwort-Asymmetrie: Extrahiere Fragen aus Transkripten. + +Nutzung: + DASHSCOPE_API_KEY=... python3 extract_questions.py [db-pfad] +""" + +import json +import os +import sys +import time +import sqlite3 + +from openai import OpenAI + +DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite" +API_KEY = os.environ.get("DASHSCOPE_API_KEY", "") +BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" +MODEL = "qwen-turbo" +BATCH_SIZE = 3 + +SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst Podcast-Transkript-Absätze. +Extrahiere ALLE Fragen (explizite und implizite). Klassifiziere jede Frage. + +Antworte NUR mit einem JSON-Array: +[{"paragraph_idx": 0, "questions": [{"text": "Die Frage", "type": "genuine|rhetorical|follow_up|implicit", "answered": "yes|partial|no|self_answered"}]}] + +- genuine: Echte Frage, die eine Antwort erwartet +- rhetorical: Rhetorische Frage zur Betonung +- follow_up: Rückfrage/Nachfrage +- implicit: Implizite Frage (z.B. "Da fragt man sich natürlich…") + +- answered: Wird die Frage im selben Absatz beantwortet? + +Wenn keine Fragen: {"paragraph_idx": 0, "questions": []}""" + + +def extract_batch(client, paragraphs): + user_msg = "" + for i, p in enumerate(paragraphs): + user_msg += f"\n--- Absatz {i} ({p['episode_id']}, {p['start_time']:.0f}s) ---\n{p['text'][:600]}\n" + + try: + resp = client.chat.completions.create( + model=MODEL, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_msg}, + ], + temperature=0.1, + max_tokens=1000, + ) + content = resp.choices[0].message.content.strip() + if content.startswith("```"): + content = content.split("```")[1].strip() + if content.startswith("json"): + content = content[4:].strip() + return json.loads(content) + except Exception as e: + return [{"paragraph_idx": i, "questions": []} for i in range(len(paragraphs))] + + +def main(): + if not API_KEY: + print("DASHSCOPE_API_KEY nicht gesetzt.") + sys.exit(1) + + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + db = sqlite3.connect(DB_PATH) + db.row_factory = sqlite3.Row + + db.executescript(""" + CREATE TABLE IF NOT EXISTS questions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + podcast_id TEXT, episode_id TEXT, paragraph_idx INTEGER, + question_text TEXT, question_type TEXT, + answered TEXT DEFAULT 'unknown', + answered_by_podcast TEXT, answered_by_episode TEXT, answered_by_idx INTEGER, + start_time REAL + ); + CREATE INDEX IF NOT EXISTS idx_questions_episode ON questions(podcast_id, episode_id); + CREATE INDEX IF NOT EXISTS idx_questions_type ON questions(question_type); + CREATE INDEX IF NOT EXISTS idx_questions_answered ON questions(answered); + """) + + processed_keys = set() + try: + for r in db.execute("SELECT DISTINCT podcast_id||'/'||episode_id||'/'||paragraph_idx as k FROM questions").fetchall(): + processed_keys.add(r["k"]) + except Exception: + pass + + rows = db.execute( + "SELECT id, podcast_id, episode_id, idx, start_time, text FROM paragraphs ORDER BY podcast_id, episode_id, idx" + ).fetchall() + + todo = [r for r in rows if 
f"{r['podcast_id']}/{r['episode_id']}/{r['idx']}" not in processed_keys] + print(f"Extrahiere Fragen: {len(todo)} Absätze ({len(rows) - len(todo)} bereits fertig)") + + total_questions = 0 + for i in range(0, len(todo), BATCH_SIZE): + batch = todo[i:i + BATCH_SIZE] + paras = [{"episode_id": r["episode_id"], "start_time": r["start_time"] or 0, "text": r["text"]} for r in batch] + + results = extract_batch(client, paras) + + for j, result in enumerate(results): + if j >= len(batch): + break + row = batch[j] + for q in result.get("questions", []): + db.execute( + "INSERT INTO questions (podcast_id, episode_id, paragraph_idx, question_text, question_type, answered, start_time) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + (row["podcast_id"], row["episode_id"], row["idx"], + q.get("text", ""), q.get("type", "unknown"), + q.get("answered", "unknown"), row["start_time"]) + ) + total_questions += 1 + + if (i // BATCH_SIZE) % 20 == 0: + db.commit() + print(f" {min(i + BATCH_SIZE, len(todo))}/{len(todo)} Absätze, {total_questions} Fragen bisher") + + time.sleep(0.2) + + db.commit() + + stats = db.execute("SELECT question_type, COUNT(*) as c FROM questions GROUP BY question_type ORDER BY c DESC").fetchall() + answered_stats = db.execute("SELECT answered, COUNT(*) as c FROM questions GROUP BY answered ORDER BY c DESC").fetchall() + print(f"\nFertig: {total_questions} Fragen extrahiert.") + print("Nach Typ:") + for s in stats: + print(f" {s['question_type']}: {s['c']}") + print("Beantwortet:") + for s in answered_stats: + print(f" {s['answered']}: {s['c']}") + + db.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/run_all_qwen.sh b/scripts/run_all_qwen.sh new file mode 100755 index 0000000..964bd42 --- /dev/null +++ b/scripts/run_all_qwen.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Sequentielle Ausführung aller Qwen-Analyse-Scripts +# Vermeidet SQLite-Locks durch seriellen Zugriff + +set -e +cd "$(dirname "$0")/.." +DB="data/db.sqlite" +export DASHSCOPE_API_KEY=$(security find-generic-password -s qwen-api -w) + +echo "$(date): Starte Qwen-Analyse-Pipeline" +echo "================================================" + +echo "" +echo "$(date): #18 Debatten kuratieren (100 Paare, schnellster Task)…" +python3 scripts/curate_debates.py "$DB" 100 +echo "$(date): #18 fertig." + +echo "" +echo "$(date): #13 Argumentketten klassifizieren (500 Paare)…" +python3 scripts/analyse_arguments.py "$DB" 500 +echo "$(date): #13 fertig." + +echo "" +echo "$(date): #17 Fragen extrahieren (3727 Absätze)…" +python3 scripts/extract_questions.py "$DB" +echo "$(date): #17 fertig." + +echo "" +echo "$(date): #16 Claims extrahieren (3727 Absätze)…" +python3 scripts/extract_claims.py "$DB" +echo "$(date): #16 fertig." + +echo "" +echo "$(date): Alle Qwen-Tasks abgeschlossen." 
+echo "DONE" > /tmp/qwen_pipeline_done diff --git a/webapp/index.html b/webapp/index.html index d60c562..216b98d 100644 --- a/webapp/index.html +++ b/webapp/index.html @@ -182,6 +182,21 @@ margin-right: 6px; } + /* ── Word-level highlighting (#12) ── */ + .word { + transition: background 0.1s, color 0.1s; + border-radius: 2px; padding: 0 1px; + cursor: pointer; + } + .word:hover { background: var(--surface2); } + .word.word-active { + background: var(--accent-green)44; + color: var(--text); + } + .word.word-spoken { + color: var(--text); + } + /* ── Search Results ── */ .search-result { background: var(--surface2); border-radius: 8px; padding: 12px; @@ -516,6 +531,8 @@ const TranscriptView = { paragraphs: null, userScrolled: false, activeIdx: -1, + words: null, // Word-level timestamps for current episode + activeWordIdx: -1, // Currently highlighted word async show(episodeId, seekTime) { const epData = await this.loadEpisodeTranscript(episodeId); @@ -540,10 +557,35 @@ const TranscriptView = { let html = `

${ep.id}: ${ep.title} — Transkript

`; html += `

${ep.guest}

`; + // Try to load word-level timestamps + this.words = null; + this.activeWordIdx = -1; + if (CURRENT_PODCAST) { + try { + const wr = await fetch(`${API_BASE}/api/podcasts/${CURRENT_PODCAST}/transcript/${episodeId}/words`); + if (wr.ok) { + const wd = await wr.json(); + if (wd.available && wd.words.length > 0) this.words = wd.words; + } + } catch (e) {} + } + this.paragraphs.forEach((p, i) => { html += `
<div class="transcript-para" data-idx="${i}">`;
       html += `${fmtTime(p.start)}`;
-      html += escHtml(p.text);
+      if (this.words) {
+        // Render words as clickable spans with timestamps
+        const paraWords = this.words.filter(w => w.start >= p.start - 0.5 && w.start < (p.end || p.start + 999));
+        if (paraWords.length > 0) {
+          paraWords.forEach(w => {
+            html += `<span class="word" data-ws="${w.start}" data-we="${w.end}">${escHtml(w.word)}</span> `;
+          });
+        } else {
+          html += escHtml(p.text);
+        }
+      } else {
+        html += escHtml(p.text);
+      }
       html += `</div>
`;
     });
 
@@ -557,21 +599,45 @@
   syncToTime(time) {
     if (!this.visible || !this.paragraphs) return;
+
+    // Paragraph-level sync
     let idx = -1;
     for (let i = 0; i < this.paragraphs.length; i++) {
       if (time >= this.paragraphs[i].start) idx = i;
       else break;
     }
-    if (idx === this.activeIdx) return;
-    this.activeIdx = idx;
+    if (idx !== this.activeIdx) {
+      this.activeIdx = idx;
+      document.querySelectorAll('.transcript-para.active').forEach(el => el.classList.remove('active'));
+      if (idx >= 0) {
+        const el = document.querySelector(`.transcript-para[data-idx="${idx}"]`);
+        if (el) {
+          el.classList.add('active');
+          if (!this.userScrolled) {
+            el.scrollIntoView({ behavior: 'smooth', block: 'center' });
+          }
+        }
+      }
+    }
 
-    document.querySelectorAll('.transcript-para.active').forEach(el => el.classList.remove('active'));
-    if (idx >= 0) {
-      const el = document.querySelector(`.transcript-para[data-idx="${idx}"]`);
-      if (el) {
-        el.classList.add('active');
-        if (!this.userScrolled) {
-          el.scrollIntoView({ behavior: 'smooth', block: 'center' });
-        }
-      }
-    }
+    // Word-level sync (#12)
+    if (this.words) {
+      const prev = document.querySelector('.word.word-active');
+      if (prev) prev.classList.replace('word-active', 'word-spoken');
+
+      // Aktuelles Wort über data-ws/data-we finden; bereits vergangene Wörter als gesprochen markieren
+      const allWords = document.querySelectorAll('.word[data-ws]');
+      for (const w of allWords) {
+        const ws = parseFloat(w.dataset.ws);
+        const we = parseFloat(w.dataset.we);
+        if (time >= ws && time < we) {
+          w.classList.add('word-active');
+          break;
+        } else if (time >= we) {
+          w.classList.add('word-spoken');
+        }
+      }
+    }
   }
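
Nachtrag: Zur schnellen Kontrolle nach einem Pipeline-Lauf bietet sich ein Mini-Report
über die in diesem Patch angelegten Tabellen an. Die folgende Skizze ist KEIN Bestandteil
des Patches; sie nimmt lediglich das oben definierte Schema an (argument_links, claims,
questions, debates in data/db.sqlite) und setzt voraus, dass run_all_qwen.sh bereits
gelaufen ist:

    #!/usr/bin/env python3
    """Mini-Report über die Qwen-Ergebnistabellen (Skizze, Schema aus diesem Patch)."""

    import sqlite3
    import sys

    DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"

    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row

    # Füllstände der vier in diesem Patch angelegten Ergebnis-Tabellen
    for table in ("argument_links", "claims", "questions", "debates"):
        try:
            n = db.execute(f"SELECT COUNT(*) AS c FROM {table}").fetchone()["c"]
            print(f"{table:15s}: {n} Zeilen")
        except sqlite3.OperationalError:
            print(f"{table:15s}: Tabelle fehlt (Script noch nicht gelaufen?)")

    # Beispielabfrage: Widersprüche mit hoher Konfidenz aus #13
    try:
        rows = db.execute(
            "SELECT source_episode, target_episode, confidence, explanation "
            "FROM argument_links "
            "WHERE relation = 'widerspricht' AND confidence >= 0.8 "
            "ORDER BY confidence DESC LIMIT 5"
        ).fetchall()
        for r in rows:
            print(f"  {r['source_episode']} ↔ {r['target_episode']} "
                  f"({r['confidence']:.2f}): {r['explanation']}")
    except sqlite3.OperationalError:
        pass
    db.close()

Der Dateiname (z.B. scripts/report_qwen.py) ist frei gewählt; die Spaltennamen stammen
aus den CREATE-TABLE-Anweisungen der Scripts oben.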