#!/usr/bin/env python3
"""#15 Narrative shift detection: how does framing shift over time?

Computes embedding drift per topic cluster across episode order.
Spikes = framing changes.

Usage: python3 detect_narrative_shift.py [db-path] [output-json]
"""
import json
import re
import sys
import sqlite3

import numpy as np

# CLI arguments with script-style defaults.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "data/narrative_shifts.json"

# Topic keywords for cluster assignment (substring match against German text).
THEMES = {
    "klimaschutz": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "klimaschutz", "temperatur", "paris"],
    "sicherheit": ["sicherheit", "verteidigung", "militär", "nato", "krieg", "frieden", "abschreckung", "bundeswehr"],
    "demokratie": ["demokratie", "demokratisch", "wahl", "parlament", "abstimmung", "beteiligung", "grundgesetz"],
    "ungleichheit": ["ungleichheit", "armut", "vermögen", "reichtum", "einkommen", "verteilung", "gini"],
    "digitalisierung": ["digital", "plattform", "algorithmus", "google", "meta", "tiktok", "internet", "daten"],
    "bildung": ["bildung", "schule", "universität", "lernen", "ausbildung", "studier", "lehre"],
    "gesundheit": ["gesundheit", "krankheit", "allergie", "medizin", "prävention", "gesundheitssystem"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "fachkräfte", "asyl"],
    "wirtschaft": ["wirtschaft", "wachstum", "bip", "konjunktur", "inflation", "arbeitsmarkt", "produktivität"],
    "freiheit": ["freiheit", "grundrecht", "diskriminierung", "gleichstellung", "meinungsfreiheit"],
}

# Splits digit runs out of an episode id so they can be compared numerically.
_NUM_RE = re.compile(r"(\d+)")


def _episode_sort_key(episode_id):
    """Natural sort key for episode ids: 'S1E10' sorts after 'S1E2'.

    Plain lexicographic order puts 'S1E10' before 'S1E2', which would pair
    the wrong consecutive episodes when computing drift.
    """
    return [int(part) if part.isdigit() else part
            for part in _NUM_RE.split(str(episode_id))]


def load_data(db_path):
    """Load all embedded paragraphs joined with their episode's season.

    Returns a list of sqlite3.Row with columns:
    podcast_id, episode_id, idx, text, embedding (raw float32 bytes), staffel.
    """
    db = sqlite3.connect(db_path)
    db.row_factory = sqlite3.Row
    try:
        rows = db.execute(
            "SELECT p.podcast_id, p.episode_id, p.idx, p.text, p.embedding, e.staffel "
            "FROM paragraphs p JOIN episodes e ON p.podcast_id = e.podcast_id AND p.episode_id = e.id "
            "WHERE p.embedding IS NOT NULL "
            "ORDER BY p.podcast_id, e.staffel, p.episode_id, p.idx"
        ).fetchall()
    finally:
        db.close()
    return rows


def classify_theme(text):
    """Assign a paragraph to a theme via keyword substring matching.

    Returns the theme with the most keyword hits, or None if no keyword
    matches. Ties resolve to the first-scoring theme in THEMES order.
    """
    text_lower = text.lower()
    scores = {}
    for theme, keywords in THEMES.items():
        score = sum(1 for kw in keywords if kw in text_lower)
        if score > 0:
            scores[theme] = score
    if not scores:
        return None
    return max(scores, key=scores.get)


def cosine_distance(a, b):
    """Return 1 - cosine similarity; 1.0 if either vector is all-zero."""
    na, nb = np.linalg.norm(a), np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 1.0
    return 1.0 - np.dot(a, b) / (na * nb)


def main():
    print("Lade Daten…")
    rows = load_data(DB_PATH)
    print(f"  {len(rows)} Absätze geladen.")

    # Group paragraph vectors by podcast → episode → theme.
    podcasts = {}
    for r in rows:
        pid = r["podcast_id"]
        eid = r["episode_id"]
        text = r["text"]
        vec = np.frombuffer(r["embedding"], dtype=np.float32)
        theme = classify_theme(text)
        if theme is None:
            continue
        if pid not in podcasts:
            podcasts[pid] = {}
        if eid not in podcasts[pid]:
            podcasts[pid][eid] = {"staffel": r["staffel"], "themes": {}}
        if theme not in podcasts[pid][eid]["themes"]:
            podcasts[pid][eid]["themes"][theme] = []
        podcasts[pid][eid]["themes"][theme].append(vec)

    # Compute a centroid per (podcast, episode, theme) and the drift between
    # consecutive episodes of the same theme.
    shifts = {}
    for pid, episodes in podcasts.items():
        # Natural sort: 'S1E10' must come after 'S1E2' (lexicographic order
        # would pair the wrong consecutive episodes).
        ep_list = sorted(episodes.keys(), key=_episode_sort_key)
        for theme in THEMES:
            centroids = []
            ep_labels = []
            for eid in ep_list:
                if theme not in episodes[eid]["themes"]:
                    continue
                vecs = np.array(episodes[eid]["themes"][theme])
                centroid = vecs.mean(axis=0)
                centroids.append(centroid)
                ep_labels.append(eid)
            if len(centroids) < 3:
                # Too few data points to call anything a "shift".
                continue

            # Drift between consecutive episodes that mention this theme.
            drifts = []
            for i in range(1, len(centroids)):
                drift = cosine_distance(centroids[i - 1], centroids[i])
                drifts.append({
                    "from": ep_labels[i - 1],
                    "to": ep_labels[i],
                    "drift": round(float(drift), 4),
                })

            # Spikes = drifts clearly above the typical (median) drift.
            drift_vals = [d["drift"] for d in drifts]
            median = float(np.median(drift_vals))
            mean = float(np.mean(drift_vals))
            spikes = [d for d in drifts if d["drift"] > median * 1.5]

            key = f"{pid}/{theme}"
            shifts[key] = {
                "podcast": pid,
                "theme": theme,
                "episodes": ep_labels,
                "n_episodes": len(ep_labels),
                "mean_drift": round(mean, 4),
                "median_drift": round(median, 4),
                "max_drift": round(max(drift_vals), 4),
                "drifts": drifts,
                "spikes": spikes,
            }

    # Largest framing shifts first.
    sorted_shifts = sorted(shifts.values(), key=lambda x: -x["max_drift"])

    result = {
        "total_themes_tracked": len(sorted_shifts),
        "themes": list(THEMES.keys()),
        "shifts": sorted_shifts,
    }
    # Explicit UTF-8: the JSON is written with ensure_ascii=False and contains
    # non-ASCII theme keywords, so the platform default encoding must not be used.
    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{len(sorted_shifts)} Themen-Verläufe berechnet.")
    print("\nGrößte Framing-Shifts:")
    for s in sorted_shifts[:10]:
        spike_info = ""
        if s["spikes"]:
            spike_info = " | Spikes: " + ", ".join(
                f"{sp['from']}→{sp['to']}({sp['drift']:.3f})" for sp in s["spikes"][:3]
            )
        print(f"  {s['podcast']}/{s['theme']:15s} — max_drift={s['max_drift']:.4f}, mean={s['mean_drift']:.4f}{spike_info}")
    print(f"\nErgebnis: {OUTPUT}")


if __name__ == "__main__":
    main()