#!/usr/bin/env python3
"""Index transcript paragraphs with topic tags for cross-referencing.

Reads srt_index.json, assigns topic tags to each paragraph, and finds
cross-references between paragraphs across episodes. Outputs
topics_index.json for the webapp.

Tagging uses simple keyword/phrase matching against a fixed taxonomy;
no external LLM API is needed.
"""

import json
import os
import re
import sys
from collections import defaultdict
from difflib import SequenceMatcher

# Topic taxonomy: shared vocabulary for cross-podcast matching
TOPIC_TAXONOMY = {
    # Economy
    "wachstum": ["wachstum", "bip", "bruttoinlandsprodukt", "wirtschaftswachstum",
                 "konjunktur", "rezession", "wohlstand"],
    "schulden": ["schulden", "staatsverschuldung", "schuldenbremse", "kredit",
                 "anleihen", "haushalt", "investition"],
    "vermoegen": ["vermögen", "reichtum", "eigentum", "besitz", "erbschaft",
                  "ungleichheit", "verteilung", "kodierung"],
    "steuern": ["steuer", "erbschaftssteuer", "vermögensteuer", "kapitalertrag",
                "steuergerechtigkeit", "umverteilung"],
    "markt": ["markt", "marktwirtschaft", "kapitalismus", "neoliberal",
              "deregulierung", "privatisierung", "wettbewerb"],
    "innovation": ["innovation", "forschung", "technologie", "digitalisierung",
                   "transformation", "start-up", "patent"],
    "arbeit": ["arbeit", "beschäftigung", "arbeitsmarkt", "fachkräfte",
               "mindestlohn", "gewerkschaft", "care-arbeit"],
    # Security
    "klimakrise": ["klima", "klimawandel", "co2", "emission", "erderwärmung",
                   "paris", "erneuerbar", "fossil", "ökologisch"],
    "geopolitik": ["geopolitik", "nato", "russland", "china", "europa",
                   "sicherheitspolitik", "verteidigung", "aufrüstung", "krieg"],
    "soziale_sicherheit": ["sozial", "sicherheit", "rente", "pflege", "gesundheit",
                           "daseinsvorsorge", "sozialstaat"],
    "digitale_sicherheit": ["plattform", "monopol", "google", "meta", "tiktok",
                            "algorithmus", "datenschutz", "überwachung"],
    # Democracy
    "demokratie": ["demokratie", "wahl", "parlament", "regierung", "rechtsstaat",
                   "gewaltenteilung", "verfassung", "grundgesetz"],
    "macht": ["macht", "herrschaft", "elite", "oligarchie", "lobbyismus",
              "einfluss", "gestaltung"],
    "polarisierung": ["polarisierung", "spaltung", "populismus", "radikalisierung",
                      "filterblas", "desinformation", "fake"],
    "buerokratie": ["bürokratie", "verwaltung", "regulierung", "föderalismus",
                    "kommune", "behörde"],
    "kompromiss": ["kompromiss", "verhandlung", "koalition", "konsens", "diskurs",
                   "debatte", "streit"],
    "extremismus": ["extremismus", "faschismus", "rechtsextrem", "autoritär",
                    "rassismus", "antisemitismus", "gewalt"],
    # Freedom
    "freiheit": ["freiheit", "grundrecht", "menschenrecht", "selbstbestimmung",
                 "autonomie", "emanzipation"],
    "diskriminierung": ["diskriminierung", "rassismus", "gleichstellung", "gender",
                        "minderheit", "inklusion", "vielfalt"],
    "bildung": ["bildung", "schule", "universität", "lernen", "kompetenz", "pisa",
                "chancengleichheit"],
    "gesundheit": ["gesundheit", "krankheit", "prävention", "medizin",
                   "mental health", "ernährung", "allergie"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "asyl",
                  "fachkräfteeinwanderung"],
    # Cross-cutting topics
    "ohnmacht": ["ohnmacht", "hilflosigkeit", "resignation", "vertrauen",
                 "selbstwirksamkeit", "beteiligung", "engagement"],
    "narrative": ["narrativ", "erzählung", "framing", "kommunikation", "medien",
                  "öffentlichkeit", "diskurs"],
    "generationen": ["generation", "jugend", "alter", "demografie", "zukunft",
                     "intergenerationell", "boomer"],
    "infrastruktur": ["infrastruktur", "bahn", "straße", "brücke", "netz",
                      "glasfaser", "öpnv", "mobilität"],
}


def tag_paragraph(text, taxonomy=TOPIC_TAXONOMY):
    """Assign topic tags to a paragraph based on keyword matching."""
    text_lower = text.lower()
    scores = {}
    for topic, keywords in taxonomy.items():
        score = 0
        for kw in keywords:
            # Count occurrences, weight longer keywords higher
            count = text_lower.count(kw)
            if count > 0:
                score += count * (1 + len(kw) / 10)
        if score > 0:
            scores[topic] = score

    # Return top tags (normalized score > threshold)
    if not scores:
        return []
    max_score = max(scores.values())
    threshold = max_score * 0.3
    tags = sorted(
        [(t, s) for t, s in scores.items() if s >= threshold],
        key=lambda x: -x[1]
    )
    return [t for t, s in tags[:5]]  # max 5 tags


def find_crossrefs(tagged_paragraphs, min_shared_tags=2):
    """Find cross-references between paragraphs in different episodes."""
    crossrefs = []

    # Build index: topic -> list of (episode_key, para_idx)
    topic_index = defaultdict(list)
    for ep_key, paras in tagged_paragraphs.items():
        for i, p in enumerate(paras):
            for tag in p["tags"]:
                topic_index[tag].append((ep_key, i))

    # Find pairs with shared tags across episodes
    seen = set()
    for topic, locations in topic_index.items():
        for j in range(len(locations)):
            for k in range(j + 1, len(locations)):
                ep1, idx1 = locations[j]
                ep2, idx2 = locations[k]
                if ep1 == ep2:
                    continue  # Skip same-episode refs
                pair = tuple(sorted([(ep1, idx1), (ep2, idx2)]))
                if pair in seen:
                    continue
                seen.add(pair)
                # Count shared tags
                tags1 = set(tagged_paragraphs[ep1][idx1]["tags"])
                tags2 = set(tagged_paragraphs[ep2][idx2]["tags"])
                shared = tags1 & tags2
                if len(shared) >= min_shared_tags:
                    crossrefs.append({
                        "source": {"episode": ep1, "paragraph": idx1},
                        "target": {"episode": ep2, "paragraph": idx2},
                        "shared_topics": sorted(shared),
                        "score": len(shared) / max(len(tags1), len(tags2), 1)
                    })

    # Sort by score
    crossrefs.sort(key=lambda x: -x["score"])
    return crossrefs


def main():
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    data_dir = os.path.join(project_dir, "data")

    # Load transcript index
    srt_path = os.path.join(data_dir, "srt_index.json")
    if not os.path.exists(srt_path):
        print(f"ERROR: {srt_path} not found")
        sys.exit(1)
    with open(srt_path, "r", encoding="utf-8") as f:
        srt_data = json.load(f)

    # Tag all paragraphs
    tagged = {}
    total_tags = 0
    for ep_key, ep_data in srt_data.items():
        tagged[ep_key] = []
        for p in ep_data["paragraphs"]:
            tags = tag_paragraph(p["text"])
            tagged[ep_key].append({
                "start": p["start"],
                "end": p["end"],
                "tags": tags,
                "text_preview": p["text"][:100]
            })
            total_tags += len(tags)
    print(f"Tagged {sum(len(v) for v in tagged.values())} paragraphs")
    print(f"Total tags assigned: {total_tags}")

    # Show tag distribution
    tag_counts = defaultdict(int)
    for ep_paras in tagged.values():
        for p in ep_paras:
            for t in p["tags"]:
                tag_counts[t] += 1
    print("\nTop topics:")
    for tag, count in sorted(tag_counts.items(), key=lambda x: -x[1])[:15]:
        print(f"  {tag}: {count} paragraphs")

    # Find cross-references
    crossrefs = find_crossrefs(tagged)
    print(f"\nCross-references: {len(crossrefs)} (min 2 shared tags)")

    # Output
    output = {
        "taxonomy": {k: {"keywords": v} for k, v in TOPIC_TAXONOMY.items()},
        "tagged_paragraphs": tagged,
        "crossrefs": crossrefs[:500],  # Top 500
    }
    output_path = os.path.join(data_dir, "topics_index.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    # Copy to webapp
    webapp_dir = os.path.join(project_dir, "webapp")
    if os.path.isdir(webapp_dir):
        import shutil
        shutil.copy2(output_path, os.path.join(webapp_dir, "topics_index.json"))

    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()
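
# ---------------------------------------------------------------------------
# Usage sketch. The srt_index.json shape below is inferred from the fields
# read in main() ("paragraphs", "start", "end", "text"); the full schema and
# the script filename used in the example are assumptions, not confirmed by
# the source.
#
#   srt_index.json:
#     {
#       "episode_01": {
#         "paragraphs": [
#           {"start": 12.4, "end": 31.9, "text": "..."},
#           ...
#         ]
#       },
#       ...
#     }
#
#   python3 index_topics.py /path/to/project
#
# reads <project>/data/srt_index.json, writes <project>/data/topics_index.json,
# and copies the result into <project>/webapp/ if that directory exists.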