#!/usr/bin/env python3
"""Index transcript paragraphs with topic tags for cross-referencing.
Reads srt_index.json, assigns topic tags to each paragraph,
finds cross-references between paragraphs across episodes.
Outputs topics_index.json for the webapp.
This script processes paragraphs in batches and uses simple
keyword/phrase extraction — no external LLM API needed.
"""
import json
import os
import re
import sys
from collections import defaultdict
from difflib import SequenceMatcher
# Topic taxonomy — shared vocabulary for cross-podcast matching.
# Maps a topic slug to the lowercase German keywords/stems that trigger it.
# tag_paragraph() matches these as plain substrings of the lowercased text,
# so stems like "filterblas" also hit "filterblase"/"filterblasen".
TOPIC_TAXONOMY = {
    # Economy ("Wirtschaft")
    "wachstum": ["wachstum", "bip", "bruttoinlandsprodukt", "wirtschaftswachstum", "konjunktur", "rezession", "wohlstand"],
    "schulden": ["schulden", "staatsverschuldung", "schuldenbremse", "kredit", "anleihen", "haushalt", "investition"],
    "vermoegen": ["vermögen", "reichtum", "eigentum", "besitz", "erbschaft", "ungleichheit", "verteilung", "kodierung"],
    "steuern": ["steuer", "erbschaftssteuer", "vermögensteuer", "kapitalertrag", "steuergerechtigkeit", "umverteilung"],
    "markt": ["markt", "marktwirtschaft", "kapitalismus", "neoliberal", "deregulierung", "privatisierung", "wettbewerb"],
    "innovation": ["innovation", "forschung", "technologie", "digitalisierung", "transformation", "start-up", "patent"],
    "arbeit": ["arbeit", "beschäftigung", "arbeitsmarkt", "fachkräfte", "mindestlohn", "gewerkschaft", "care-arbeit"],
    # Security ("Sicherheit")
    "klimakrise": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "paris", "erneuerbar", "fossil", "ökologisch"],
    "geopolitik": ["geopolitik", "nato", "russland", "china", "europa", "sicherheitspolitik", "verteidigung", "aufrüstung", "krieg"],
    "soziale_sicherheit": ["sozial", "sicherheit", "rente", "pflege", "gesundheit", "daseinsvorsorge", "sozialstaat"],
    "digitale_sicherheit": ["plattform", "monopol", "google", "meta", "tiktok", "algorithmus", "datenschutz", "überwachung"],
    # Democracy ("Demokratie")
    "demokratie": ["demokratie", "wahl", "parlament", "regierung", "rechtsstaat", "gewaltenteilung", "verfassung", "grundgesetz"],
    "macht": ["macht", "herrschaft", "elite", "oligarchie", "lobbyismus", "einfluss", "gestaltung"],
    "polarisierung": ["polarisierung", "spaltung", "populismus", "radikalisierung", "filterblas", "desinformation", "fake"],
    "buerokratie": ["bürokratie", "verwaltung", "regulierung", "föderalismus", "kommune", "behörde"],
    "kompromiss": ["kompromiss", "verhandlung", "koalition", "konsens", "diskurs", "debatte", "streit"],
    "extremismus": ["extremismus", "faschismus", "rechtsextrem", "autoritär", "rassismus", "antisemitismus", "gewalt"],
    # Freedom ("Freiheit")
    "freiheit": ["freiheit", "grundrecht", "menschenrecht", "selbstbestimmung", "autonomie", "emanzipation"],
    "diskriminierung": ["diskriminierung", "rassismus", "gleichstellung", "gender", "minderheit", "inklusion", "vielfalt"],
    "bildung": ["bildung", "schule", "universität", "lernen", "kompetenz", "pisa", "chancengleichheit"],
    "gesundheit": ["gesundheit", "krankheit", "prävention", "medizin", "mental health", "ernährung", "allergie"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "asyl", "fachkräfteeinwanderung"],
    # Cross-cutting ("Querschnitt")
    "ohnmacht": ["ohnmacht", "hilflosigkeit", "resignation", "vertrauen", "selbstwirksamkeit", "beteiligung", "engagement"],
    "narrative": ["narrativ", "erzählung", "framing", "kommunikation", "medien", "öffentlichkeit", "diskurs"],
    "generationen": ["generation", "jugend", "alter", "demografie", "zukunft", "intergenerationell", "boomer"],
    "infrastruktur": ["infrastruktur", "bahn", "straße", "brücke", "netz", "glasfaser", "öpnv", "mobilität"],
}
def tag_paragraph(text, taxonomy=None):
    """Assign topic tags to a paragraph based on keyword matching.

    Args:
        text: Paragraph text to classify.
        taxonomy: Mapping of topic slug -> list of lowercase keywords,
            matched as substrings of the lowercased text. Defaults to the
            module-level TOPIC_TAXONOMY, bound lazily at call time rather
            than at definition time — the shared dict is no longer used as
            a default argument, and callers can inject their own taxonomy.

    Returns:
        Up to 5 topic slugs, highest-scoring first; empty list when no
        keyword matches.
    """
    # Late-bind the default (avoids the mutable-default-argument pitfall).
    if taxonomy is None:
        taxonomy = TOPIC_TAXONOMY
    text_lower = text.lower()
    scores = {}
    for topic, keywords in taxonomy.items():
        score = 0
        for kw in keywords:
            # Count occurrences; longer keywords are more specific, so
            # weight them higher (factor 1 + len/10).
            count = text_lower.count(kw)
            if count > 0:
                score += count * (1 + len(kw) / 10)
        if score > 0:
            scores[topic] = score
    if not scores:
        return []
    # Keep only topics scoring within 30% of the best one.
    max_score = max(scores.values())
    threshold = max_score * 0.3
    tags = sorted(
        [(t, s) for t, s in scores.items() if s >= threshold],
        key=lambda x: -x[1]
    )
    return [t for t, s in tags[:5]]  # max 5 tags
def find_crossrefs(tagged_paragraphs, min_shared_tags=2):
    """Find cross-references between paragraphs in different episodes.

    Two paragraphs from *different* episodes are linked when they share at
    least ``min_shared_tags`` topic tags. Each unordered pair is reported
    once; results are sorted by descending overlap score.
    """
    # topic slug -> every (episode_key, paragraph_index) carrying that tag
    by_topic = defaultdict(list)
    for episode, paragraphs in tagged_paragraphs.items():
        for idx, para in enumerate(paragraphs):
            for tag in para["tags"]:
                by_topic[tag].append((episode, idx))

    results = []
    done = set()
    for spots in by_topic.values():
        for pos, (ep_a, ia) in enumerate(spots):
            for ep_b, ib in spots[pos + 1:]:
                if ep_a == ep_b:
                    continue  # same-episode pairs are not cross-references
                key = tuple(sorted([(ep_a, ia), (ep_b, ib)]))
                if key in done:
                    continue  # already emitted via another shared topic
                done.add(key)
                set_a = set(tagged_paragraphs[ep_a][ia]["tags"])
                set_b = set(tagged_paragraphs[ep_b][ib]["tags"])
                common = set_a & set_b
                if len(common) < min_shared_tags:
                    continue
                results.append({
                    "source": {"episode": ep_a, "paragraph": ia},
                    "target": {"episode": ep_b, "paragraph": ib},
                    "shared_topics": sorted(common),
                    # Jaccard-like: shared count over the larger tag set.
                    "score": len(common) / max(len(set_a), len(set_b), 1),
                })
    results.sort(key=lambda r: -r["score"])
    return results
def main():
    """Build data/topics_index.json from data/srt_index.json and publish it."""
    # Optional CLI argument: project root (defaults to the current dir).
    root = sys.argv[1] if len(sys.argv) > 1 else "."
    data_dir = os.path.join(root, "data")

    index_path = os.path.join(data_dir, "srt_index.json")
    if not os.path.exists(index_path):
        print(f"ERROR: {index_path} not found")
        sys.exit(1)
    with open(index_path, "r", encoding="utf-8") as fh:
        episodes = json.load(fh)

    # Tag every paragraph of every episode.
    tagged = {}
    total_tags = 0
    for episode_key, episode in episodes.items():
        entries = []
        for para in episode["paragraphs"]:
            topic_tags = tag_paragraph(para["text"])
            total_tags += len(topic_tags)
            entries.append({
                "start": para["start"],
                "end": para["end"],
                "tags": topic_tags,
                "text_preview": para["text"][:100],
            })
        tagged[episode_key] = entries

    paragraph_count = sum(len(v) for v in tagged.values())
    print(f"Tagged {paragraph_count} paragraphs")
    print(f"Total tags assigned: {total_tags}")

    # Report how often each topic fired.
    tag_counts = defaultdict(int)
    for entries in tagged.values():
        for entry in entries:
            for tag in entry["tags"]:
                tag_counts[tag] += 1
    print("\nTop topics:")
    ranked = sorted(tag_counts.items(), key=lambda kv: -kv[1])
    for tag, count in ranked[:15]:
        print(f" {tag}: {count} paragraphs")

    # Cross-episode links between topically similar paragraphs.
    crossrefs = find_crossrefs(tagged)
    print(f"\nCross-references: {len(crossrefs)} (min 2 shared tags)")

    payload = {
        "taxonomy": {k: {"keywords": v} for k, v in TOPIC_TAXONOMY.items()},
        "tagged_paragraphs": tagged,
        "crossrefs": crossrefs[:500],  # Top 500
    }
    out_path = os.path.join(data_dir, "topics_index.json")
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)

    # Mirror the index into the webapp directory when present.
    webapp_dir = os.path.join(root, "webapp")
    if os.path.isdir(webapp_dir):
        import shutil
        shutil.copy2(out_path, os.path.join(webapp_dir, "topics_index.json"))
    print(f"\nOutput: {out_path}")


if __name__ == "__main__":
    main()