- #2: Topic-Tags pro Transkript-Absatz (27 Themen-Taxonomie), Backlinks zu verwandten Stellen in anderen Episoden. Neues Script: scripts/index_topics.py - #6: Audio-Clip-Export direkt im Browser (Web Audio API → WAV). Kein serverseitiges ffmpeg nötig. - #7: Timeline-Ansicht als Alternative zur Mindmap. Staffeln → Episoden → Zitate auf Zeitachse. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
199 lines
8.1 KiB
Python
199 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Index transcript paragraphs with topic tags for cross-referencing.
|
|
|
|
Reads srt_index.json, assigns topic tags to each paragraph,
|
|
finds cross-references between paragraphs across episodes.
|
|
Outputs topics_index.json for the webapp.
|
|
|
|
This script processes paragraphs in batches and uses simple
|
|
keyword/phrase extraction — no external LLM API needed.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
from difflib import SequenceMatcher
|
|
|
|
# Topic taxonomy — shared vocabulary for cross-podcast matching
|
|
# Topic taxonomy — shared vocabulary for cross-podcast matching.
# Keys are stable topic ids (used in the output JSON); values are lowercase
# German keywords/stems matched as substrings against paragraph text.
TOPIC_TAXONOMY = {
    # Economy
    "wachstum": ["wachstum", "bip", "bruttoinlandsprodukt", "wirtschaftswachstum", "konjunktur", "rezession", "wohlstand"],
    "schulden": ["schulden", "staatsverschuldung", "schuldenbremse", "kredit", "anleihen", "haushalt", "investition"],
    "vermoegen": ["vermögen", "reichtum", "eigentum", "besitz", "erbschaft", "ungleichheit", "verteilung", "kodierung"],
    "steuern": ["steuer", "erbschaftssteuer", "vermögensteuer", "kapitalertrag", "steuergerechtigkeit", "umverteilung"],
    "markt": ["markt", "marktwirtschaft", "kapitalismus", "neoliberal", "deregulierung", "privatisierung", "wettbewerb"],
    "innovation": ["innovation", "forschung", "technologie", "digitalisierung", "transformation", "start-up", "patent"],
    "arbeit": ["arbeit", "beschäftigung", "arbeitsmarkt", "fachkräfte", "mindestlohn", "gewerkschaft", "care-arbeit"],

    # Security
    "klimakrise": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "paris", "erneuerbar", "fossil", "ökologisch"],
    "geopolitik": ["geopolitik", "nato", "russland", "china", "europa", "sicherheitspolitik", "verteidigung", "aufrüstung", "krieg"],
    "soziale_sicherheit": ["sozial", "sicherheit", "rente", "pflege", "gesundheit", "daseinsvorsorge", "sozialstaat"],
    "digitale_sicherheit": ["plattform", "monopol", "google", "meta", "tiktok", "algorithmus", "datenschutz", "überwachung"],

    # Democracy
    "demokratie": ["demokratie", "wahl", "parlament", "regierung", "rechtsstaat", "gewaltenteilung", "verfassung", "grundgesetz"],
    "macht": ["macht", "herrschaft", "elite", "oligarchie", "lobbyismus", "einfluss", "gestaltung"],
    "polarisierung": ["polarisierung", "spaltung", "populismus", "radikalisierung", "filterblas", "desinformation", "fake"],
    "buerokratie": ["bürokratie", "verwaltung", "regulierung", "föderalismus", "kommune", "behörde"],
    "kompromiss": ["kompromiss", "verhandlung", "koalition", "konsens", "diskurs", "debatte", "streit"],
    "extremismus": ["extremismus", "faschismus", "rechtsextrem", "autoritär", "rassismus", "antisemitismus", "gewalt"],

    # Freedom
    "freiheit": ["freiheit", "grundrecht", "menschenrecht", "selbstbestimmung", "autonomie", "emanzipation"],
    "diskriminierung": ["diskriminierung", "rassismus", "gleichstellung", "gender", "minderheit", "inklusion", "vielfalt"],
    "bildung": ["bildung", "schule", "universität", "lernen", "kompetenz", "pisa", "chancengleichheit"],
    "gesundheit": ["gesundheit", "krankheit", "prävention", "medizin", "mental health", "ernährung", "allergie"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "asyl", "fachkräfteeinwanderung"],

    # Cross-cutting
    "ohnmacht": ["ohnmacht", "hilflosigkeit", "resignation", "vertrauen", "selbstwirksamkeit", "beteiligung", "engagement"],
    "narrative": ["narrativ", "erzählung", "framing", "kommunikation", "medien", "öffentlichkeit", "diskurs"],
    "generationen": ["generation", "jugend", "alter", "demografie", "zukunft", "intergenerationell", "boomer"],
    "infrastruktur": ["infrastruktur", "bahn", "straße", "brücke", "netz", "glasfaser", "öpnv", "mobilität"],
}


def tag_paragraph(text, taxonomy=TOPIC_TAXONOMY):
    """Assign up to five topic tags to a paragraph via weighted keyword matching.

    Each topic is scored by counting substring occurrences of its keywords in
    the lowercased text; longer keywords weigh more (1 + len/10 per hit).
    Topics scoring at least 30% of the best topic's score are kept, strongest
    first, capped at five.

    Returns a (possibly empty) list of topic ids from *taxonomy*.
    """
    haystack = text.lower()

    # topic id -> accumulated keyword score (only topics with at least one hit)
    weights = {}
    for topic, vocabulary in taxonomy.items():
        total = sum(
            haystack.count(term) * (1 + len(term) / 10)
            for term in vocabulary
            if term in haystack
        )
        if total > 0:
            weights[topic] = total

    if not weights:
        return []

    # Rank all hit topics by descending score (stable: taxonomy order on ties),
    # then keep those within 30% of the best, at most five.
    cutoff = 0.3 * max(weights.values())
    ranked = sorted(weights, key=lambda topic: -weights[topic])
    return [topic for topic in ranked if weights[topic] >= cutoff][:5]
|
|
|
|
|
|
def find_crossrefs(tagged_paragraphs, min_shared_tags=2):
    """Find cross-references between paragraphs of *different* episodes.

    *tagged_paragraphs* maps episode key -> list of paragraph dicts, each
    carrying a "tags" list. Two paragraphs are linked when they share at
    least *min_shared_tags* topics. Each link is emitted once with a score
    of |shared| / max(|tags1|, |tags2|), and the result is sorted by
    descending score.
    """
    # Invert the tagging: topic -> every (episode, paragraph index) using it.
    by_topic = defaultdict(list)
    for episode, paragraphs in tagged_paragraphs.items():
        for idx, para in enumerate(paragraphs):
            for tag in para["tags"]:
                by_topic[tag].append((episode, idx))

    results = []
    visited = set()  # canonical (loc_a, loc_b) pairs already emitted/checked
    for locations in by_topic.values():
        for a, loc_a in enumerate(locations):
            for loc_b in locations[a + 1:]:
                if loc_a[0] == loc_b[0]:
                    continue  # ignore links within the same episode

                # Canonical ordering so each unordered pair is seen once,
                # even when it shares several topics.
                key = (loc_a, loc_b) if loc_a <= loc_b else (loc_b, loc_a)
                if key in visited:
                    continue
                visited.add(key)

                ep_a, idx_a = loc_a
                ep_b, idx_b = loc_b
                tags_a = set(tagged_paragraphs[ep_a][idx_a]["tags"])
                tags_b = set(tagged_paragraphs[ep_b][idx_b]["tags"])
                common = tags_a & tags_b
                if len(common) < min_shared_tags:
                    continue

                results.append({
                    "source": {"episode": ep_a, "paragraph": idx_a},
                    "target": {"episode": ep_b, "paragraph": idx_b},
                    "shared_topics": sorted(common),
                    # Overlap relative to the larger tag set; max(..., 1)
                    # guards against empty tag lists.
                    "score": len(common) / max(len(tags_a), len(tags_b), 1),
                })

    # Strongest links first (stable sort keeps discovery order on ties).
    results.sort(key=lambda ref: -ref["score"])
    return results
|
|
|
|
|
|
def main():
    """CLI entry point: build topics_index.json from the transcript index.

    Usage: index_topics.py [project_dir]   (project_dir defaults to ".")

    Reads <project_dir>/data/srt_index.json, tags every paragraph with
    tag_paragraph(), computes cross-episode references, writes
    <project_dir>/data/topics_index.json, and copies the result into
    <project_dir>/webapp/ when that directory exists. Exits with status 1
    if the input index is missing.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    data_dir = os.path.join(project_dir, "data")

    # Load transcript index
    srt_path = os.path.join(data_dir, "srt_index.json")
    if not os.path.exists(srt_path):
        # Fix: diagnostics go to stderr so stdout stays clean for piping.
        print(f"ERROR: {srt_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(srt_path, "r", encoding="utf-8") as f:
        srt_data = json.load(f)

    # Tag all paragraphs
    tagged = {}
    total_tags = 0

    for ep_key, ep_data in srt_data.items():
        tagged[ep_key] = []
        for p in ep_data["paragraphs"]:
            tags = tag_paragraph(p["text"])
            tagged[ep_key].append({
                "start": p["start"],
                "end": p["end"],
                "tags": tags,
                # Short snippet so the webapp can preview without full text.
                "text_preview": p["text"][:100]
            })
            total_tags += len(tags)

    print(f"Tagged {sum(len(v) for v in tagged.values())} paragraphs")
    print(f"Total tags assigned: {total_tags}")

    # Show tag distribution
    tag_counts = defaultdict(int)
    for ep_paras in tagged.values():
        for p in ep_paras:
            for t in p["tags"]:
                tag_counts[t] += 1

    print("\nTop topics:")
    for tag, count in sorted(tag_counts.items(), key=lambda x: -x[1])[:15]:
        print(f"  {tag}: {count} paragraphs")

    # Find cross-references (default: at least 2 shared tags)
    crossrefs = find_crossrefs(tagged)
    print(f"\nCross-references: {len(crossrefs)} (min 2 shared tags)")

    # Output: taxonomy + per-paragraph tags + strongest cross-references
    output = {
        "taxonomy": {k: {"keywords": v} for k, v in TOPIC_TAXONOMY.items()},
        "tagged_paragraphs": tagged,
        "crossrefs": crossrefs[:500],  # Top 500
    }

    output_path = os.path.join(data_dir, "topics_index.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    # Copy into the webapp directory so it can be served statically.
    webapp_dir = os.path.join(project_dir, "webapp")
    if os.path.isdir(webapp_dir):
        import shutil  # local import: only needed on this optional path
        shutil.copy2(output_path, os.path.join(webapp_dir, "topics_index.json"))

    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()
|