#!/usr/bin/env python3
"""Index transcript paragraphs with topic tags for cross-referencing.
Reads srt_index.json, assigns topic tags to each paragraph,
finds cross-references between paragraphs across episodes.
Outputs topics_index.json for the webapp.
This script processes paragraphs in batches and uses simple
keyword/phrase extraction — no external LLM API needed.
"""
import json
import os
import re
import sys
from collections import defaultdict
from difflib import SequenceMatcher
# Topic taxonomy — shared vocabulary for cross-podcast matching.
# Maps a topic slug to the lowercase German keywords/stems that trigger it.
# tag_paragraph() matches these as plain substrings of the lowercased text,
# so stems like "filterblas" also hit "filterblase"/"filterblasen".
TOPIC_TAXONOMY = {
    # Economy ("Wirtschaft")
    "wachstum": ["wachstum", "bip", "bruttoinlandsprodukt", "wirtschaftswachstum", "konjunktur", "rezession", "wohlstand"],
    "schulden": ["schulden", "staatsverschuldung", "schuldenbremse", "kredit", "anleihen", "haushalt", "investition"],
    "vermoegen": ["vermögen", "reichtum", "eigentum", "besitz", "erbschaft", "ungleichheit", "verteilung", "kodierung"],
    "steuern": ["steuer", "erbschaftssteuer", "vermögensteuer", "kapitalertrag", "steuergerechtigkeit", "umverteilung"],
    "markt": ["markt", "marktwirtschaft", "kapitalismus", "neoliberal", "deregulierung", "privatisierung", "wettbewerb"],
    "innovation": ["innovation", "forschung", "technologie", "digitalisierung", "transformation", "start-up", "patent"],
    "arbeit": ["arbeit", "beschäftigung", "arbeitsmarkt", "fachkräfte", "mindestlohn", "gewerkschaft", "care-arbeit"],
    # Security ("Sicherheit")
    "klimakrise": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "paris", "erneuerbar", "fossil", "ökologisch"],
    "geopolitik": ["geopolitik", "nato", "russland", "china", "europa", "sicherheitspolitik", "verteidigung", "aufrüstung", "krieg"],
    "soziale_sicherheit": ["sozial", "sicherheit", "rente", "pflege", "gesundheit", "daseinsvorsorge", "sozialstaat"],
    "digitale_sicherheit": ["plattform", "monopol", "google", "meta", "tiktok", "algorithmus", "datenschutz", "überwachung"],
    # Democracy ("Demokratie")
    "demokratie": ["demokratie", "wahl", "parlament", "regierung", "rechtsstaat", "gewaltenteilung", "verfassung", "grundgesetz"],
    "macht": ["macht", "herrschaft", "elite", "oligarchie", "lobbyismus", "einfluss", "gestaltung"],
    "polarisierung": ["polarisierung", "spaltung", "populismus", "radikalisierung", "filterblas", "desinformation", "fake"],
    "buerokratie": ["bürokratie", "verwaltung", "regulierung", "föderalismus", "kommune", "behörde"],
    "kompromiss": ["kompromiss", "verhandlung", "koalition", "konsens", "diskurs", "debatte", "streit"],
    "extremismus": ["extremismus", "faschismus", "rechtsextrem", "autoritär", "rassismus", "antisemitismus", "gewalt"],
    # Freedom ("Freiheit")
    "freiheit": ["freiheit", "grundrecht", "menschenrecht", "selbstbestimmung", "autonomie", "emanzipation"],
    "diskriminierung": ["diskriminierung", "rassismus", "gleichstellung", "gender", "minderheit", "inklusion", "vielfalt"],
    "bildung": ["bildung", "schule", "universität", "lernen", "kompetenz", "pisa", "chancengleichheit"],
    "gesundheit": ["gesundheit", "krankheit", "prävention", "medizin", "mental health", "ernährung", "allergie"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "asyl", "fachkräfteeinwanderung"],
    # Cross-cutting ("Querschnitt")
    "ohnmacht": ["ohnmacht", "hilflosigkeit", "resignation", "vertrauen", "selbstwirksamkeit", "beteiligung", "engagement"],
    "narrative": ["narrativ", "erzählung", "framing", "kommunikation", "medien", "öffentlichkeit", "diskurs"],
    "generationen": ["generation", "jugend", "alter", "demografie", "zukunft", "intergenerationell", "boomer"],
    "infrastruktur": ["infrastruktur", "bahn", "straße", "brücke", "netz", "glasfaser", "öpnv", "mobilität"],
}
def tag_paragraph(text, taxonomy=None):
    """Assign topic tags to a paragraph based on keyword matching.

    Args:
        text: Paragraph text to classify.
        taxonomy: Mapping of topic slug -> list of lowercase keywords,
            matched as substrings of the lowercased text. Defaults to the
            module-level TOPIC_TAXONOMY, bound lazily at call time rather
            than at definition time — the shared dict is no longer used as
            a default argument, and callers can inject their own taxonomy.

    Returns:
        Up to 5 topic slugs, highest-scoring first; empty list when no
        keyword matches.
    """
    # Late-bind the default (avoids the mutable-default-argument pitfall).
    if taxonomy is None:
        taxonomy = TOPIC_TAXONOMY
    text_lower = text.lower()
    scores = {}
    for topic, keywords in taxonomy.items():
        score = 0
        for kw in keywords:
            # Count occurrences; longer keywords are more specific, so
            # weight them higher (factor 1 + len/10).
            count = text_lower.count(kw)
            if count > 0:
                score += count * (1 + len(kw) / 10)
        if score > 0:
            scores[topic] = score
    if not scores:
        return []
    # Keep only topics scoring within 30% of the best one.
    max_score = max(scores.values())
    threshold = max_score * 0.3
    tags = sorted(
        [(t, s) for t, s in scores.items() if s >= threshold],
        key=lambda x: -x[1]
    )
    return [t for t, s in tags[:5]]  # max 5 tags
def find_crossrefs(tagged_paragraphs, min_shared_tags=2):
    """Find cross-references between paragraphs in different episodes.

    Two paragraphs from *different* episodes are linked when they share at
    least ``min_shared_tags`` topic tags. Each unordered pair is reported
    once; results are sorted by descending overlap score.
    """
    # topic slug -> every (episode_key, paragraph_index) carrying that tag
    by_topic = defaultdict(list)
    for episode, paragraphs in tagged_paragraphs.items():
        for idx, para in enumerate(paragraphs):
            for tag in para["tags"]:
                by_topic[tag].append((episode, idx))

    results = []
    done = set()
    for spots in by_topic.values():
        for pos, (ep_a, ia) in enumerate(spots):
            for ep_b, ib in spots[pos + 1:]:
                if ep_a == ep_b:
                    continue  # same-episode pairs are not cross-references
                key = tuple(sorted([(ep_a, ia), (ep_b, ib)]))
                if key in done:
                    continue  # already emitted via another shared topic
                done.add(key)
                set_a = set(tagged_paragraphs[ep_a][ia]["tags"])
                set_b = set(tagged_paragraphs[ep_b][ib]["tags"])
                common = set_a & set_b
                if len(common) < min_shared_tags:
                    continue
                results.append({
                    "source": {"episode": ep_a, "paragraph": ia},
                    "target": {"episode": ep_b, "paragraph": ib},
                    "shared_topics": sorted(common),
                    # Jaccard-like: shared count over the larger tag set.
                    "score": len(common) / max(len(set_a), len(set_b), 1),
                })
    results.sort(key=lambda r: -r["score"])
    return results
def main():
    """Build data/topics_index.json from data/srt_index.json and publish it."""
    # Optional CLI argument: project root (defaults to the current dir).
    root = sys.argv[1] if len(sys.argv) > 1 else "."
    data_dir = os.path.join(root, "data")

    index_path = os.path.join(data_dir, "srt_index.json")
    if not os.path.exists(index_path):
        print(f"ERROR: {index_path} not found")
        sys.exit(1)
    with open(index_path, "r", encoding="utf-8") as fh:
        episodes = json.load(fh)

    # Tag every paragraph of every episode.
    tagged = {}
    total_tags = 0
    for episode_key, episode in episodes.items():
        entries = []
        for para in episode["paragraphs"]:
            topic_tags = tag_paragraph(para["text"])
            total_tags += len(topic_tags)
            entries.append({
                "start": para["start"],
                "end": para["end"],
                "tags": topic_tags,
                "text_preview": para["text"][:100],
            })
        tagged[episode_key] = entries

    paragraph_count = sum(len(v) for v in tagged.values())
    print(f"Tagged {paragraph_count} paragraphs")
    print(f"Total tags assigned: {total_tags}")

    # Report how often each topic fired.
    tag_counts = defaultdict(int)
    for entries in tagged.values():
        for entry in entries:
            for tag in entry["tags"]:
                tag_counts[tag] += 1
    print("\nTop topics:")
    ranked = sorted(tag_counts.items(), key=lambda kv: -kv[1])
    for tag, count in ranked[:15]:
        print(f" {tag}: {count} paragraphs")

    # Cross-episode links between topically similar paragraphs.
    crossrefs = find_crossrefs(tagged)
    print(f"\nCross-references: {len(crossrefs)} (min 2 shared tags)")

    payload = {
        "taxonomy": {k: {"keywords": v} for k, v in TOPIC_TAXONOMY.items()},
        "tagged_paragraphs": tagged,
        "crossrefs": crossrefs[:500],  # Top 500
    }
    out_path = os.path.join(data_dir, "topics_index.json")
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)

    # Mirror the index into the webapp directory when present.
    webapp_dir = os.path.join(root, "webapp")
    if os.path.isdir(webapp_dir):
        import shutil
        shutil.copy2(out_path, os.path.join(webapp_dir, "topics_index.json"))
    print(f"\nOutput: {out_path}")


if __name__ == "__main__":
    main()