- #2: Topic-Tags pro Transkript-Absatz (27 Themen-Taxonomie), Backlinks zu verwandten Stellen in anderen Episoden. Neues Script: scripts/index_topics.py - #6: Audio-Clip-Export direkt im Browser (Web Audio API → WAV). Kein serverseitiges ffmpeg nötig. - #7: Timeline-Ansicht als Alternative zur Mindmap. Staffeln → Episoden → Zitate auf Zeitachse. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
199 lines
8.1 KiB
Python
199 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Index transcript paragraphs with topic tags for cross-referencing.
|
|
|
|
Reads srt_index.json, assigns topic tags to each paragraph,
|
|
finds cross-references between paragraphs across episodes.
|
|
Outputs topics_index.json for the webapp.
|
|
|
|
This script processes paragraphs in batches and uses simple
|
|
keyword/phrase extraction — no external LLM API needed.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
from difflib import SequenceMatcher
|
|
|
|
# Topic taxonomy — shared vocabulary for cross-podcast matching
|
|
# Topic taxonomy — shared vocabulary for cross-podcast matching.
# Keys are stable topic ids (used in the output JSON); values are lowercase
# German keywords/stems matched as substrings against paragraph text.
TOPIC_TAXONOMY = {
    # Economy
    "wachstum": ["wachstum", "bip", "bruttoinlandsprodukt", "wirtschaftswachstum", "konjunktur", "rezession", "wohlstand"],
    "schulden": ["schulden", "staatsverschuldung", "schuldenbremse", "kredit", "anleihen", "haushalt", "investition"],
    "vermoegen": ["vermögen", "reichtum", "eigentum", "besitz", "erbschaft", "ungleichheit", "verteilung", "kodierung"],
    "steuern": ["steuer", "erbschaftssteuer", "vermögensteuer", "kapitalertrag", "steuergerechtigkeit", "umverteilung"],
    "markt": ["markt", "marktwirtschaft", "kapitalismus", "neoliberal", "deregulierung", "privatisierung", "wettbewerb"],
    "innovation": ["innovation", "forschung", "technologie", "digitalisierung", "transformation", "start-up", "patent"],
    "arbeit": ["arbeit", "beschäftigung", "arbeitsmarkt", "fachkräfte", "mindestlohn", "gewerkschaft", "care-arbeit"],

    # Security
    "klimakrise": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "paris", "erneuerbar", "fossil", "ökologisch"],
    "geopolitik": ["geopolitik", "nato", "russland", "china", "europa", "sicherheitspolitik", "verteidigung", "aufrüstung", "krieg"],
    "soziale_sicherheit": ["sozial", "sicherheit", "rente", "pflege", "gesundheit", "daseinsvorsorge", "sozialstaat"],
    "digitale_sicherheit": ["plattform", "monopol", "google", "meta", "tiktok", "algorithmus", "datenschutz", "überwachung"],

    # Democracy
    "demokratie": ["demokratie", "wahl", "parlament", "regierung", "rechtsstaat", "gewaltenteilung", "verfassung", "grundgesetz"],
    "macht": ["macht", "herrschaft", "elite", "oligarchie", "lobbyismus", "einfluss", "gestaltung"],
    "polarisierung": ["polarisierung", "spaltung", "populismus", "radikalisierung", "filterblas", "desinformation", "fake"],
    "buerokratie": ["bürokratie", "verwaltung", "regulierung", "föderalismus", "kommune", "behörde"],
    "kompromiss": ["kompromiss", "verhandlung", "koalition", "konsens", "diskurs", "debatte", "streit"],
    "extremismus": ["extremismus", "faschismus", "rechtsextrem", "autoritär", "rassismus", "antisemitismus", "gewalt"],

    # Freedom
    "freiheit": ["freiheit", "grundrecht", "menschenrecht", "selbstbestimmung", "autonomie", "emanzipation"],
    "diskriminierung": ["diskriminierung", "rassismus", "gleichstellung", "gender", "minderheit", "inklusion", "vielfalt"],
    "bildung": ["bildung", "schule", "universität", "lernen", "kompetenz", "pisa", "chancengleichheit"],
    "gesundheit": ["gesundheit", "krankheit", "prävention", "medizin", "mental health", "ernährung", "allergie"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "asyl", "fachkräfteeinwanderung"],

    # Cross-cutting
    "ohnmacht": ["ohnmacht", "hilflosigkeit", "resignation", "vertrauen", "selbstwirksamkeit", "beteiligung", "engagement"],
    "narrative": ["narrativ", "erzählung", "framing", "kommunikation", "medien", "öffentlichkeit", "diskurs"],
    "generationen": ["generation", "jugend", "alter", "demografie", "zukunft", "intergenerationell", "boomer"],
    "infrastruktur": ["infrastruktur", "bahn", "straße", "brücke", "netz", "glasfaser", "öpnv", "mobilität"],
}


def tag_paragraph(text, taxonomy=TOPIC_TAXONOMY):
    """Assign up to five topic tags to a paragraph via weighted keyword matching.

    Each topic is scored by counting substring occurrences of its keywords in
    the lowercased text; longer keywords weigh more (1 + len/10 per hit).
    Topics scoring at least 30% of the best topic's score are kept, strongest
    first, capped at five.

    Returns a (possibly empty) list of topic ids from *taxonomy*.
    """
    haystack = text.lower()

    # topic id -> accumulated keyword score (only topics with at least one hit)
    weights = {}
    for topic, vocabulary in taxonomy.items():
        total = sum(
            haystack.count(term) * (1 + len(term) / 10)
            for term in vocabulary
            if term in haystack
        )
        if total > 0:
            weights[topic] = total

    if not weights:
        return []

    # Rank all hit topics by descending score (stable: taxonomy order on ties),
    # then keep those within 30% of the best, at most five.
    cutoff = 0.3 * max(weights.values())
    ranked = sorted(weights, key=lambda topic: -weights[topic])
    return [topic for topic in ranked if weights[topic] >= cutoff][:5]
|
|
|
|
|
|
def find_crossrefs(tagged_paragraphs, min_shared_tags=2):
    """Find cross-references between paragraphs of *different* episodes.

    *tagged_paragraphs* maps episode key -> list of paragraph dicts, each
    carrying a "tags" list. Two paragraphs are linked when they share at
    least *min_shared_tags* topics. Each link is emitted once with a score
    of |shared| / max(|tags1|, |tags2|), and the result is sorted by
    descending score.
    """
    # Invert the tagging: topic -> every (episode, paragraph index) using it.
    by_topic = defaultdict(list)
    for episode, paragraphs in tagged_paragraphs.items():
        for idx, para in enumerate(paragraphs):
            for tag in para["tags"]:
                by_topic[tag].append((episode, idx))

    results = []
    visited = set()  # canonical (loc_a, loc_b) pairs already emitted/checked
    for locations in by_topic.values():
        for a, loc_a in enumerate(locations):
            for loc_b in locations[a + 1:]:
                if loc_a[0] == loc_b[0]:
                    continue  # ignore links within the same episode

                # Canonical ordering so each unordered pair is seen once,
                # even when it shares several topics.
                key = (loc_a, loc_b) if loc_a <= loc_b else (loc_b, loc_a)
                if key in visited:
                    continue
                visited.add(key)

                ep_a, idx_a = loc_a
                ep_b, idx_b = loc_b
                tags_a = set(tagged_paragraphs[ep_a][idx_a]["tags"])
                tags_b = set(tagged_paragraphs[ep_b][idx_b]["tags"])
                common = tags_a & tags_b
                if len(common) < min_shared_tags:
                    continue

                results.append({
                    "source": {"episode": ep_a, "paragraph": idx_a},
                    "target": {"episode": ep_b, "paragraph": idx_b},
                    "shared_topics": sorted(common),
                    # Overlap relative to the larger tag set; max(..., 1)
                    # guards against empty tag lists.
                    "score": len(common) / max(len(tags_a), len(tags_b), 1),
                })

    # Strongest links first (stable sort keeps discovery order on ties).
    results.sort(key=lambda ref: -ref["score"])
    return results
|
|
|
|
|
|
def main():
    """CLI entry point: build topics_index.json from the transcript index.

    Usage: index_topics.py [project_dir]   (project_dir defaults to ".")

    Reads <project_dir>/data/srt_index.json, tags every paragraph with
    tag_paragraph(), computes cross-episode references, writes
    <project_dir>/data/topics_index.json, and copies the result into
    <project_dir>/webapp/ when that directory exists. Exits with status 1
    if the input index is missing.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    data_dir = os.path.join(project_dir, "data")

    # Load transcript index
    srt_path = os.path.join(data_dir, "srt_index.json")
    if not os.path.exists(srt_path):
        # Fix: diagnostics go to stderr so stdout stays clean for piping.
        print(f"ERROR: {srt_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(srt_path, "r", encoding="utf-8") as f:
        srt_data = json.load(f)

    # Tag all paragraphs
    tagged = {}
    total_tags = 0

    for ep_key, ep_data in srt_data.items():
        tagged[ep_key] = []
        for p in ep_data["paragraphs"]:
            tags = tag_paragraph(p["text"])
            tagged[ep_key].append({
                "start": p["start"],
                "end": p["end"],
                "tags": tags,
                # Short snippet so the webapp can preview without full text.
                "text_preview": p["text"][:100]
            })
            total_tags += len(tags)

    print(f"Tagged {sum(len(v) for v in tagged.values())} paragraphs")
    print(f"Total tags assigned: {total_tags}")

    # Show tag distribution
    tag_counts = defaultdict(int)
    for ep_paras in tagged.values():
        for p in ep_paras:
            for t in p["tags"]:
                tag_counts[t] += 1

    print("\nTop topics:")
    for tag, count in sorted(tag_counts.items(), key=lambda x: -x[1])[:15]:
        print(f"  {tag}: {count} paragraphs")

    # Find cross-references (default: at least 2 shared tags)
    crossrefs = find_crossrefs(tagged)
    print(f"\nCross-references: {len(crossrefs)} (min 2 shared tags)")

    # Output: taxonomy + per-paragraph tags + strongest cross-references
    output = {
        "taxonomy": {k: {"keywords": v} for k, v in TOPIC_TAXONOMY.items()},
        "tagged_paragraphs": tagged,
        "crossrefs": crossrefs[:500],  # Top 500
    }

    output_path = os.path.join(data_dir, "topics_index.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    # Copy into the webapp directory so it can be served statically.
    webapp_dir = os.path.join(project_dir, "webapp")
    if os.path.isdir(webapp_dir):
        import shutil  # local import: only needed on this optional path
        shutil.copy2(output_path, os.path.join(webapp_dir, "topics_index.json"))

    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()
|