diff --git a/scripts/index_topics.py b/scripts/index_topics.py
new file mode 100644
index 0000000..61df157
--- /dev/null
+++ b/scripts/index_topics.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""Index transcript paragraphs with topic tags for cross-referencing.
+
+Reads srt_index.json, assigns topic tags to each paragraph,
+finds cross-references between paragraphs across episodes.
+Outputs topics_index.json for the webapp.
+
+This script processes paragraphs in batches and uses simple
+keyword/phrase extraction — no external LLM API needed.
+"""
+
+import json
+import os
+import re
+import sys
+from collections import defaultdict
+from difflib import SequenceMatcher
+
+# Topic taxonomy — shared vocabulary for cross-podcast matching
# Maps topic id -> list of lowercase keyword fragments. Matching (see
# tag_paragraph) is a case-insensitive SUBSTRING search, so short stems like
# "steuer" intentionally also hit German compounds ("Erbschaftssteuer").
TOPIC_TAXONOMY = {
    # Economy (Wirtschaft)
    "wachstum": ["wachstum", "bip", "bruttoinlandsprodukt", "wirtschaftswachstum", "konjunktur", "rezession", "wohlstand"],
    "schulden": ["schulden", "staatsverschuldung", "schuldenbremse", "kredit", "anleihen", "haushalt", "investition"],
    "vermoegen": ["vermögen", "reichtum", "eigentum", "besitz", "erbschaft", "ungleichheit", "verteilung", "kodierung"],
    "steuern": ["steuer", "erbschaftssteuer", "vermögensteuer", "kapitalertrag", "steuergerechtigkeit", "umverteilung"],
    "markt": ["markt", "marktwirtschaft", "kapitalismus", "neoliberal", "deregulierung", "privatisierung", "wettbewerb"],
    "innovation": ["innovation", "forschung", "technologie", "digitalisierung", "transformation", "start-up", "patent"],
    "arbeit": ["arbeit", "beschäftigung", "arbeitsmarkt", "fachkräfte", "mindestlohn", "gewerkschaft", "care-arbeit"],

    # Security (Sicherheit)
    "klimakrise": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "paris", "erneuerbar", "fossil", "ökologisch"],
    "geopolitik": ["geopolitik", "nato", "russland", "china", "europa", "sicherheitspolitik", "verteidigung", "aufrüstung", "krieg"],
    "soziale_sicherheit": ["sozial", "sicherheit", "rente", "pflege", "gesundheit", "daseinsvorsorge", "sozialstaat"],
    "digitale_sicherheit": ["plattform", "monopol", "google", "meta", "tiktok", "algorithmus", "datenschutz", "überwachung"],

    # Democracy (Demokratie)
    "demokratie": ["demokratie", "wahl", "parlament", "regierung", "rechtsstaat", "gewaltenteilung", "verfassung", "grundgesetz"],
    "macht": ["macht", "herrschaft", "elite", "oligarchie", "lobbyismus", "einfluss", "gestaltung"],
    "polarisierung": ["polarisierung", "spaltung", "populismus", "radikalisierung", "filterblas", "desinformation", "fake"],
    "buerokratie": ["bürokratie", "verwaltung", "regulierung", "föderalismus", "kommune", "behörde"],
    "kompromiss": ["kompromiss", "verhandlung", "koalition", "konsens", "diskurs", "debatte", "streit"],
    "extremismus": ["extremismus", "faschismus", "rechtsextrem", "autoritär", "rassismus", "antisemitismus", "gewalt"],

    # Freedom (Freiheit)
    "freiheit": ["freiheit", "grundrecht", "menschenrecht", "selbstbestimmung", "autonomie", "emanzipation"],
    "diskriminierung": ["diskriminierung", "rassismus", "gleichstellung", "gender", "minderheit", "inklusion", "vielfalt"],
    "bildung": ["bildung", "schule", "universität", "lernen", "kompetenz", "pisa", "chancengleichheit"],
    "gesundheit": ["gesundheit", "krankheit", "prävention", "medizin", "mental health", "ernährung", "allergie"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "asyl", "fachkräfteeinwanderung"],

    # Cross-cutting themes (Querschnitt)
    "ohnmacht": ["ohnmacht", "hilflosigkeit", "resignation", "vertrauen", "selbstwirksamkeit", "beteiligung", "engagement"],
    "narrative": ["narrativ", "erzählung", "framing", "kommunikation", "medien", "öffentlichkeit", "diskurs"],
    "generationen": ["generation", "jugend", "alter", "demografie", "zukunft", "intergenerationell", "boomer"],
    "infrastruktur": ["infrastruktur", "bahn", "straße", "brücke", "netz", "glasfaser", "öpnv", "mobilität"],
}
+
+
+def tag_paragraph(text, taxonomy=TOPIC_TAXONOMY):
+ """Assign topic tags to a paragraph based on keyword matching."""
+ text_lower = text.lower()
+ scores = {}
+
+ for topic, keywords in taxonomy.items():
+ score = 0
+ for kw in keywords:
+ # Count occurrences, weight longer keywords higher
+ count = text_lower.count(kw)
+ if count > 0:
+ score += count * (1 + len(kw) / 10)
+ if score > 0:
+ scores[topic] = score
+
+ # Return top tags (normalized score > threshold)
+ if not scores:
+ return []
+
+ max_score = max(scores.values())
+ threshold = max_score * 0.3
+ tags = sorted(
+ [(t, s) for t, s in scores.items() if s >= threshold],
+ key=lambda x: -x[1]
+ )
+ return [t for t, s in tags[:5]] # max 5 tags
+
+
def find_crossrefs(tagged_paragraphs, min_shared_tags=2):
    """Find cross-references between paragraphs of different episodes.

    Two paragraphs are linked when they belong to different episodes and
    share at least ``min_shared_tags`` topic tags. Each unordered pair is
    reported once; results are sorted by descending score, where score is
    the shared-tag count divided by the larger tag set's size.
    """
    # topic id -> every (episode_key, paragraph_index) carrying that tag
    by_topic = defaultdict(list)
    for episode, paragraphs in tagged_paragraphs.items():
        for index, paragraph in enumerate(paragraphs):
            for tag in paragraph["tags"]:
                by_topic[tag].append((episode, index))

    results = []
    handled = set()
    for locations in by_topic.values():
        for pos, loc_a in enumerate(locations):
            for loc_b in locations[pos + 1:]:
                ep_a, idx_a = loc_a
                ep_b, idx_b = loc_b
                # Only link across episodes.
                if ep_a == ep_b:
                    continue
                # Canonical ordering so the same pair reached via another
                # topic is processed only once.
                key = (loc_a, loc_b) if loc_a <= loc_b else (loc_b, loc_a)
                if key in handled:
                    continue
                handled.add(key)

                tags_a = set(tagged_paragraphs[ep_a][idx_a]["tags"])
                tags_b = set(tagged_paragraphs[ep_b][idx_b]["tags"])
                common = tags_a & tags_b
                if len(common) < min_shared_tags:
                    continue

                results.append({
                    "source": {"episode": ep_a, "paragraph": idx_a},
                    "target": {"episode": ep_b, "paragraph": idx_b},
                    "shared_topics": sorted(common),
                    "score": len(common) / max(len(tags_a), len(tags_b), 1),
                })

    results.sort(key=lambda ref: -ref["score"])
    return results
+
+
def main():
    """Tag all transcript paragraphs and write topics_index.json.

    Usage: index_topics.py [project_dir]

    Reads <project_dir>/data/srt_index.json, tags every paragraph,
    computes cross-references, writes <project_dir>/data/topics_index.json
    and copies it into <project_dir>/webapp/ when that directory exists.
    Exits with status 1 when the transcript index is missing.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    data_dir = os.path.join(project_dir, "data")

    # Load transcript index produced by the SRT parsing step.
    srt_path = os.path.join(data_dir, "srt_index.json")
    if not os.path.exists(srt_path):
        # Diagnostics go to stderr so stdout stays clean for piping.
        print(f"ERROR: {srt_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(srt_path, "r", encoding="utf-8") as f:
        srt_data = json.load(f)

    # Tag every paragraph of every episode.
    tagged = {}
    total_tags = 0

    for ep_key, ep_data in srt_data.items():
        tagged[ep_key] = []
        for p in ep_data["paragraphs"]:
            tags = tag_paragraph(p["text"])
            tagged[ep_key].append({
                "start": p["start"],
                "end": p["end"],
                "tags": tags,
                # Short excerpt so the webapp can preview without the full SRT.
                "text_preview": p["text"][:100]
            })
            total_tags += len(tags)

    print(f"Tagged {sum(len(v) for v in tagged.values())} paragraphs")
    print(f"Total tags assigned: {total_tags}")

    # Show tag distribution, most frequent first.
    tag_counts = defaultdict(int)
    for ep_paras in tagged.values():
        for p in ep_paras:
            for t in p["tags"]:
                tag_counts[t] += 1

    print("\nTop topics:")
    for tag, count in sorted(tag_counts.items(), key=lambda x: -x[1])[:15]:
        print(f"  {tag}: {count} paragraphs")

    # Find cross-references across episodes.
    crossrefs = find_crossrefs(tagged)
    print(f"\nCross-references: {len(crossrefs)} (min 2 shared tags)")

    output = {
        "taxonomy": {k: {"keywords": v} for k, v in TOPIC_TAXONOMY.items()},
        "tagged_paragraphs": tagged,
        "crossrefs": crossrefs[:500],  # cap payload size for the webapp
    }

    output_path = os.path.join(data_dir, "topics_index.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    # Best-effort copy so the webapp can fetch the index without a build step.
    webapp_dir = os.path.join(project_dir, "webapp")
    if os.path.isdir(webapp_dir):
        import shutil
        shutil.copy2(output_path, os.path.join(webapp_dir, "topics_index.json"))

    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()
diff --git a/webapp/index.html b/webapp/index.html
index f7ef9e9..03e7919 100644
--- a/webapp/index.html
+++ b/webapp/index.html
@@ -218,6 +218,50 @@
}
#audio-bar .bar-transcript-btn:hover { border-color: var(--accent); color: var(--text); }
+ /* ── Backlinks (Obsidian-style) ── */
+ .backlinks {
+ margin-top: 8px; padding: 8px 0;
+ border-top: 1px solid var(--border);
+ }
+ .backlinks-title {
+ font-size: 10px; color: var(--text-muted); text-transform: uppercase;
+ letter-spacing: 0.05em; margin-bottom: 6px;
+ }
+ .backlink {
+ font-size: 11px; color: var(--accent); cursor: pointer;
+ padding: 3px 8px; border-radius: 4px; display: block;
+ transition: background 0.15s;
+ }
+ .backlink:hover { background: var(--surface2); }
+ .backlink .bl-episode { font-weight: 600; }
+ .backlink .bl-preview { color: var(--text-muted); font-style: italic; }
+
+ .topic-tag {
+ display: inline-block; padding: 1px 7px; border-radius: 10px;
+ font-size: 9px; background: var(--surface2); color: var(--text-muted);
+ margin: 1px; border: 1px solid var(--border);
+ }
+
+ /* ── Soundbite Export ── */
+ .export-btn {
+ background: transparent; border: 1px solid var(--border);
+ color: var(--text-muted); padding: 3px 8px; border-radius: 4px;
+ font-size: 10px; cursor: pointer; transition: all 0.2s;
+ }
+ .export-btn:hover { border-color: var(--accent); color: var(--text); }
+
+ /* ── View Tabs ── */
+ .view-tabs {
+ display: flex; gap: 4px; margin-left: 12px;
+ }
+ .view-tab {
+ background: var(--surface2); border: 1px solid var(--border);
+ color: var(--text-muted); padding: 4px 10px; border-radius: 4px;
+ font-size: 11px; cursor: pointer; transition: all 0.2s;
+ }
+ .view-tab:hover { border-color: var(--accent); }
+ .view-tab.active { background: var(--accent); color: var(--bg); border-color: var(--accent); }
+
.welcome { text-align: center; padding: 40px 20px; color: var(--text-muted); }
.welcome h2 { color: var(--text); margin-bottom: 8px; }
.welcome p { font-size: 13px; line-height: 1.6; }
@@ -237,6 +281,10 @@
Podcast Mindmap
+