#!/usr/bin/env python3
"""Index transcript paragraphs with topic tags for cross-referencing.

Reads srt_index.json, assigns topic tags to each paragraph,
finds cross-references between paragraphs across episodes.
Outputs topics_index.json for the webapp.

This script processes paragraphs in batches and uses simple
keyword/phrase extraction — no external LLM API needed.
"""

import json
import os
import re
import shutil
import sys
from collections import Counter, defaultdict
from difflib import SequenceMatcher
from itertools import combinations

# A topic is kept when its score reaches this fraction of the best topic's score.
TAG_THRESHOLD_RATIO = 0.3
# Upper bound on the number of tags assigned to one paragraph.
MAX_TAGS_PER_PARAGRAPH = 5
# Two paragraphs are cross-referenced when they share at least this many tags.
MIN_SHARED_TAGS = 2
# Only the best-scoring cross-references are written to the output file.
MAX_CROSSREFS = 500
# Number of characters of paragraph text stored as a preview.
PREVIEW_LENGTH = 100
# Number of topics listed in the console summary.
TOP_TOPICS_SHOWN = 15

# Topic taxonomy — shared vocabulary for cross-podcast matching
TOPIC_TAXONOMY = {
    # Wirtschaft
    "wachstum": ["wachstum", "bip", "bruttoinlandsprodukt", "wirtschaftswachstum", "konjunktur", "rezession", "wohlstand"],
    "schulden": ["schulden", "staatsverschuldung", "schuldenbremse", "kredit", "anleihen", "haushalt", "investition"],
    "vermoegen": ["vermögen", "reichtum", "eigentum", "besitz", "erbschaft", "ungleichheit", "verteilung", "kodierung"],
    "steuern": ["steuer", "erbschaftssteuer", "vermögensteuer", "kapitalertrag", "steuergerechtigkeit", "umverteilung"],
    "markt": ["markt", "marktwirtschaft", "kapitalismus", "neoliberal", "deregulierung", "privatisierung", "wettbewerb"],
    "innovation": ["innovation", "forschung", "technologie", "digitalisierung", "transformation", "start-up", "patent"],
    "arbeit": ["arbeit", "beschäftigung", "arbeitsmarkt", "fachkräfte", "mindestlohn", "gewerkschaft", "care-arbeit"],

    # Sicherheit
    "klimakrise": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "paris", "erneuerbar", "fossil", "ökologisch"],
    "geopolitik": ["geopolitik", "nato", "russland", "china", "europa", "sicherheitspolitik", "verteidigung", "aufrüstung", "krieg"],
    "soziale_sicherheit": ["sozial", "sicherheit", "rente", "pflege", "gesundheit", "daseinsvorsorge", "sozialstaat"],
    "digitale_sicherheit": ["plattform", "monopol", "google", "meta", "tiktok", "algorithmus", "datenschutz", "überwachung"],

    # Demokratie
    "demokratie": ["demokratie", "wahl", "parlament", "regierung", "rechtsstaat", "gewaltenteilung", "verfassung", "grundgesetz"],
    "macht": ["macht", "herrschaft", "elite", "oligarchie", "lobbyismus", "einfluss", "gestaltung"],
    "polarisierung": ["polarisierung", "spaltung", "populismus", "radikalisierung", "filterblas", "desinformation", "fake"],
    "buerokratie": ["bürokratie", "verwaltung", "regulierung", "föderalismus", "kommune", "behörde"],
    "kompromiss": ["kompromiss", "verhandlung", "koalition", "konsens", "diskurs", "debatte", "streit"],
    "extremismus": ["extremismus", "faschismus", "rechtsextrem", "autoritär", "rassismus", "antisemitismus", "gewalt"],

    # Freiheit
    "freiheit": ["freiheit", "grundrecht", "menschenrecht", "selbstbestimmung", "autonomie", "emanzipation"],
    "diskriminierung": ["diskriminierung", "rassismus", "gleichstellung", "gender", "minderheit", "inklusion", "vielfalt"],
    "bildung": ["bildung", "schule", "universität", "lernen", "kompetenz", "pisa", "chancengleichheit"],
    "gesundheit": ["gesundheit", "krankheit", "prävention", "medizin", "mental health", "ernährung", "allergie"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "asyl", "fachkräfteeinwanderung"],

    # Querschnitt
    "ohnmacht": ["ohnmacht", "hilflosigkeit", "resignation", "vertrauen", "selbstwirksamkeit", "beteiligung", "engagement"],
    "narrative": ["narrativ", "erzählung", "framing", "kommunikation", "medien", "öffentlichkeit", "diskurs"],
    "generationen": ["generation", "jugend", "alter", "demografie", "zukunft", "intergenerationell", "boomer"],
    "infrastruktur": ["infrastruktur", "bahn", "straße", "brücke", "netz", "glasfaser", "öpnv", "mobilität"],
}


def tag_paragraph(text, taxonomy=TOPIC_TAXONOMY,
                  threshold_ratio=TAG_THRESHOLD_RATIO,
                  max_tags=MAX_TAGS_PER_PARAGRAPH):
    """Assign topic tags to a paragraph based on keyword matching.

    Matching is a case-insensitive *substring* count, so a keyword also
    matches inside longer words (e.g. "steuer" inside "erbschaftssteuer").
    Each hit contributes ``1 + len(keyword) / 10`` to the topic score, which
    weights longer keywords higher.

    Args:
        text: Paragraph text. ``None`` or an empty string yields no tags.
        taxonomy: Mapping of topic name to a list of lowercase keywords.
            It is only read, never modified.
        threshold_ratio: A topic is kept when its score is at least this
            fraction of the highest topic score.
        max_tags: Maximum number of tags returned.

    Returns:
        Topic names ordered by descending score (ties keep taxonomy order),
        at most ``max_tags`` entries.
    """
    if not text:
        return []

    text_lower = text.lower()
    scores = {}
    for topic, keywords in taxonomy.items():
        score = 0
        for kw in keywords:
            hits = text_lower.count(kw)
            if hits:
                score += hits * (1 + len(kw) / 10)
        if score > 0:
            scores[topic] = score

    if not scores:
        return []

    threshold = max(scores.values()) * threshold_ratio
    # sorted() is stable, so equal scores stay in taxonomy order.
    ranked = sorted(
        (item for item in scores.items() if item[1] >= threshold),
        key=lambda item: -item[1],
    )
    return [topic for topic, _ in ranked[:max_tags]]


def find_crossrefs(tagged_paragraphs, min_shared_tags=MIN_SHARED_TAGS):
    """Find cross-references between paragraphs in different episodes.

    Args:
        tagged_paragraphs: Mapping of episode key to a list of paragraph
            dicts; each paragraph dict must have a ``"tags"`` list.
        min_shared_tags: Minimum number of common tags for a pair of
            paragraphs to be reported.

    Returns:
        A list of dicts with ``source``, ``target``, ``shared_topics`` and
        ``score`` keys, sorted by descending score. The score is the number
        of shared tags divided by the larger tag count of the two paragraphs.
    """
    # Build index: topic -> list of (episode_key, para_idx). The tag set of
    # every paragraph is built once here instead of once per candidate pair.
    topic_index = defaultdict(list)
    tag_sets = {}
    for ep_key, paras in tagged_paragraphs.items():
        for idx, para in enumerate(paras):
            tag_sets[(ep_key, idx)] = set(para["tags"])
            for tag in para["tags"]:
                topic_index[tag].append((ep_key, idx))

    crossrefs = []
    seen = set()  # pairs already evaluated under an earlier topic
    for locations in topic_index.values():
        for loc1, loc2 in combinations(locations, 2):
            if loc1[0] == loc2[0]:
                continue  # Skip same-episode refs
            pair = tuple(sorted((loc1, loc2)))
            if pair in seen:
                continue
            seen.add(pair)

            tags1 = tag_sets[loc1]
            tags2 = tag_sets[loc2]
            shared = tags1 & tags2
            if len(shared) >= min_shared_tags:
                crossrefs.append({
                    "source": {"episode": loc1[0], "paragraph": loc1[1]},
                    "target": {"episode": loc2[0], "paragraph": loc2[1]},
                    "shared_topics": sorted(shared),
                    "score": len(shared) / max(len(tags1), len(tags2), 1),
                })

    crossrefs.sort(key=lambda ref: -ref["score"])
    return crossrefs


def main():
    """Tag all paragraphs of ``data/srt_index.json`` and write the topic index.

    The project directory is taken from the first command-line argument and
    defaults to the current directory. The result is written to
    ``data/topics_index.json`` and, when a ``webapp`` directory exists,
    copied there as well. Exits with status 1 when the input file is missing.
    """
    project_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    data_dir = os.path.join(project_dir, "data")

    # Load transcript index
    srt_path = os.path.join(data_dir, "srt_index.json")
    if not os.path.exists(srt_path):
        # Diagnostics go to stderr so stdout carries only the regular report.
        print(f"ERROR: {srt_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(srt_path, "r", encoding="utf-8") as f:
        srt_data = json.load(f)

    # Tag all paragraphs
    tagged = {}
    tag_counts = Counter()
    for ep_key, ep_data in srt_data.items():
        entries = []
        # An episode without a "paragraphs" key is treated as empty.
        for para in ep_data.get("paragraphs", []):
            tags = tag_paragraph(para["text"])
            entries.append({
                "start": para["start"],
                "end": para["end"],
                "tags": tags,
                "text_preview": para["text"][:PREVIEW_LENGTH],
            })
            tag_counts.update(tags)
        tagged[ep_key] = entries

    print(f"Tagged {sum(len(v) for v in tagged.values())} paragraphs")
    print(f"Total tags assigned: {sum(tag_counts.values())}")

    # Show tag distribution
    print("\nTop topics:")
    for tag, count in tag_counts.most_common(TOP_TOPICS_SHOWN):
        print(f"  {tag}: {count} paragraphs")

    # Find cross-references
    crossrefs = find_crossrefs(tagged, MIN_SHARED_TAGS)
    print(f"\nCross-references: {len(crossrefs)} (min {MIN_SHARED_TAGS} shared tags)")

    # Output
    output = {
        "taxonomy": {k: {"keywords": v} for k, v in TOPIC_TAXONOMY.items()},
        "tagged_paragraphs": tagged,
        "crossrefs": crossrefs[:MAX_CROSSREFS],
    }

    output_path = os.path.join(data_dir, "topics_index.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    # Copy to webapp
    webapp_dir = os.path.join(project_dir, "webapp")
    if os.path.isdir(webapp_dir):
        shutil.copy2(output_path, os.path.join(webapp_dir, "topics_index.json"))

    print(f"\nOutput: {output_path}")


if __name__ == "__main__":
    main()
border-color: var(--accent); color: var(--text); } + /* ── Backlinks (Obsidian-style) ── */ + .backlinks { + margin-top: 8px; padding: 8px 0; + border-top: 1px solid var(--border); + } + .backlinks-title { + font-size: 10px; color: var(--text-muted); text-transform: uppercase; + letter-spacing: 0.05em; margin-bottom: 6px; + } + .backlink { + font-size: 11px; color: var(--accent); cursor: pointer; + padding: 3px 8px; border-radius: 4px; display: block; + transition: background 0.15s; + } + .backlink:hover { background: var(--surface2); } + .backlink .bl-episode { font-weight: 600; } + .backlink .bl-preview { color: var(--text-muted); font-style: italic; } + + .topic-tag { + display: inline-block; padding: 1px 7px; border-radius: 10px; + font-size: 9px; background: var(--surface2); color: var(--text-muted); + margin: 1px; border: 1px solid var(--border); + } + + /* ── Soundbite Export ── */ + .export-btn { + background: transparent; border: 1px solid var(--border); + color: var(--text-muted); padding: 3px 8px; border-radius: 4px; + font-size: 10px; cursor: pointer; transition: all 0.2s; + } + .export-btn:hover { border-color: var(--accent); color: var(--text); } + + /* ── View Tabs ── */ + .view-tabs { + display: flex; gap: 4px; margin-left: 12px; + } + .view-tab { + background: var(--surface2); border: 1px solid var(--border); + color: var(--text-muted); padding: 4px 10px; border-radius: 4px; + font-size: 11px; cursor: pointer; transition: all 0.2s; + } + .view-tab:hover { border-color: var(--accent); } + .view-tab.active { background: var(--accent); color: var(--bg); border-color: var(--accent); } + .welcome { text-align: center; padding: 40px 20px; color: var(--text-muted); } .welcome h2 { color: var(--text); margin-bottom: 8px; } .welcome p { font-size: 13px; line-height: 1.6; } @@ -237,6 +281,10 @@

Podcast Mindmap

+
+ + +
@@ -879,7 +927,8 @@ function buildQuoteCard(q, color) {
"${escHtml(q.verbatim || q.text)}"
${q.speaker} · ${q.episode}${timeStr ? ' · ' + timeStr : ''}${topBadge} - ${hasAudio ? `` : ''} + ${hasAudio ? ` + ` : ''}
`; } @@ -900,6 +949,296 @@ function fmtTime(sec) { function escHtml(s) { return s.replace(/&/g, '&').replace(//g, '>').replace(/"/g, '"'); } + +// ============================================================ +// #2: Obsidian-style Backlinks +// ============================================================ +let TOPICS = null; + +async function loadTopics() { + if (TOPICS) return; + try { + const r = await fetch('topics_index.json'); + TOPICS = await r.json(); + } catch (e) { + try { + const r = await fetch('data/topics_index.json'); + TOPICS = await r.json(); + } catch (e2) { TOPICS = { tagged_paragraphs: {}, crossrefs: [] }; } + } +} + +function buildBacklinks(episodeKey, paraIdx) { + if (!TOPICS || !TOPICS.crossrefs) return ''; + const refs = TOPICS.crossrefs.filter(r => + (r.source.episode === episodeKey && r.source.paragraph === paraIdx) || + (r.target.episode === episodeKey && r.target.paragraph === paraIdx) + ).slice(0, 5); + + if (refs.length === 0) return ''; + + let html = ''; + return html; +} + +function buildTopicTags(episodeKey, paraIdx) { + if (!TOPICS || !TOPICS.tagged_paragraphs[episodeKey]) return ''; + const para = TOPICS.tagged_paragraphs[episodeKey][paraIdx]; + if (!para || !para.tags || para.tags.length === 0) return ''; + return '
' + + para.tags.map(t => `${t.replace('_', ' ')}`).join('') + + '
// Patch TranscriptView.show to include backlinks.
// The original method is kept and called first; once the transcript has been
// rendered, every paragraph element gets its topic tags and backlinks appended.
const _origTranscriptShow = TranscriptView.show.bind(TranscriptView);
TranscriptView.show = async function(episodeId, seekTime) {
  await loadTopics();
  await _origTranscriptShow(episodeId, seekTime);

  // Keys of the topic index are matched against the episode id by prefix.
  const tagged = (TOPICS && TOPICS.tagged_paragraphs) || {};
  const epKey = Object.keys(tagged).find(k => k.startsWith(episodeId));
  if (!epKey) return;

  document.querySelectorAll('.transcript-para').forEach(el => {
    const idx = parseInt(el.dataset.idx, 10);
    if (Number.isNaN(idx)) return; // element carries no usable data-idx
    const extras = buildTopicTags(epKey, idx) + buildBacklinks(epKey, idx);
    if (extras) el.insertAdjacentHTML('beforeend', extras);
  });
};

// ============================================================
// #6: Soundbite Export
// ============================================================

/**
 * Cut the audio range of a quote out of its episode file and offer it as a
 * WAV download. The export button of the quote card shows progress and
 * failure ('Lädt…' / 'Clip ↓' / 'Fehler').
 *
 * @param {*} quoteId - id of an entry in DATA.quotes
 */
async function exportSoundbite(quoteId) {
  const q = DATA.quotes.find(q => q.id === quoteId);
  // `== null` also rejects a missing (undefined) start time.
  if (!q || !q.audioFile || q.startTime == null) return;

  const btn = document.querySelector(`#card-${quoteId} .export-btn`);
  if (btn) btn.textContent = 'Lädt…';

  let audioCtx = null;
  try {
    const response = await fetch(`audio/${q.audioFile}`);
    // fetch() resolves on HTTP errors too; do not try to decode an error page.
    if (!response.ok) throw new Error(`HTTP ${response.status} for audio/${q.audioFile}`);
    const arrayBuffer = await response.arrayBuffer();

    audioCtx = new (window.AudioContext || window.webkitAudioContext)();
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

    const start = q.startTime;
    const end = q.endTime || (q.startTime + 30); // default clip length: 30 s
    const sampleRate = audioBuffer.sampleRate;
    const startSample = Math.max(0, Math.floor(start * sampleRate));
    const numSamples = Math.floor((end - start) * sampleRate);
    if (!(numSamples > 0)) throw new Error(`Invalid clip range ${start}–${end}`);

    // Create new buffer with just the clip. A fresh AudioBuffer is silent, so
    // a range that runs past the end of the source stays zero-padded.
    const channels = audioBuffer.numberOfChannels;
    const clipBuffer = audioCtx.createBuffer(channels, numSamples, sampleRate);
    for (let ch = 0; ch < channels; ch++) {
      const src = audioBuffer.getChannelData(ch);
      clipBuffer.getChannelData(ch).set(src.subarray(startSample, startSample + numSamples));
    }

    // Encode as WAV
    const blob = new Blob([encodeWAV(clipBuffer)], { type: 'audio/wav' });
    const url = URL.createObjectURL(blob);

    const a = document.createElement('a');
    a.href = url;
    a.download = `${q.episode}-${q.speaker.replace(/\s+/g, '_')}-${Math.floor(start)}s.wav`;
    a.click();
    // NOTE(review): revoked on a later task rather than synchronously, on the
    // assumption that the browser may start the download asynchronously.
    setTimeout(() => URL.revokeObjectURL(url), 1000);

    if (btn) btn.textContent = 'Clip ↓';
  } catch (e) {
    console.error('Export failed:', e);
    if (btn) btn.textContent = 'Fehler';
  } finally {
    // Release the audio context on success and on every failure path.
    if (audioCtx) audioCtx.close();
  }
}

/**
 * Encode an AudioBuffer-like object as a 16-bit PCM WAV file.
 *
 * @param buffer - object exposing numberOfChannels, sampleRate, length and
 *                 getChannelData(ch) returning float samples
 * @returns {ArrayBuffer} 44-byte RIFF/WAVE header followed by interleaved samples
 */
function encodeWAV(buffer) {
  const numChannels = buffer.numberOfChannels;
  const sampleRate = buffer.sampleRate;
  const format = 1; // PCM
  const bitDepth = 16;
  const blockAlign = numChannels * bitDepth / 8;
  const byteRate = sampleRate * blockAlign;
  const dataSize = buffer.length * blockAlign;
  const headerSize = 44;

  const wav = new ArrayBuffer(headerSize + dataSize);
  const view = new DataView(wav);

  // RIFF header
  writeString(view, 0, 'RIFF');
  view.setUint32(4, 36 + dataSize, true);
  writeString(view, 8, 'WAVE');
  writeString(view, 12, 'fmt ');
  view.setUint32(16, 16, true);
  view.setUint16(20, format, true);
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitDepth, true);
  writeString(view, 36, 'data');
  view.setUint32(40, dataSize, true);

  // Fetch each channel once instead of once per sample.
  const channelData = [];
  for (let ch = 0; ch < numChannels; ch++) {
    channelData.push(buffer.getChannelData(ch));
  }

  // Interleave samples, clamped to [-1, 1] and scaled to signed 16 bit.
  let offset = headerSize;
  for (let i = 0; i < buffer.length; i++) {
    for (let ch = 0; ch < numChannels; ch++) {
      const sample = Math.max(-1, Math.min(1, channelData[ch][i]));
      view.setInt16(offset, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true);
      offset += 2;
    }
  }
  return wav;
}

/** Write the char codes of `str` as single bytes into `view`, starting at `offset`. */
function writeString(view, offset, str) {
  for (let i = 0; i < str.length; i++) {
    view.setUint8(offset + i, str.charCodeAt(i));
  }
}

// ============================================================
// #7: Timeline View
// ============================================================
let timelineBuilt = false;

/**
 * Switch between the 'mindmap' and 'timeline' views and mark the matching
 * tab (element id `tab-<view>`) as active.
 */
function switchView(view) {
  document.querySelectorAll('.view-tab').forEach(t => t.classList.remove('active'));
  const tab = document.getElementById(`tab-${view}`);
  if (tab) tab.classList.add('active'); // an unknown view no longer throws here

  const mindmap = document.getElementById('mindmap');
  if (view === 'mindmap') {
    mindmap.style.display = '';
    if (timelineBuilt) {
      const tlEl = document.getElementById('timeline-container');
      if (tlEl) tlEl.style.display = 'none';
    }
  } else if (view === 'timeline') {
    mindmap.style.display = 'none';
    buildTimeline();
  }
}
'; + + DATA.staffeln.forEach(staffel => { + const eps = DATA.episodes.filter(e => e.staffel === staffel.id); + html += `
`; + html += `

Staffel ${staffel.id}: ${staffel.name}

`; + + eps.forEach(ep => { + const quotes = DATA.quotes.filter(q => q.episode === ep.id); + const topQuotes = quotes.filter(q => q.isTopQuote); + + html += `
`; + + // Episode marker + html += `
`; + html += `
${ep.id}
`; + html += `
${ep.guest}
`; + html += `
`; + + // Timeline bar + html += `
`; + html += `
${ep.title}
`; + + // Quote dots on timeline + html += `
`; + + // Find max time for this episode + const maxTime = Math.max(...quotes.filter(q => q.endTime).map(q => q.endTime), 3600); + + quotes.forEach(q => { + if (q.startTime === null) return; + const left = (q.startTime / maxTime) * 100; + const isTop = q.isTopQuote; + html += `
`; + }); + html += `
`; // timeline bar + + // Top quotes below bar + if (topQuotes.length > 0) { + topQuotes.forEach(q => { + html += `
`; + html += `"${escHtml((q.verbatim || q.text).substring(0, 100))}…" ${q.speaker}`; + html += `
`; + }); + } + + html += `
`; // content + html += `
`; // row + }); + + html += `
`; // staffel + }); + + html += '
/**
 * Open the episode a quote belongs to and bring the quote's card into view.
 * The card (element id `card-<quoteId>`) is scrolled to the centre and
 * outlined for two seconds; the lookup is delayed by 100 ms after
 * showEpisode() has been called.
 */
function showQuoteInPanel(quoteId) {
  const quote = DATA.quotes.find(item => item.id === quoteId);
  if (!quote) return;

  const episode = DATA.episodes.find(item => item.id === quote.episode);
  if (episode) showEpisode(episode);

  const highlightCard = () => {
    const card = document.getElementById(`card-${quoteId}`);
    if (!card) return;
    card.scrollIntoView({ behavior: 'smooth', block: 'center' });
    card.style.outline = '2px solid var(--accent)';
    setTimeout(() => { card.style.outline = ''; }, 2000);
  };
  setTimeout(highlightCard, 100);
}