@app.get("/api/analyses/cross-themes")
def get_cross_themes():
    """Return the cross-podcast theme clusters (#8/#10).

    Serves the contents of ``theme_clusters.json`` located in ``DATA_DIR``:
    themes from different podcasts that belong together semantically.

    Returns:
        ``{"available": False}`` when the file is missing, unreadable, not
        valid JSON, or its top-level value is not a JSON object. Otherwise the
        file's top-level keys plus ``"available": True``.
    """
    path = Path(DATA_DIR) / "theme_clusters.json"
    try:
        # Read explicitly as UTF-8 instead of relying on the locale default,
        # so non-ASCII labels decode the same way on every platform.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: invalid JSON or bytes
        # that are not valid UTF-8 (JSONDecodeError / UnicodeDecodeError).
        return {"available": False}
    if not isinstance(data, dict):
        # Unpacking a list or scalar with ** would raise TypeError (HTTP 500).
        return {"available": False}
    # The flag goes last so a stray "available" key in the file cannot
    # override it.
    return {**data, "available": True}
"""#8 Standardized theme tagging: cluster themes across podcasts.

Computes embeddings for all themes (label + description) and assigns similar
themes from different podcasts to a shared cluster.

Output: data/theme_clusters.json
{
  "threshold": 0.65,
  "clusters": [
    {
      "id": "klima",
      "label": "Klima",
      "n_members": 2,
      "is_cross": true,
      "members": [
        {"podcast_id": "ldn", "theme_id": "klima-verkehr", "label": "...", "color": "..."},
        {"podcast_id": "neu-denken", "theme_id": "klimakrise", "label": "...", "color": "..."}
      ]
    }
  ]
}
"""

import json
import os
import sqlite3
import sys
from contextlib import closing
from pathlib import Path

import numpy as np

DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
OUT_PATH = sys.argv[2] if len(sys.argv) > 2 else "data/theme_clusters.json"
THRESHOLD = float(os.environ.get("THEME_CLUSTER_THRESHOLD", "0.65"))

API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
EMBED_MODEL = "text-embedding-v3"
# Texts sent per embedding request. The original note in this script says the
# DashScope API accepts at most 10 texts per call; 8 stays below that.
EMBED_BATCH_SIZE = 8
# Maximum number of characters of "label — description" sent for embedding.
MAX_TEXT_CHARS = 500


def embed(client, texts):
    """Return one embedding (list of floats) per entry of *texts*.

    *client* is an OpenAI-compatible client; the vectors are returned in the
    order the API response lists them.
    """
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts, dimensions=1024)
    return [item.embedding for item in resp.data]


def normalize(v):
    """Scale *v* to unit L2 norm; a zero vector is returned unchanged."""
    n = np.linalg.norm(v)
    return v / n if n else v


def theme_text(label, description, limit=MAX_TEXT_CHARS):
    """Build the text that gets embedded for one theme.

    Joins label and (optional) description with an em dash and truncates to
    *limit* characters. A missing label is treated as an empty string instead
    of raising ``TypeError``.
    """
    snippet = label or ""
    if description:
        snippet += " — " + description
    return snippet[:limit]


def cluster_indices(vectors, podcast_ids, threshold):
    """Group row indices by single-linkage clustering over cosine similarity.

    Args:
        vectors: 2-D array with one unit-length embedding per row.
        podcast_ids: podcast id for each row, same length as *vectors*.
        threshold: minimum similarity for two themes to be linked.

    Returns:
        A list of index lists. Groups are ordered by their smallest index and
        each group lists its indices in ascending order.

    Only pairs from *different* podcasts are linked directly. Two themes of
    the same podcast can still share a cluster transitively, through a common
    neighbour in another podcast.
    """
    n = len(podcast_ids)
    if n == 0:
        return []
    parent = list(range(n))

    def find(x):
        # Union-find lookup with path halving.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[rb] = ra

    # Rows are unit vectors, so the dot product is the cosine similarity.
    sim = vectors @ vectors.T
    # Strict upper triangle: every unordered pair (i < j) exactly once.
    for i, j in np.argwhere(np.triu(sim >= threshold, k=1)):
        i, j = int(i), int(j)
        if podcast_ids[i] != podcast_ids[j]:
            union(i, j)

    groups = {}
    for i in range(n):
        groups.setdefault(find(i), []).append(i)
    return list(groups.values())


def build_clusters(rows, groups):
    """Turn index groups into the cluster dicts written to the output file.

    *rows* must support ``row["podcast_id"]``, ``row["id"]``, ``row["label"]``
    and ``row["color"]``. The result is sorted with cross-podcast clusters
    first, then by descending member count; ties keep the order of *groups*.
    """
    clusters = []
    for idxs in groups:
        members = [
            {
                "podcast_id": rows[i]["podcast_id"],
                "theme_id": rows[i]["id"],
                "label": rows[i]["label"],
                "color": rows[i]["color"],
            }
            for i in idxs
        ]
        clusters.append({
            # Cluster id: theme id of the first member.
            "id": members[0]["theme_id"],
            # Cluster label: shortest member label (missing labels count as "").
            "label": min((m["label"] or "" for m in members), key=len),
            "n_members": len(members),
            "is_cross": len({m["podcast_id"] for m in members}) > 1,
            "members": members,
        })
    clusters.sort(key=lambda c: (not c["is_cross"], -c["n_members"]))
    return clusters


def main():
    """Embed all themes from the database, cluster them, and write OUT_PATH."""
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.", file=sys.stderr)
        sys.exit(1)

    # Imported here so the pure helpers above stay importable without the SDK.
    from openai import OpenAI

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    # closing() releases the connection on every path, including the early
    # return below and any exception raised by the query.
    with closing(sqlite3.connect(DB_PATH)) as db:
        db.row_factory = sqlite3.Row
        rows = db.execute("""
            SELECT podcast_id, id, label, description, color
            FROM themes ORDER BY podcast_id, id
        """).fetchall()

    print(f"Themes gefunden: {len(rows)}")
    if not rows:
        print("Nichts zu tun.")
        return

    texts = [theme_text(r["label"], r["description"]) for r in rows]

    embs = []
    for start in range(0, len(texts), EMBED_BATCH_SIZE):
        embs.extend(embed(client, texts[start:start + EMBED_BATCH_SIZE]))
    if len(embs) != len(rows):
        # Without this check a short response would silently misalign
        # embeddings and themes.
        raise RuntimeError(
            f"Expected {len(rows)} embeddings, got {len(embs)}"
        )
    vectors = np.array([normalize(np.array(e, dtype=np.float32)) for e in embs])

    groups = cluster_indices(vectors, [r["podcast_id"] for r in rows], THRESHOLD)
    clusters = build_clusters(rows, groups)

    out = {"threshold": THRESHOLD, "clusters": clusters}
    Path(OUT_PATH).parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"\n{len(clusters)} Cluster geschrieben nach {OUT_PATH}")
    cross = [c for c in clusters if c["is_cross"]]
    print(f"  davon cross-podcast: {len(cross)}")
    for c in cross:
        members = ", ".join(f"{m['podcast_id']}/{m['theme_id']}" for m in c["members"])
        print(f"  [{c['label']}] {members}")


if __name__ == "__main__":
    main()
mode) { if (!CURRENT_PODCAST) return; - TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); + TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); CrossMindmapView.hide(); this.episodeId = episodeId; this.mode = mode; this.visible = true; @@ -919,7 +919,7 @@ const GapsView = { minSize: 0, async show() { - TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); + TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); CrossMindmapView.hide(); this.visible = true; const panel = document.getElementById('panel'); panel.innerHTML = `
Lädt …
`; @@ -1041,7 +1041,7 @@ const ShiftsView = { expanded: {}, async show() { - TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); + TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); CrossMindmapView.hide(); this.visible = true; const panel = document.getElementById('panel'); panel.innerHTML = `Lädt …
`; @@ -1425,7 +1425,7 @@ const DensityView = { sort: 'density', // 'density' | 'order' async show() { - TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); + TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); CrossMindmapView.hide(); this.visible = true; const panel = document.getElementById('panel'); panel.innerHTML = `Lädt …
`; @@ -1526,6 +1526,187 @@ const DensityView = { hide() { this.visible = false; } }; +// ── Cross Mindmap (#8/#10 Combined) ── +const CrossMindmapView = { + visible: false, + data: null, + podcastsData: null, + + async show() { + TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); CrossMindmapView.hide(); + this.visible = true; + + // Mindmap-Container fuer SVG aufbereiten + const mindmap = document.getElementById('mindmap'); + mindmap.style.overflow = 'hidden'; + mindmap.style.display = ''; + mindmap.style.padding = ''; + mindmap.innerHTML = ''; + + document.querySelectorAll('.view-tab').forEach(t => t.classList.remove('active')); + document.getElementById('tab-mindmap')?.classList.add('active'); + document.getElementById('staffel-filters').innerHTML = ''; + document.getElementById('app-title').innerHTML = `← Cross-Mindmap`; + document.title = 'Cross-Mindmap — Podcast Mindmap'; + + const panel = document.getElementById('panel'); + panel.innerHTML = `Themen aus verschiedenen Podcasts in einer Visualisierung. Cross-Cluster verbinden Themen, die semantisch zusammengehören.
Lädt …
Fehler: ${escHtml(e.message)}
`; + return; + } + + requestAnimationFrame(() => requestAnimationFrame(() => this.render())); + }, + + render() { + const svg = d3.select('#svg'); + svg.selectAll('*').remove(); + const container = document.getElementById('mindmap'); + let W = container.clientWidth, H = container.clientHeight; + if (!W || W < 100) W = window.innerWidth - 400; + if (!H || H < 100) H = window.innerHeight - 52; + if (W < 200) W = 800; + if (H < 200) H = 600; + const isMobile = W < 600; + const sc = isMobile ? 0.6 : 1; + svg.attr('viewBox', `0 0 ${W} ${H}`).attr('preserveAspectRatio', 'xMidYMid meet'); + + const nodes = [], links = []; + const podcastIds = Object.keys(this.podcastsData); + const colors = { 0: '#60a5fa', 1: '#dc7850' }; // blau / orange als Hub-Farben + + // Hubs links und rechts + podcastIds.forEach((pid, i) => { + const data = this.podcastsData[pid]; + const cx = i === 0 ? W * 0.28 : W * 0.72; + const cy = H * 0.5; + nodes.push({ + id: `hub-${pid}`, type: 'hub', pid, + label: data.name, + r: 36 * sc, fx: cx, fy: cy, color: colors[i] || '#888', + }); + }); + + // Themen je Podcast + podcastIds.forEach(pid => { + const data = this.podcastsData[pid]; + (data.themes || []).forEach(t => { + nodes.push({ + id: `t-${pid}-${t.id}`, type: 'theme', pid, + themeId: t.id, label: t.label.length > 22 ? 
t.label.slice(0, 20) + '…' : t.label, + fullLabel: t.label, r: 22 * sc, color: t.color, + }); + links.push({ source: `hub-${pid}`, target: `t-${pid}-${t.id}`, type: 'hub-theme' }); + }); + }); + + // Cross-Links zwischen Themen verschiedener Podcasts + const crossClusters = (this.data?.clusters || []).filter(c => c.is_cross); + crossClusters.forEach(c => { + const memberIds = c.members.map(m => `t-${m.podcast_id}-${m.theme_id}`); + // Vollverbinde alle Mitglieder eines Cross-Clusters + for (let i = 0; i < memberIds.length; i++) { + for (let j = i + 1; j < memberIds.length; j++) { + links.push({ source: memberIds[i], target: memberIds[j], type: 'cross', cluster: c.label }); + } + } + }); + + const sim = d3.forceSimulation(nodes) + .force('link', d3.forceLink(links).id(d => d.id).distance(d => d.type === 'hub-theme' ? 130 * sc : 220 * sc).strength(d => d.type === 'cross' ? 0.06 : 0.6)) + .force('charge', d3.forceManyBody().strength(d => d.type === 'hub' ? -1200 * sc : -250 * sc)) + .force('collision', d3.forceCollide().radius(d => d.r + 6)) + .alphaDecay(0.02); + + const zoom = d3.zoom().scaleExtent([0.3, 3]).on('zoom', e => g.attr('transform', e.transform)); + svg.call(zoom); + const g = svg.append('g'); + + const linkEls = g.append('g').selectAll('line').data(links).join('line') + .attr('stroke', d => d.type === 'cross' ? '#fcd34d' : '#374151') + .attr('stroke-width', d => d.type === 'cross' ? 2.4 : 1) + .attr('stroke-dasharray', d => d.type === 'cross' ? '6,3' : null) + .attr('opacity', d => d.type === 'cross' ? 
0.85 : 0.5); + + // Cross-Cluster-Label auf den Verbindungen + const crossLabels = g.append('g').selectAll('text').data(links.filter(l => l.type === 'cross')).join('text') + .attr('font-size', '10px').attr('fill', '#fcd34d').attr('text-anchor', 'middle') + .text(d => d.cluster); + + const nodeG = g.append('g'); + const themeNodes = nodeG.selectAll('.cross-theme').data(nodes.filter(n => n.type === 'theme')).join('g') + .attr('class', 'cross-theme').style('cursor', 'pointer') + .on('click', (e, d) => CrossMindmapView.openTheme(d.pid, d.themeId)); + themeNodes.append('circle').attr('r', d => d.r).attr('fill', d => d.color + '55').attr('stroke', d => d.color).attr('stroke-width', 1.6); + themeNodes.append('text').attr('dy', d => -d.r - 6).attr('text-anchor', 'middle').attr('font-size', '11px').attr('fill', 'var(--text)').text(d => d.label); + themeNodes.append('title').text(d => d.fullLabel); + + const hubNodes = nodeG.selectAll('.cross-hub').data(nodes.filter(n => n.type === 'hub')).join('g') + .attr('class', 'cross-hub').style('cursor', 'pointer') + .on('click', (e, d) => selectPodcast(d.pid)); + hubNodes.append('circle').attr('r', d => d.r).attr('fill', d => d.color + '22').attr('stroke', d => d.color).attr('stroke-width', 2.5); + hubNodes.append('text').attr('text-anchor', 'middle').attr('font-size', '12px').attr('font-weight', '700').attr('fill', d => d.color) + .selectAll('tspan').data(d => d.label.split(/\s+/)).join('tspan').attr('x', 0).attr('dy', (d, i) => i === 0 ? 
'-0.3em' : '1.2em').text(d => d); + + sim.on('tick', () => { + linkEls.attr('x1', d => d.source.x).attr('y1', d => d.source.y).attr('x2', d => d.target.x).attr('y2', d => d.target.y); + crossLabels.attr('x', d => (d.source.x + d.target.x) / 2).attr('y', d => (d.source.y + d.target.y) / 2 - 4); + themeNodes.attr('transform', d => `translate(${d.x},${d.y})`); + hubNodes.attr('transform', d => `translate(${d.x},${d.y})`); + }); + + // Panel: Cluster-Liste + const panel = document.getElementById('panel'); + let html = `${this.data.clusters.length} Themen-Cluster · ${cross.length} davon cross-podcast (Cosinus-Schwelle ${(this.data.threshold || 0).toFixed(2)})
`; + if (cross.length) { + html += `Keine Treffer für "${escHtml(query)}"
`; @@ -1650,7 +1831,7 @@ const Search = { }, showSemanticResults(results, query) { - TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); + TranscriptView.hide(); AnalysisView.hide(); GapsView.hide(); ShiftsView.hide(); DebatesView.hide(); ArgumentsView.hide(); DensityView.hide(); CrossMindmapView.hide(); const panel = document.getElementById('panel'); let html = `