diff --git a/Dockerfile b/Dockerfile
index 8c67c2b..f8edbe4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,8 @@ COPY backend/ .
 
 # Copy webapp as static files
 COPY webapp/index.html webapp/d3.v7.min.js /static/
+COPY webapp/manifest.json webapp/sw.js /static/
+COPY webapp/icon-192.png webapp/icon-512.png /static/
 
 EXPOSE 8000
 
diff --git a/backend/app.py b/backend/app.py
index 3150dbf..c74f16b 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -88,6 +88,32 @@ def get_transcript(podcast_id: str, episode_id: str):
     return {"paragraphs": [{"start": p["start_time"], "end": p["end_time"], "text": p["text"]} for p in paras]}
 
 
+@app.get("/api/podcasts/{podcast_id}/transcript/{episode_id}/words")
+def get_words(podcast_id: str, episode_id: str):
+    """Get word-level timestamps for an episode."""
+    db = get_db()
+    # The words table may not exist yet; fall back gracefully
+    try:
+        words = db.execute(
+            "SELECT segment_idx, word_idx, word, start_time, end_time FROM words "
+            "WHERE podcast_id = ? AND episode_id = ? ORDER BY segment_idx, word_idx",
+            (podcast_id, episode_id)
+        ).fetchall()
+    except Exception:
+        db.close()
+        return {"words": [], "available": False}
+    db.close()
+
+    if not words:
+        return {"words": [], "available": False}
+
+    return {
+        "available": True,
+        "words": [{"seg": w["segment_idx"], "idx": w["word_idx"],
+                   "word": w["word"], "start": w["start_time"], "end": w["end_time"]} for w in words]
+    }
+
+
 @app.get("/api/search")
 def search(q: str = Query(..., min_length=2), podcast_id: Optional[str] = None, limit: int = 50):
     """Full-text search across all transcripts."""
@@ -213,6 +239,79 @@ def get_precomputed_similar(podcast_id: str, episode_id: str, para_idx: int, lim
     } for r in rows]
 
 
+@app.get("/api/compare")
+def compare_podcasts(a: str = Query(...), b: str = Query(...)):
+    """Compare two podcasts: shared topics, stats, cross-links."""
+    db = get_db()
+
+    # Basic stats
+    stats = {}
+    for pid in (a, b):
+        podcast = db.execute("SELECT * FROM podcasts WHERE id = ?", (pid,)).fetchone()
+        if not podcast:
+            db.close()
+            raise HTTPException(404, f"Podcast '{pid}' not found")
+        ep_count = db.execute("SELECT COUNT(*) as c FROM episodes WHERE podcast_id = ?", (pid,)).fetchone()["c"]
+        q_count = db.execute("SELECT COUNT(*) as c FROM quotes WHERE podcast_id = ?", (pid,)).fetchone()["c"]
+        p_count = db.execute("SELECT COUNT(*) as c FROM paragraphs WHERE podcast_id = ?", (pid,)).fetchone()["c"]
+        stats[pid] = {"name": podcast["name"], "episodes": ep_count, "quotes": q_count, "paragraphs": p_count}
+
+    # Shared topics via topic tags
+    topics_a = db.execute(
+        "SELECT DISTINCT t.tag FROM topics t JOIN paragraphs p ON t.paragraph_id = p.id WHERE p.podcast_id = ?", (a,)
+    ).fetchall()
+    topics_b = db.execute(
+        "SELECT DISTINCT t.tag FROM topics t JOIN paragraphs p ON t.paragraph_id = p.id WHERE p.podcast_id = ?", (b,)
+    ).fetchall()
+
+    set_a = {r["tag"] for r in topics_a}
+    set_b = {r["tag"] for r in topics_b}
+    shared = sorted(set_a & set_b)
+    only_a = sorted(set_a - set_b)
+    only_b = sorted(set_b - set_a)
+
+    # Cross-podcast semantic links count
+    cross_links = 0
+    top_links = []
+    try:
+        cross_links = db.execute(
+            "SELECT COUNT(*) as c FROM semantic_links WHERE "
+            "(podcast_id = ? AND target_podcast = ?) OR (podcast_id = ? AND target_podcast = ?)",
+            (a, b, b, a)
+        ).fetchone()["c"]
+
+        top_links = db.execute(
+            "SELECT sl.*, p1.text as source_text, p2.text as target_text, "
+            "e1.title as source_title, e2.title as target_title "
+            "FROM semantic_links sl "
+            "JOIN paragraphs p1 ON sl.podcast_id = p1.podcast_id AND sl.source_episode = p1.episode_id AND sl.source_idx = p1.idx "
+            "JOIN paragraphs p2 ON sl.target_podcast = p2.podcast_id AND sl.target_episode = p2.episode_id AND sl.target_idx = p2.idx "
+            "JOIN episodes e1 ON sl.podcast_id = e1.podcast_id AND sl.source_episode = e1.id "
+            "JOIN episodes e2 ON sl.target_podcast = e2.podcast_id AND sl.target_episode = e2.id "
+            "WHERE (sl.podcast_id = ? AND sl.target_podcast = ?) OR (sl.podcast_id = ? AND sl.target_podcast = ?) "
+            "ORDER BY sl.score DESC LIMIT 20",
+            (a, b, b, a)
+        ).fetchall()
+    except Exception:
+        pass  # semantic_links table may not exist yet
+
+    db.close()
+
+    return {
+        "stats": stats,
+        "shared_topics": shared,
+        "only_in": {a: only_a, b: only_b},
+        "cross_links_count": cross_links,
+        "top_cross_links": [{
+            "source_podcast": r["podcast_id"], "source_episode": r["source_episode"],
+            "source_text": r["source_text"][:150], "source_title": r["source_title"],
+            "target_podcast": r["target_podcast"], "target_episode": r["target_episode"],
+            "target_text": r["target_text"][:150], "target_title": r["target_title"],
+            "score": r["score"]
+        } for r in top_links]
+    }
+
+
 @app.get("/api/semantic-search")
 def semantic_search(q: str = Query(..., min_length=3), podcast_id: Optional[str] = None, limit: int = 20):
     """Semantic search using query embedding."""
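Both new endpoints degrade gracefully: `/words` reports `available: false` until the `words` table has been populated, and `/api/compare` returns zero cross-links while `semantic_links` is absent. A minimal smoke test, assuming the server runs on localhost:8000 and that the podcast and episode IDs used here (`neu-denken`, `S1E1`, `other-podcast`) are illustrative and exist in your database:

```python
import json
import urllib.request

BASE = "http://localhost:8000"  # assumed local dev server

def get(path: str) -> dict:
    """Fetch a JSON endpoint and decode the response body."""
    with urllib.request.urlopen(BASE + path) as resp:
        return json.loads(resp.read())

# Word timestamps: "available" stays False until import_words.py has run.
words = get("/api/podcasts/neu-denken/transcript/S1E1/words")
print(words["available"], len(words["words"]))

# Comparison: cross_links_count stays 0 until semantic_links is populated.
comparison = get("/api/compare?a=neu-denken&b=other-podcast")
print(comparison["stats"], comparison["shared_topics"][:5])
```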
AND target_podcast = ?)", + (a, b, b, a) + ).fetchone()["c"] + + top_links = db.execute( + "SELECT sl.*, p1.text as source_text, p2.text as target_text, " + "e1.title as source_title, e2.title as target_title " + "FROM semantic_links sl " + "JOIN paragraphs p1 ON sl.podcast_id = p1.podcast_id AND sl.source_episode = p1.episode_id AND sl.source_idx = p1.idx " + "JOIN paragraphs p2 ON sl.target_podcast = p2.podcast_id AND sl.target_episode = p2.episode_id AND sl.target_idx = p2.idx " + "JOIN episodes e1 ON sl.podcast_id = e1.podcast_id AND sl.source_episode = e1.id " + "JOIN episodes e2 ON sl.target_podcast = e2.podcast_id AND sl.target_episode = e2.id " + "WHERE (sl.podcast_id = ? AND sl.target_podcast = ?) OR (sl.podcast_id = ? AND sl.target_podcast = ?) " + "ORDER BY sl.score DESC LIMIT 20", + (a, b, b, a) + ).fetchall() + except Exception: + pass # semantic_links table may not exist yet + + db.close() + + return { + "stats": stats, + "shared_topics": shared, + "only_in": {a: only_a, b: only_b}, + "cross_links_count": cross_links, + "top_cross_links": [{ + "source_podcast": r["podcast_id"], "source_episode": r["source_episode"], + "source_text": r["source_text"][:150], "source_title": r["source_title"], + "target_podcast": r["target_podcast"], "target_episode": r["target_episode"], + "target_text": r["target_text"][:150], "target_title": r["target_title"], + "score": r["score"] + } for r in top_links] + } + + @app.get("/api/semantic-search") def semantic_search(q: str = Query(..., min_length=3), podcast_id: Optional[str] = None, limit: int = 20): """Semantic search using query embedding.""" diff --git a/scripts/import_words.py b/scripts/import_words.py new file mode 100644 index 0000000..df4f7ce --- /dev/null +++ b/scripts/import_words.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +"""Importiert Wort-Level-Timestamps in die SQLite-Datenbank. + +Liest *.words.json-Dateien und schreibt in die Tabelle `words`. + +Nutzung: + python3 import_words.py [db-pfad] + +Beispiel: + python3 import_words.py neu-denken ../data/neu-denken/words/ ../data/db.sqlite +""" + +import json +import os +import sys +import sqlite3 +from pathlib import Path + + +def init_words_table(db): + """Erstelle words-Tabelle falls nicht vorhanden.""" + db.executescript(""" + CREATE TABLE IF NOT EXISTS words ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + podcast_id TEXT NOT NULL, + episode_id TEXT NOT NULL, + segment_idx INTEGER NOT NULL, + word_idx INTEGER NOT NULL, + word TEXT NOT NULL, + start_time REAL NOT NULL, + end_time REAL NOT NULL, + UNIQUE(podcast_id, episode_id, segment_idx, word_idx) + ); + CREATE INDEX IF NOT EXISTS idx_words_episode ON words(podcast_id, episode_id); + CREATE INDEX IF NOT EXISTS idx_words_time ON words(podcast_id, episode_id, start_time); + """) + + +def import_words_file(db, podcast_id: str, words_file: Path): + """Importiere eine *.words.json-Datei.""" + data = json.loads(words_file.read_text()) + episode_name = data["episode"] + + # Episode-ID aus Dateinamen: S1E1-Wachstum → S1E1 + episode_id = episode_name.split("-")[0] + + # Alte Einträge löschen + db.execute("DELETE FROM words WHERE podcast_id = ? 
AND episode_id = ?", (podcast_id, episode_id)) + + count = 0 + for seg_idx, segment in enumerate(data.get("segments", [])): + for word_idx, w in enumerate(segment.get("words", [])): + db.execute( + "INSERT INTO words (podcast_id, episode_id, segment_idx, word_idx, word, start_time, end_time) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + (podcast_id, episode_id, seg_idx, word_idx, w["word"], w["start"], w["end"]) + ) + count += 1 + + return count + + +def main(): + if len(sys.argv) < 3: + print(f"Nutzung: {sys.argv[0]} [db-pfad]") + sys.exit(1) + + podcast_id = sys.argv[1] + words_dir = Path(sys.argv[2]) + db_path = sys.argv[3] if len(sys.argv) > 3 else os.environ.get("DB_PATH", "data/db.sqlite") + + db = sqlite3.connect(db_path) + init_words_table(db) + + files = sorted(words_dir.glob("*.words.json")) + if not files: + print(f"Keine *.words.json-Dateien in {words_dir} gefunden.") + sys.exit(1) + + print(f"Importiere {len(files)} Dateien für Podcast '{podcast_id}'") + + total_words = 0 + for f in files: + count = import_words_file(db, podcast_id, f) + print(f" {f.stem}: {count} Wörter") + total_words += count + + db.commit() + db.close() + + print(f"Fertig: {total_words} Wörter importiert.") + + +if __name__ == "__main__": + main() diff --git a/scripts/transcribe_words.py b/scripts/transcribe_words.py new file mode 100644 index 0000000..197d35e --- /dev/null +++ b/scripts/transcribe_words.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +"""Batch-Transkription mit wortgenauen Timestamps via mlx-whisper. + +Erzeugt pro Episode eine JSON-Datei mit Wort-Level-Timing. +Läuft auf Apple Silicon (mlx-metal). + +Nutzung: + python3 transcribe_words.py /pfad/zu/audio/ /pfad/zu/output/ + python3 transcribe_words.py /pfad/zu/audio/S1E1-Wachstum.m4a # einzelne Datei + +Modell: whisper-large-v3-turbo (schnell + genau, ~1.5 GB VRAM) +""" + +import json +import os +import sys +import time +from pathlib import Path + +# ── Config ── +MODEL = "mlx-community/whisper-large-v3-turbo" +LANGUAGE = "de" +AUDIO_EXTENSIONS = {".m4a", ".mp3", ".wav", ".flac", ".ogg", ".opus"} + + +def transcribe_episode(audio_path: str, output_dir: str) -> dict: + """Transkribiere eine Episode mit Wort-Timestamps.""" + import mlx_whisper + + name = Path(audio_path).stem + output_file = Path(output_dir) / f"{name}.words.json" + + # Skip wenn bereits vorhanden + if output_file.exists(): + print(f" ⏭ {name} — bereits vorhanden, überspringe") + return json.loads(output_file.read_text()) + + print(f" ▶ {name} — transkribiere…") + t0 = time.time() + + result = mlx_whisper.transcribe( + audio_path, + path_or_hf_repo=MODEL, + language=LANGUAGE, + word_timestamps=True, + verbose=False, + condition_on_previous_text=True, + initial_prompt="NEU DENKEN Podcast mit Maja Göpel. 
diff --git a/webapp/icon-192.png b/webapp/icon-192.png
new file mode 100644
index 0000000..b959067
Binary files /dev/null and b/webapp/icon-192.png differ
diff --git a/webapp/icon-512.png b/webapp/icon-512.png
new file mode 100644
index 0000000..3a08494
Binary files /dev/null and b/webapp/icon-512.png differ
diff --git a/webapp/index.html b/webapp/index.html
index 8d8c368..d60c562 100644
--- a/webapp/index.html
+++ b/webapp/index.html
@@ -3,6 +3,10 @@
+  <link rel="manifest" href="manifest.json">
+  <link rel="apple-touch-icon" href="icon-192.png">
+  <meta name="theme-color" content="#ffffff">
+  <meta name="apple-mobile-web-app-capable" content="yes">
   <title>Podcast Mindmap</title>
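Note that copying `sw.js` into the image and linking the manifest is not enough on its own; the page still has to register the service worker, which this diff does not show. What can be checked from outside is that the new static assets are actually served. A hedged smoke test, assuming the backend mounts the `/static` directory under the `/static` URL path (not shown in this diff) and listens on the exposed port 8000:

```python
import urllib.request

BASE = "http://localhost:8000"  # assumed host:port for the container

# The four PWA assets the Dockerfile now copies into /static/.
for path in ("/static/manifest.json", "/static/sw.js",
             "/static/icon-192.png", "/static/icon-512.png"):
    with urllib.request.urlopen(BASE + path) as resp:
        print(f"{path}: HTTP {resp.status}")
```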