podcast-mindmap/scripts/import_words.py
Dotty Dotter e678f75ee1 #8 Multi-Podcast-Dashboard, #9 PWA, #10 Cross-Podcast-Links, #12 Wort-Timestamps
- Backend: /api/compare Endpoint für Podcast-Vergleich (Stats, gemeinsame Topics,
  Top-Querverbindungen), /api/.../words Endpoint für Wort-Timestamps
- Frontend: Podcast-Vergleichsansicht mit Statistiken und Cross-Links,
  Cross-Podcast-Suche-Toggle, semantische Links im Transkript (lazy-loaded),
  Podcast-Switcher mit Zurück-Navigation
- PWA: manifest.json, Service Worker (stale-while-revalidate für Assets,
  network-first für API, cache-on-success für Audio), Icons
- Scripts: transcribe_words.py (mlx-whisper Batch-Transkription mit Wort-Timestamps),
  import_words.py (Wort-Timestamps in DB importieren)
- Dockerfile: PWA-Assets in Container kopieren

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-23 20:53:06 +02:00

96 lines
2.8 KiB
Python

#!/usr/bin/env python3
"""Importiert Wort-Level-Timestamps in die SQLite-Datenbank.
Liest *.words.json-Dateien und schreibt in die Tabelle `words`.
Nutzung:
python3 import_words.py <podcast_id> <words-json-verzeichnis> [db-pfad]
Beispiel:
python3 import_words.py neu-denken ../data/neu-denken/words/ ../data/db.sqlite
"""
import json
import os
import sys
import sqlite3
from pathlib import Path
def init_words_table(db):
    """Create the `words` table and its lookup indexes if they are missing.

    Idempotent: uses IF NOT EXISTS throughout, so it is safe to call on
    every run before importing.
    """
    schema = """
    CREATE TABLE IF NOT EXISTS words (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        podcast_id TEXT NOT NULL,
        episode_id TEXT NOT NULL,
        segment_idx INTEGER NOT NULL,
        word_idx INTEGER NOT NULL,
        word TEXT NOT NULL,
        start_time REAL NOT NULL,
        end_time REAL NOT NULL,
        UNIQUE(podcast_id, episode_id, segment_idx, word_idx)
    );
    CREATE INDEX IF NOT EXISTS idx_words_episode ON words(podcast_id, episode_id);
    CREATE INDEX IF NOT EXISTS idx_words_time ON words(podcast_id, episode_id, start_time);
    """
    db.executescript(schema)
def import_words_file(db, podcast_id: str, words_file: Path) -> int:
    """Import one *.words.json file into the `words` table.

    Deletes any previously imported rows for the same (podcast, episode)
    first, so re-running the import is idempotent, then inserts one row
    per word with its start/end timestamps.

    Args:
        db: Open sqlite3 connection with the `words` table present.
        podcast_id: Podcast identifier the rows are filed under.
        words_file: Path to a *.words.json file with an "episode" name
            and a "segments" list of {"words": [{"word", "start", "end"}]}.

    Returns:
        Number of word rows inserted.
    """
    # Whisper output is UTF-8; be explicit so the platform default
    # encoding (e.g. cp1252 on Windows) cannot corrupt the read.
    data = json.loads(words_file.read_text(encoding="utf-8"))
    episode_name = data["episode"]
    # Episode id is the prefix before the first dash: S1E1-Wachstum -> S1E1
    episode_id = episode_name.split("-")[0]
    # Remove stale rows from a previous import of this episode.
    db.execute(
        "DELETE FROM words WHERE podcast_id = ? AND episode_id = ?",
        (podcast_id, episode_id),
    )
    rows = [
        (podcast_id, episode_id, seg_idx, word_idx, w["word"], w["start"], w["end"])
        for seg_idx, segment in enumerate(data.get("segments", []))
        for word_idx, w in enumerate(segment.get("words", []))
    ]
    # Batch insert instead of one execute() per word.
    db.executemany(
        "INSERT INTO words (podcast_id, episode_id, segment_idx, word_idx, word, start_time, end_time) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)",
        rows,
    )
    return len(rows)
def main():
    """CLI entry point: import every *.words.json file in a directory.

    Usage: import_words.py <podcast_id> <words-dir> [db-path]
    The DB path falls back to the DB_PATH env var, then to data/db.sqlite.
    Exits with status 1 on bad arguments or an empty directory.
    """
    if len(sys.argv) < 3:
        print(f"Nutzung: {sys.argv[0]} <podcast_id> <words-verzeichnis> [db-pfad]")
        sys.exit(1)
    podcast_id = sys.argv[1]
    words_dir = Path(sys.argv[2])
    # Precedence: CLI argument > DB_PATH environment variable > default.
    db_path = sys.argv[3] if len(sys.argv) > 3 else os.environ.get("DB_PATH", "data/db.sqlite")
    db = sqlite3.connect(db_path)
    try:
        init_words_table(db)
        # Sorted for deterministic, episode-ordered progress output.
        files = sorted(words_dir.glob("*.words.json"))
        if not files:
            print(f"Keine *.words.json-Dateien in {words_dir} gefunden.")
            sys.exit(1)
        print(f"Importiere {len(files)} Dateien für Podcast '{podcast_id}'")
        total_words = 0
        for f in files:
            count = import_words_file(db, podcast_id, f)
            print(f" {f.stem}: {count} Wörter")
            total_words += count
        # Single commit after all files keeps the import atomic.
        db.commit()
    finally:
        # Close even when sys.exit() or an import error fires above;
        # the original leaked the connection on the empty-dir branch.
        db.close()
    print(f"Fertig: {total_words} Wörter importiert.")


if __name__ == "__main__":
    main()