- Backend: /api/compare Endpoint für Podcast-Vergleich (Stats, gemeinsame Topics, Top-Querverbindungen), /api/.../words Endpoint für Wort-Timestamps - Frontend: Podcast-Vergleichsansicht mit Statistiken und Cross-Links, Cross-Podcast-Suche-Toggle, semantische Links im Transkript (lazy-loaded), Podcast-Switcher mit Zurück-Navigation - PWA: manifest.json, Service Worker (stale-while-revalidate für Assets, network-first für API, cache-on-success für Audio), Icons - Scripts: transcribe_words.py (mlx-whisper Batch-Transkription mit Wort-Timestamps), import_words.py (Wort-Timestamps in DB importieren) - Dockerfile: PWA-Assets in Container kopieren Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
96 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Importiert Wort-Level-Timestamps in die SQLite-Datenbank.
|
|
|
|
Liest *.words.json-Dateien und schreibt in die Tabelle `words`.
|
|
|
|
Nutzung:
|
|
python3 import_words.py <podcast_id> <words-json-verzeichnis> [db-pfad]
|
|
|
|
Beispiel:
|
|
python3 import_words.py neu-denken ../data/neu-denken/words/ ../data/db.sqlite
|
|
"""
import json
import os
import sys
import sqlite3
from pathlib import Path
|
def init_words_table(db):
    """Ensure the `words` table and its lookup indexes exist.

    Safe to call repeatedly: all statements use IF NOT EXISTS.

    Args:
        db: An open sqlite3 connection.
    """
    schema_sql = """
        CREATE TABLE IF NOT EXISTS words (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            podcast_id TEXT NOT NULL,
            episode_id TEXT NOT NULL,
            segment_idx INTEGER NOT NULL,
            word_idx INTEGER NOT NULL,
            word TEXT NOT NULL,
            start_time REAL NOT NULL,
            end_time REAL NOT NULL,
            UNIQUE(podcast_id, episode_id, segment_idx, word_idx)
        );
        CREATE INDEX IF NOT EXISTS idx_words_episode ON words(podcast_id, episode_id);
        CREATE INDEX IF NOT EXISTS idx_words_time ON words(podcast_id, episode_id, start_time);
    """
    db.executescript(schema_sql)
|
|
|
|
|
|
def import_words_file(db, podcast_id: str, words_file: Path) -> int:
    """Import one *.words.json file into the `words` table.

    Re-importing the same episode is idempotent: existing rows for the
    episode are deleted before the new ones are inserted.

    Args:
        db: Open sqlite3 connection; the `words` table must already exist.
        podcast_id: Identifier of the podcast the file belongs to.
        words_file: Path to a *.words.json file with the shape
            {"episode": str, "segments": [{"words": [{"word", "start", "end"}]}]}.

    Returns:
        Number of word rows inserted.
    """
    # Read explicitly as UTF-8 so transcripts with umlauts import correctly
    # regardless of the platform's default locale encoding.
    data = json.loads(words_file.read_text(encoding="utf-8"))
    episode_name = data["episode"]

    # Episode ID is the part before the first dash: S1E1-Wachstum -> S1E1
    episode_id = episode_name.split("-")[0]

    # Drop any previous rows for this episode so re-imports replace cleanly.
    db.execute(
        "DELETE FROM words WHERE podcast_id = ? AND episode_id = ?",
        (podcast_id, episode_id),
    )

    # Flatten all segments into parameter tuples and insert as one batch;
    # executemany is much faster than a per-word execute call.
    rows = [
        (podcast_id, episode_id, seg_idx, word_idx, w["word"], w["start"], w["end"])
        for seg_idx, segment in enumerate(data.get("segments", []))
        for word_idx, w in enumerate(segment.get("words", []))
    ]
    db.executemany(
        "INSERT INTO words (podcast_id, episode_id, segment_idx, word_idx, word, start_time, end_time) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)",
        rows,
    )
    return len(rows)
|
|
|
|
|
|
def main():
    """CLI entry point: import every *.words.json file for one podcast.

    Exits with status 1 on missing arguments or when no input files
    are found in the given directory.
    """
    if len(sys.argv) < 3:
        print(f"Nutzung: {sys.argv[0]} <podcast_id> <words-verzeichnis> [db-pfad]")
        sys.exit(1)

    podcast_id = sys.argv[1]
    words_dir = Path(sys.argv[2])
    # Precedence for the DB location: positional arg, then $DB_PATH, then default.
    db_path = sys.argv[3] if len(sys.argv) > 3 else os.environ.get("DB_PATH", "data/db.sqlite")

    connection = sqlite3.connect(db_path)
    init_words_table(connection)

    json_files = sorted(words_dir.glob("*.words.json"))
    if not json_files:
        print(f"Keine *.words.json-Dateien in {words_dir} gefunden.")
        sys.exit(1)

    print(f"Importiere {len(json_files)} Dateien für Podcast '{podcast_id}'")

    total_words = 0
    for json_file in json_files:
        imported = import_words_file(connection, podcast_id, json_file)
        print(f" {json_file.stem}: {imported} Wörter")
        total_words += imported

    connection.commit()
    connection.close()

    print(f"Fertig: {total_words} Wörter importiert.")


if __name__ == "__main__":
    main()
|