feat(tour): ElevenLabs-Voice für die Tour (#185 Phase 2)

Audio-Backend: - ``app/tour_audio.py`` ruft ElevenLabs-TTS mit voice_id=Domi (AZnzlk1XvdvUeBnXmlld) und model=eleven_multilingual_v2. ENV-konfiguriert via ``ELEVENLABS_API_KEY``, ``ELEVENLABS_VOICE_ID``, ``ELEVENLABS_MODEL_ID``. - Voice-Settings: stability 0.55, similarity_boost 0.7 (warm, klar, natürlich). - Caching: SHA-256(voice|model|text) → ``data/tour_audio/<hash>.mp3``. Folgeabrufe gehen aus dem Datei-Cache, kein API-Quota-Verbrauch. Endpoint: ``GET /api/tour/voice?text=...`` rate-limited 30/min, liefert audio/mpeg mit Cache-Control 30 Tage. Bei fehlendem API-Key 503 — Frontend fällt dann auf ``speechSynthesis`` zurück (Browser-eingebaute Stimme). Frontend (tour.html): - ``speak()`` versucht erst Server-Audio (ElevenLabs), bei 503/Fehler Fallback auf Web Speech API. - Session-Cache via Blob-URL: Vor/Zurück-Navigation in der Tour löst nicht jedes Mal einen neuen Netzwerk-Roundtrip aus. - ``stopSpeak()`` stoppt beide Audio-Pfade sauber. Konfiguration für dev: ``ELEVENLABS_API_KEY`` und (optional) ``ELEVENLABS_VOICE_ID`` in ``/opt/gwoe-antragspruefer-dev/.env`` setzen, dann den Container neu starten.
2026-05-09 03:17:06 +02:00 · 2026-05-09 03:17:06 +02:00 · 6ec05d2b86
commit 6ec05d2b86
parent e31ee1ad07
3 changed files with 198 additions and 3 deletions
--- a/app/main.py
+++ b/app/main.py
@ -2129,6 +2129,36 @@ async def quellen_search(
    })
@app.get("/api/tour/voice")
@limiter.limit("30/minute")
async def tour_voice(request: Request, text: str = Query(..., min_length=2, max_length=2000)):
    """Return the MP3 narration for a tour explanation text (#185).

    The audio is produced by ``tour_audio.get_or_generate``, which serves it
    from the file cache or generates it via ElevenLabs-TTS.

    Responses:
    - 200 ``audio/mpeg`` with a 30-day ``Cache-Control`` header on success.
    - 503 when ``tour_audio.is_available()`` is false (no
      ``ELEVENLABS_API_KEY``); the frontend then falls back to the
      browser's built-in ``speechSynthesis`` voice.
    - 502 when ``get_or_generate`` returns ``None``.

    NOTE(review): every distinct ``text`` can trigger a paid TTS call and a
    new cache file; only the 30/minute rate limit bounds this — confirm that
    is acceptable for this endpoint.
    """
    from fastapi.responses import Response

    from .tour_audio import get_or_generate, is_available

    if not is_available():
        raise HTTPException(
            status_code=503,
            detail="ElevenLabs nicht konfiguriert (ELEVENLABS_API_KEY fehlt)",
        )

    mp3_bytes = await get_or_generate(text)
    if mp3_bytes is None:
        raise HTTPException(status_code=502, detail="TTS-Generierung fehlgeschlagen")

    # 2592000 seconds = 30 days of browser caching.
    cache_headers = {"Cache-Control": "public, max-age=2592000"}
    return Response(content=mp3_bytes, media_type="audio/mpeg", headers=cache_headers)
@app.get("/api/wahlprogramm-cite")
 async def wahlprogramm_cite(
    request: Request,
--- a/app/templates/v3/components/tour.html
+++ b/app/templates/v3/components/tour.html
@ -152,11 +152,14 @@
  let _resolvedSteps = [];   // STEPS filtered down to elements that actually exist
  let _tourMuted = false;    // while true, speak() does nothing
  let _tourUtter = null;
  let _tourAudio = null;    // <audio> element playing the ElevenLabs MP3
  // Session-only cache (text -> blob URL): avoids repeated API round trips
  // during back/forward navigation. The server caches the MP3s anyway.
  // Created without a prototype so that a text such as "constructor" or
  // "__proto__" can never be mistaken for a cached entry.
  const _audioBlobCache = Object.create(null);
  /** Shorthand for document.getElementById: returns the element with the given id. */
  function $(id) {
    const element = document.getElementById(id);
    return element;
  }
-  function speak(text) {
+  function speakWebSpeech(text) {
    if (_tourMuted) return;
    if (!('speechSynthesis' in window)) return;
    try {
      window.speechSynthesis.cancel();
@ -164,7 +167,6 @@
      u.lang = 'de-DE';
      u.rate = 1.0;
      u.pitch = 1.0;
      // Versuche eine deutsche, möglichst weibliche Stimme zu wählen.
      const voices = window.speechSynthesis.getVoices();
      const de = voices.filter(v => /de(-DE|_DE)?/i.test(v.lang));
      const female = de.find(v => /female|frau|anna|petra|katja|helga|marlene|vicki/i.test(v.name));
@ -174,11 +176,54 @@
    } catch (_) { /* TTS optional */ }
  }
  /**
   * Plays the narration for `text` using the server-side voice endpoint.
   *
   * The endpoint calls ElevenLabs and caches the MP3; fetched audio is also
   * kept in `_audioBlobCache` as a blob URL for the rest of the session.
   *
   * @param {string} text - narration text of the current tour step.
   * @returns {Promise<boolean>} `true` when nothing more needs to be done
   *   (audio was started, or this request was superseded / muted while it
   *   was in flight); `false` when the caller should fall back to Web
   *   Speech (503 = ElevenLabs not configured, other HTTP error, or a
   *   network failure).
   */
  async function speakElevenLabs(text) {
    // Request counter kept on the function itself: a fetch that finishes
    // after a newer call has started must not play its (now outdated) audio.
    const requestId = (speakElevenLabs._latestRequest || 0) + 1;
    speakElevenLabs._latestRequest = requestId;
    const superseded = () => speakElevenLabs._latestRequest !== requestId;

    const cachedUrl = _audioBlobCache[text];
    if (cachedUrl) {
      playAudio(cachedUrl);
      return true;
    }
    try {
      const url = '/api/tour/voice?text=' + encodeURIComponent(text);
      const resp = await fetch(url);
      if (!resp.ok) {
        // The newer call handles its own fallback; stay silent here.
        if (superseded()) return true;
        // 503 means "ElevenLabs not configured" and is expected, so no warning.
        if (resp.status !== 503) {
          console.warn('Tour-Audio ' + resp.status + ': fallback auf Web Speech');
        }
        return false;
      }
      const blob = await resp.blob();
      // Reuse the URL if a parallel request for the same text stored one
      // already, instead of creating a second blob URL for the same audio.
      const blobUrl = _audioBlobCache[text] || URL.createObjectURL(blob);
      _audioBlobCache[text] = blobUrl;
      // Keep the audio cached, but do not start it if the user moved on or
      // muted the tour while the request was in flight.
      if (superseded() || _tourMuted) return true;
      playAudio(blobUrl);
      return true;
    } catch (e) {
      if (superseded()) return true;
      console.warn('Tour-Audio-Fehler, fallback Web Speech:', e);
      return false;
    }
  }
  /**
   * Starts playback of the MP3 behind `blobUrl`, replacing any narration
   * that is currently running.
   *
   * @param {string} blobUrl - URL handed to the Audio constructor.
   */
  function playAudio(blobUrl) {
    // Silence the Web Speech path and any previous <audio> element first.
    stopSpeak();
    const player = new Audio(blobUrl);
    _tourAudio = player;
    const started = player.play();
    // A rejection here means the browser blocked autoplay, which is harmless.
    started.catch(() => { /* Autoplay-Block, harmless */ });
  }
  /**
   * Speaks `text` for the current tour step.
   *
   * Prefers the server-generated ElevenLabs audio; when that is not
   * available (speakElevenLabs resolves to false) it falls back to the
   * browser's Web Speech API. Does nothing while the tour is muted.
   *
   * @param {string} text - narration text of the current tour step.
   * @returns {Promise<void>}
   */
  async function speak(text) {
    if (_tourMuted) return;
    // Call counter kept on the function itself: if a newer speak() started
    // while we were waiting for the server, our fallback would cancel the
    // newer narration and read outdated text.
    const callId = (speak._latestCall || 0) + 1;
    speak._latestCall = callId;

    const ok = await speakElevenLabs(text);
    if (ok) return;
    if (callId !== speak._latestCall) return;   // superseded while waiting
    speakWebSpeech(text);
  }
  /**
   * Stops narration on both audio paths: cancels Web Speech output and
   * pauses/rewinds the <audio> element, then clears both references.
   */
  function stopSpeak() {
    // Path 1: browser speech synthesis (only if the browser offers it).
    const hasWebSpeech = 'speechSynthesis' in window;
    if (hasWebSpeech) {
      try {
        window.speechSynthesis.cancel();
      } catch (_) {}
    }
    _tourUtter = null;

    // Path 2: the <audio> element used for the server-generated MP3.
    const player = _tourAudio;
    if (!player) return;
    try {
      player.pause();
      player.currentTime = 0;
    } catch (_) {}
    _tourAudio = null;
  }
  function resolveSteps() {
--- a/app/tour_audio.py
+++ b/app/tour_audio.py
@ -0,0 +1,120 @@
 """Tour-Audio-Generator über ElevenLabs (#185 Phase 2).
 Architektur:
 - Pro Tour-Station ein Text-String. Wir hashen (text, voice_id, model_id) und
  cachen die fertige MP3 im ``data/tour_audio/<hash>.mp3``-Cache.
 - Beim ersten Abruf wird die ElevenLabs-Text-to-Speech-API aufgerufen,
  die MP3 gespeichert, dann ausgeliefert.
 - Folgeabrufe gehen direkt aus dem Cache. Kein Re-API-Call solange der
  Text identisch bleibt.
 ENV:
 - ``ELEVENLABS_API_KEY``    — Pflicht; ohne fällt die Tour auf
                              browser-internes ``speechSynthesis`` zurück.
 - ``ELEVENLABS_VOICE_ID``   — optional; Default Domi (AZnzlk1XvdvUeBnXmlld).
 - ``ELEVENLABS_MODEL_ID``   — optional; Default ``eleven_multilingual_v2``.
 Caching-Strategie: SHA-256(text + voice_id + model_id) als Dateiname.
 Damit:
 - Text-Edit → neuer Hash → frische Generierung.
 - Voice-Wechsel → neuer Hash, alte Voice bleibt im Cache (kann manuell
  weggeräumt werden).
 """
 import hashlib
 import logging
 import os
 from pathlib import Path
 from typing import Optional
 import httpx
 from .config import settings
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # Fallbacks used by _config() when the matching ENV variable is unset or empty.
 DEFAULT_VOICE_ID = "AZnzlk1XvdvUeBnXmlld"  # Domi
 DEFAULT_MODEL_ID = "eleven_multilingual_v2"
 # URL template; ``{voice_id}`` is filled in with str.format() before each request.
 ELEVENLABS_TTS_URL = "https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
 # Directory for the cached MP3 files, located below the configured data directory.
 CACHE_DIR = settings.data_dir / "tour_audio"
 def _config() -> tuple[Optional[str], str, str]:
    """API-Key (None wenn nicht gesetzt) + Voice-ID + Model-ID."""
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    voice_id = os.environ.get("ELEVENLABS_VOICE_ID") or DEFAULT_VOICE_ID
    model_id = os.environ.get("ELEVENLABS_MODEL_ID") or DEFAULT_MODEL_ID
    return (api_key or None), voice_id, model_id
 def is_available() -> bool:
     """Return True when the environment provides an ElevenLabs API key."""
     api_key, _voice_id, _model_id = _config()
     return api_key is not None
 def _cache_key(text: str, voice_id: str, model_id: str) -> str:
    h = hashlib.sha256(f"{voice_id}|{model_id}|{text}".encode("utf-8")).hexdigest()
    return h[:32]
 def _cache_path(text: str, voice_id: str, model_id: str) -> Path:
     """Return the cache file path for a (text, voice, model) combination.

     Side effect: creates ``CACHE_DIR`` (including parents) if it does not
     exist yet. The file itself is not created here.
     """
     CACHE_DIR.mkdir(parents=True, exist_ok=True)
     filename = _cache_key(text, voice_id, model_id) + ".mp3"
     return CACHE_DIR / filename
 async def get_or_generate(text: str) -> Optional[bytes]:
     """Return the MP3 bytes for ``text``, from the file cache or freshly generated.

     On a cache miss the ElevenLabs text-to-speech API is called and the
     result is stored in the cache. The cache file is written to a temporary
     name and then renamed, so readers never see a half-written MP3. A
     failure to write the cache is logged but does not discard the audio.

     Args:
         text: The narration text; must be non-empty and at most 5000 chars.

     Returns:
         The MP3 bytes, or ``None`` when
         - no API key is configured (the tour frontend then falls back to
           the browser's built-in ``speechSynthesis``),
         - ``text`` is empty or too long,
         - ElevenLabs answers with a non-200 status or an empty body, or
         - the request itself fails (timeout, network error, ...).
     """
     api_key, voice_id, model_id = _config()
     if not api_key:
         return None
     if not text or len(text) > 5000:
         logger.warning("Tour-Audio: Text leer oder zu lang (%d chars)", len(text or ""))
         return None

     cache_file = _cache_path(text, voice_id, model_id)
     try:
         cached = cache_file.read_bytes()
     except FileNotFoundError:
         cached = b""  # cache miss
     if cached:
         return cached
     # An empty cache file (e.g. left over from an interrupted write) is
     # treated like a miss and regenerated below.

     url = ELEVENLABS_TTS_URL.format(voice_id=voice_id)
     payload = {
         "text": text,
         "model_id": model_id,
         "voice_settings": {
             # For a warm, clear narrator voice: fairly high stability
             # (= calm), medium similarity_boost (= natural, not over-polished).
             "stability": 0.55,
             "similarity_boost": 0.7,
             "style": 0.0,
             "use_speaker_boost": True,
         },
     }
     headers = {
         "xi-api-key": api_key,
         "Content-Type": "application/json",
         "Accept": "audio/mpeg",
     }

     try:
         async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client:
             r = await client.post(url, headers=headers, json=payload)
         if r.status_code != 200:
             logger.warning(
                 "ElevenLabs-TTS Status %d für voice=%s: %s",
                 r.status_code, voice_id, r.text[:200],
             )
             return None
         audio = r.content
     except Exception:
         # Best effort: any failure of the remote call means "no audio".
         logger.exception("ElevenLabs-TTS-Aufruf fehlgeschlagen")
         return None

     if not audio:
         # Never cache an empty body; it would be served as broken audio.
         logger.warning("ElevenLabs-TTS lieferte leere Antwort für voice=%s", voice_id)
         return None

     # Write to a temporary sibling file first and rename it into place, so a
     # crash or a parallel reader never observes a truncated cache file.
     tmp_file = cache_file.with_name(f"{cache_file.name}.{os.getpid()}.tmp")
     try:
         tmp_file.write_bytes(audio)
         tmp_file.replace(cache_file)
     except OSError:
         # The audio was generated successfully, so still deliver it.
         logger.exception("Tour-Audio: Cache-Schreiben fehlgeschlagen (%s)", cache_file.name)
         try:
             tmp_file.unlink(missing_ok=True)
         except OSError:
             pass
     else:
         logger.info(
             "Tour-Audio cached: %s (%d bytes, voice=%s)",
             cache_file.name, len(audio), voice_id,
         )
     return audio