From 6ec05d2b86f82e602640b2b7199cea23daf15816 Mon Sep 17 00:00:00 2001
From: Dotty Dotter <dotty@Mac.wideopen.space>
Date: Sat, 9 May 2026 03:17:06 +0200
Subject: [PATCH] =?UTF-8?q?feat(tour):=20ElevenLabs-Voice=20f=C3=BCr=20die?=
 =?UTF-8?q?=20Tour=20(#185=20Phase=202)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Audio-Backend:
- ``app/tour_audio.py`` ruft ElevenLabs-TTS mit voice_id=Domi
  (AZnzlk1XvdvUeBnXmlld) und model=eleven_multilingual_v2. ENV-konfiguriert
  via ``ELEVENLABS_API_KEY``, ``ELEVENLABS_VOICE_ID``, ``ELEVENLABS_MODEL_ID``.
- Voice-Settings: stability 0.55, similarity_boost 0.7 (warm, klar, natürlich).
- Caching: SHA-256(voice|model|text), auf 32 Hex-Zeichen gekürzt → ``data/tour_audio/<hash>.mp3``.
  Folgeabrufe gehen aus dem Datei-Cache, kein API-Quota-Verbrauch.

Endpoint: ``GET /api/tour/voice?text=...`` rate-limited 30/min,
liefert audio/mpeg mit Cache-Control 30 Tage. Bei fehlendem
API-Key 503, bei fehlgeschlagener Generierung 502 — in beiden Fällen
fällt das Frontend auf ``speechSynthesis`` zurück (Browser-eingebaute Stimme).

Frontend (tour.html):
- ``speak()`` versucht erst Server-Audio (ElevenLabs), bei 503/Fehler
  Fallback auf Web Speech API.
- Session-Cache via Blob-URL: Vor/Zurück-Navigation in der Tour löst
  nicht jedes Mal einen neuen Network-Roundtrip aus.
- ``stopSpeak()`` stoppt beide Audio-Pfade sauber.

Konfiguration für dev: ``ELEVENLABS_API_KEY`` und (optional)
``ELEVENLABS_VOICE_ID`` in ``/opt/gwoe-antragspruefer-dev/.env`` setzen,
dann Container restart.
---
 app/main.py                           |  30 +++++++
 app/templates/v3/components/tour.html |  51 ++++++++++-
 app/tour_audio.py                     | 120 ++++++++++++++++++++++++++
 3 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 app/tour_audio.py
diff --git a/app/main.py b/app/main.py
index 947cc3f..3ceeac3 100644
--- a/app/main.py
+++ b/app/main.py
@@ -2129,6 +2129,36 @@ async def quellen_search(
     })
 
 
+@app.get("/api/tour/voice")
+@limiter.limit("30/minute")
+async def tour_voice(request: Request, text: str = Query(..., min_length=2, max_length=2000)):
+    """Serve an MP3 narration (generated or cached) for a tour text (#185).
+
+    Responds 503 when ``is_available()`` is false, i.e. the ENV variable
+    ``ELEVENLABS_API_KEY`` is not set, so the frontend can fall back to
+    the browser's built-in ``speechSynthesis``. Responds 502 when
+    ``get_or_generate`` returns None.
+
+    Caching lives in ``tour_audio.get_or_generate``; on success the MP3 is
+    returned as ``audio/mpeg`` with a 30-day ``Cache-Control`` header.
+    """
+    from .tour_audio import get_or_generate, is_available
+    if not is_available():
+        raise HTTPException(
+            status_code=503,
+            detail="ElevenLabs nicht konfiguriert (ELEVENLABS_API_KEY fehlt)",
+        )
+    audio = await get_or_generate(text)
+    if audio is None:
+        raise HTTPException(status_code=502, detail="TTS-Generierung fehlgeschlagen")
+    from fastapi.responses import Response
+    return Response(
+        content=audio,
+        media_type="audio/mpeg",
+        headers={"Cache-Control": "public, max-age=2592000"},   # 30-day browser cache
+    )
+
+
 @app.get("/api/wahlprogramm-cite")
 async def wahlprogramm_cite(
     request: Request,
diff --git a/app/templates/v3/components/tour.html b/app/templates/v3/components/tour.html
index 6bd67de..f53b67a 100644
--- a/app/templates/v3/components/tour.html
+++ b/app/templates/v3/components/tour.html
@@ -152,11 +152,14 @@
   let _resolvedSteps = [];   // STEPS gefiltert auf vorhandene Elemente
   let _tourMuted = false;
   let _tourUtter = null;
+  let _tourAudio = null;    // <audio> element currently playing the server MP3
+  // Per-page text → blob-URL cache: avoids repeated requests on back/forward
+  // navigation. Prototype-less, so a text like "constructor" cannot hit an inherited key.
+  const _audioBlobCache = Object.create(null);
 
   function $(id) { return document.getElementById(id); }
 
-  function speak(text) {
-    if (_tourMuted) return;
+  function speakWebSpeech(text) {
     if (!('speechSynthesis' in window)) return;
     try {
       window.speechSynthesis.cancel();
@@ -164,7 +167,6 @@
       u.lang = 'de-DE';
       u.rate = 1.0;
       u.pitch = 1.0;
-      // Versuche eine deutsche, möglichst weibliche Stimme zu wählen.
       const voices = window.speechSynthesis.getVoices();
       const de = voices.filter(v => /de(-DE|_DE)?/i.test(v.lang));
       const female = de.find(v => /female|frau|anna|petra|katja|helga|marlene|vicki/i.test(v.name));
@@ -174,11 +176,54 @@
     } catch (_) { /* TTS optional */ }
   }
 
+  async function speakElevenLabs(text) {
+    // Returns true when server audio handled `text` (or the request went
+    // stale); false tells the caller to fall back to Web Speech.
+    const seq = speakElevenLabs._seq = (speakElevenLabs._seq || 0) + 1;
+    // A newer call or a mute during the await makes this request stale.
+    const stale = () => seq !== speakElevenLabs._seq || _tourMuted;
+    if (_audioBlobCache[text]) {
+      playAudio(_audioBlobCache[text]);
+      return true;
+    }
+    try {
+      const url = '/api/tour/voice?text=' + encodeURIComponent(text);
+      const resp = await fetch(url);
+      if (!resp.ok) {   // 503 = ElevenLabs not configured: expected, so no warning
+        if (resp.status !== 503) console.warn('Tour-Audio ' + resp.status + ': fallback auf Web Speech');
+        return stale();
+      }
+      const blobUrl = URL.createObjectURL(await resp.blob());
+      _audioBlobCache[text] = blobUrl;
+      if (!stale()) playAudio(blobUrl);
+      return true;
+    } catch (e) {
+      console.warn('Tour-Audio-Fehler, fallback Web Speech:', e);
+      return stale();
+    }
+  }
+
+  function playAudio(blobUrl) {
+    stopSpeak();   // silence whatever is currently playing first
+    const player = _tourAudio = new Audio(blobUrl);
+    player.play().catch(() => { /* autoplay blocked by the browser; ignore */ });
+  }
+
+  async function speak(text) {
+    if (_tourMuted) return;
+    // Prefer ElevenLabs (server); otherwise Web Speech, unless the user muted
+    // while the request was in flight.
+    if (!(await speakElevenLabs(text)) && !_tourMuted) speakWebSpeech(text);
+  }
+
   function stopSpeak() {
     if ('speechSynthesis' in window) {
       try { window.speechSynthesis.cancel(); } catch (_) {}
     }
     _tourUtter = null;
+    if (_tourAudio) {   // also halt playback of the server-provided MP3
+      try { _tourAudio.pause(); _tourAudio.currentTime = 0; } catch (_) {}
+      _tourAudio = null;   // forget the element so a repeated call skips this branch
+    }
   }
 
   function resolveSteps() {
diff --git a/app/tour_audio.py b/app/tour_audio.py
new file mode 100644
index 0000000..d8aecf6
--- /dev/null
+++ b/app/tour_audio.py
@@ -0,0 +1,120 @@
+"""Tour-Audio-Generator über ElevenLabs (#185 Phase 2).
+
+Architektur:
+- Pro Tour-Station ein Text-String. Wir hashen (voice_id, model_id, text)
+  und cachen die fertige MP3 unter ``data/tour_audio/<hash>.mp3``.
+- Beim ersten Abruf wird die ElevenLabs-Text-to-Speech-API aufgerufen,
+  die MP3 gespeichert, dann ausgeliefert.
+- Folgeabrufe gehen direkt aus dem Cache. Kein Re-API-Call solange der
+  Text identisch bleibt.
+
+ENV:
+- ``ELEVENLABS_API_KEY``    — Pflicht; ohne fällt die Tour auf
+                              browser-internes ``speechSynthesis`` zurück.
+- ``ELEVENLABS_VOICE_ID``   — optional; Default Domi (AZnzlk1XvdvUeBnXmlld).
+- ``ELEVENLABS_MODEL_ID``   — optional; Default ``eleven_multilingual_v2``.
+
+Caching-Strategie: SHA-256("voice_id|model_id|text"), erste 32 Hex-Zeichen, als Dateiname.
+Damit:
+- Text-Edit → neuer Hash → frische Generierung.
+- Voice-Wechsel → neuer Hash, alte Voice bleibt im Cache (kann manuell
+  weggeräumt werden).
+"""
+import hashlib
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+
+import httpx
+
+from .config import settings
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_VOICE_ID = "AZnzlk1XvdvUeBnXmlld"  # ElevenLabs voice "Domi"; override via ELEVENLABS_VOICE_ID
+DEFAULT_MODEL_ID = "eleven_multilingual_v2"  # override via ELEVENLABS_MODEL_ID
+ELEVENLABS_TTS_URL = "https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"  # formatted per voice_id
+
+CACHE_DIR = settings.data_dir / "tour_audio"  # on-disk MP3 cache; created lazily by _cache_path()
+
+
+def _config() -> tuple[Optional[str], str, str]:
+    """Return ``(api_key, voice_id, model_id)`` from ENV; the key is None if unset/empty."""
+    env = os.environ
+    voice = env.get("ELEVENLABS_VOICE_ID") or DEFAULT_VOICE_ID
+    model = env.get("ELEVENLABS_MODEL_ID") or DEFAULT_MODEL_ID
+    return env.get("ELEVENLABS_API_KEY") or None, voice, model
+
+
+def is_available() -> bool:
+    """Tell whether an ElevenLabs API key is configured (first ``_config()`` item)."""
+    return bool(_config()[0])
+
+
+def _cache_key(text: str, voice_id: str, model_id: str) -> str:
+    """Return the first 32 hex chars of SHA-256 over ``voice_id|model_id|text`` (UTF-8)."""
+    return hashlib.sha256("|".join((voice_id, model_id, text)).encode("utf-8")).hexdigest()[:32]
+
+
+def _cache_path(text: str, voice_id: str, model_id: str) -> Path:
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)  # side effect: make sure the cache dir exists
+    return CACHE_DIR / f"{_cache_key(text, voice_id, model_id)}.mp3"  # <cache key>.mp3 inside it
+
+
+async def get_or_generate(text: str) -> Optional[bytes]:
+    """Return the MP3 bytes for ``text``, from the disk cache or ElevenLabs.
+
+    Returns None when no API key is configured, when ``text`` is empty or
+    longer than 5000 characters, or when the ElevenLabs request fails
+    (network error, non-200 status, empty body). Caching is best-effort:
+    a failed cache write is logged and the audio is still returned.
+    """
+    api_key, voice_id, model_id = _config()
+    if not api_key:
+        return None
+
+    if not text or len(text) > 5000:
+        logger.warning("Tour-Audio: Text leer oder zu lang (%d chars)", len(text))
+        return None
+
+    cache_file = _cache_path(text, voice_id, model_id)
+    try:
+        cached = cache_file.read_bytes()
+    except FileNotFoundError:
+        cached = b""
+    if cached:  # an empty file counts as a miss and is regenerated
+        return cached
+
+    url = ELEVENLABS_TTS_URL.format(voice_id=voice_id)
+    payload = {
+        "text": text,
+        "model_id": model_id,
+        # Warm, clear narrator voice: fairly high stability (calm),
+        # medium similarity_boost (natural, not over-polished).
+        "voice_settings": {"stability": 0.55, "similarity_boost": 0.7,
+                           "style": 0.0, "use_speaker_boost": True},
+    }
+    headers = {"xi-api-key": api_key, "Content-Type": "application/json", "Accept": "audio/mpeg"}
+
+    try:
+        async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client:
+            r = await client.post(url, headers=headers, json=payload)
+    except Exception:
+        logger.exception("ElevenLabs-TTS-Aufruf fehlgeschlagen")
+        return None
+    if r.status_code != 200 or not r.content:
+        logger.warning("ElevenLabs-TTS Status %d für voice=%s: %s",
+                       r.status_code, voice_id, r.text[:200])
+        return None
+    audio = r.content
+    try:
+        # Temp file + rename, so concurrent readers never see a partial MP3.
+        tmp = cache_file.with_name(f"{cache_file.name}.{os.getpid()}.tmp")
+        tmp.write_bytes(audio)
+        tmp.replace(cache_file)
+        logger.info("Tour-Audio cached: %s (%d bytes, voice=%s)",
+                    cache_file.name, len(audio), voice_id)
+    except OSError:
+        logger.exception("Tour-Audio: Cache-Schreiben fehlgeschlagen")
+    return audio