feat(tour): ElevenLabs-Voice für die Tour (#185 Phase 2)
Audio-Backend: - ``app/tour_audio.py`` ruft ElevenLabs-TTS mit voice_id=Domi (AZnzlk1XvdvUeBnXmlld) und model=eleven_multilingual_v2. ENV-konfiguriert via ``ELEVENLABS_API_KEY``, ``ELEVENLABS_VOICE_ID``, ``ELEVENLABS_MODEL_ID``. - Voice-Settings: stability 0.55, similarity_boost 0.7 (warm, klar, natürlich). - Caching: SHA-256(voice|model|text) → ``data/tour_audio/<hash>.mp3``. Folgeabrufe gehen aus dem Datei-Cache, kein API-Quota-Verbrauch. Endpoint: ``GET /api/tour/voice?text=...`` rate-limited 30/min, liefert audio/mpeg mit Cache-Control 30 Tage. Bei fehlendem API-Key 503 — Frontend fällt dann auf ``speechSynthesis`` zurück (Browser-eingebaute Stimme). Frontend (tour.html): - ``speak()`` versucht erst Server-Audio (ElevenLabs), bei 503/Fehler Fallback auf Web Speech API. - Session-Cache via Blob-URL: Vor/Zurück-Navigation in der Tour löst nicht jedes Mal einen neuen Network-Roundtrip aus. - ``stopSpeak()`` stoppt beide Audio-Pfade sauber. Konfiguration für dev: ``ELEVENLABS_API_KEY`` und (optional) ``ELEVENLABS_VOICE_ID`` in ``/opt/gwoe-antragspruefer-dev/.env`` setzen, dann Container restart.
This commit is contained in:
parent
e31ee1ad07
commit
6ec05d2b86
30
app/main.py
30
app/main.py
@ -2129,6 +2129,36 @@ async def quellen_search(
|
|||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/tour/voice")
@limiter.limit("30/minute")
async def tour_voice(request: Request, text: str = Query(..., min_length=2, max_length=2000)):
    """Serve an MP3 narration for a tour explanation text (#185).

    The audio comes from ``get_or_generate`` in ``tour_audio``, which uses
    ElevenLabs TTS and a file cache keyed on (text, voice_id, model_id), so
    repeated requests for the same text do not consume API quota.

    Args:
        request: Incoming request; needed by the rate limiter decorator.
        text: Text to speak, 2 to 2000 characters (validated by ``Query``).

    Returns:
        An ``audio/mpeg`` response with a 30-day public ``Cache-Control``.

    Raises:
        HTTPException: 503 when ``ELEVENLABS_API_KEY`` is not configured, so
            the frontend can fall back to the browser's ``speechSynthesis``;
            502 when audio generation returned nothing.
    """
    from fastapi.responses import Response

    from .tour_audio import get_or_generate, is_available

    if not is_available():
        raise HTTPException(
            status_code=503,
            detail="ElevenLabs nicht konfiguriert (ELEVENLABS_API_KEY fehlt)",
        )

    mp3_bytes = await get_or_generate(text)
    if mp3_bytes is None:
        raise HTTPException(status_code=502, detail="TTS-Generierung fehlgeschlagen")

    # 2592000 seconds = 30 days of browser caching.
    cache_headers = {"Cache-Control": "public, max-age=2592000"}
    return Response(content=mp3_bytes, media_type="audio/mpeg", headers=cache_headers)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/wahlprogramm-cite")
|
@app.get("/api/wahlprogramm-cite")
|
||||||
async def wahlprogramm_cite(
|
async def wahlprogramm_cite(
|
||||||
request: Request,
|
request: Request,
|
||||||
|
|||||||
@ -152,11 +152,14 @@
|
|||||||
let _resolvedSteps = []; // STEPS gefiltert auf vorhandene Elemente
|
let _resolvedSteps = []; // STEPS gefiltert auf vorhandene Elemente
|
||||||
let _tourMuted = false;
|
let _tourMuted = false;
|
||||||
let _tourUtter = null;
|
let _tourUtter = null;
|
||||||
|
let _tourAudio = null; // <audio>-Element für ElevenLabs-MP3
|
||||||
|
// Cache nur in dieser Session: vermeidet doppelte API-Roundtrips bei
|
||||||
|
// Vor/Zurück-Navigation. Server-seitig sind die MP3s ohnehin gecacht.
|
||||||
|
const _audioBlobCache = {};
|
||||||
|
|
||||||
function $(id) { return document.getElementById(id); }
|
function $(id) { return document.getElementById(id); }
|
||||||
|
|
||||||
function speak(text) {
|
function speakWebSpeech(text) {
|
||||||
if (_tourMuted) return;
|
|
||||||
if (!('speechSynthesis' in window)) return;
|
if (!('speechSynthesis' in window)) return;
|
||||||
try {
|
try {
|
||||||
window.speechSynthesis.cancel();
|
window.speechSynthesis.cancel();
|
||||||
@ -164,7 +167,6 @@
|
|||||||
u.lang = 'de-DE';
|
u.lang = 'de-DE';
|
||||||
u.rate = 1.0;
|
u.rate = 1.0;
|
||||||
u.pitch = 1.0;
|
u.pitch = 1.0;
|
||||||
// Versuche eine deutsche, möglichst weibliche Stimme zu wählen.
|
|
||||||
const voices = window.speechSynthesis.getVoices();
|
const voices = window.speechSynthesis.getVoices();
|
||||||
const de = voices.filter(v => /de(-DE|_DE)?/i.test(v.lang));
|
const de = voices.filter(v => /de(-DE|_DE)?/i.test(v.lang));
|
||||||
const female = de.find(v => /female|frau|anna|petra|katja|helga|marlene|vicki/i.test(v.name));
|
const female = de.find(v => /female|frau|anna|petra|katja|helga|marlene|vicki/i.test(v.name));
|
||||||
@ -174,11 +176,54 @@
|
|||||||
} catch (_) { /* TTS optional */ }
|
} catch (_) { /* TTS optional */ }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Play the server-generated (ElevenLabs) narration for `text`.
 *
 * Uses the per-session blob-URL cache when possible, otherwise fetches
 * `/api/tour/voice` and caches the resulting blob URL.
 *
 * @param {string} text  Narration text of the current tour step.
 * @returns {Promise<boolean>} true when the request was handled (audio
 *   started, or the request became obsolete and must not trigger a
 *   fallback); false when the caller should fall back to Web Speech.
 */
async function speakElevenLabs(text) {
    // Each call takes a ticket. After the network round-trip we only play if
    // this is still the newest call and the user has not muted meanwhile;
    // otherwise a slow response would talk over a later step.
    // NOTE(review): a stopSpeak() issued while the request is in flight is
    // not detected here.
    const ticket = (speakElevenLabs._ticket = (speakElevenLabs._ticket || 0) + 1);
    const obsolete = () => ticket !== speakElevenLabs._ticket || _tourMuted;

    // Own-property check so texts like "constructor" never hit inherited keys.
    if (Object.prototype.hasOwnProperty.call(_audioBlobCache, text)) {
        playAudio(_audioBlobCache[text]);
        return true;
    }

    try {
        const url = '/api/tour/voice?text=' + encodeURIComponent(text);
        const resp = await fetch(url);
        // 503 = ElevenLabs not configured on the server. An obsolete request
        // reports "handled" so the caller does not start a stale fallback.
        if (resp.status === 503) return obsolete();
        if (!resp.ok) {
            console.warn('Tour-Audio ' + resp.status + ': fallback auf Web Speech');
            return obsolete();
        }
        const blob = await resp.blob();
        const blobUrl = URL.createObjectURL(blob);
        // Cache even when obsolete: the download is done and may be reused.
        _audioBlobCache[text] = blobUrl;
        if (!obsolete()) playAudio(blobUrl);
        return true;
    } catch (e) {
        console.warn('Tour-Audio-Fehler, fallback Web Speech:', e);
        return obsolete();
    }
}
|
||||||
|
|
||||||
|
/**
 * Stop whatever is currently speaking and start playing the given MP3.
 *
 * @param {string} blobUrl  URL passed to the Audio constructor.
 */
function playAudio(blobUrl) {
    stopSpeak();
    const player = new Audio(blobUrl);
    _tourAudio = player;
    // A rejected play() (e.g. autoplay blocked by the browser) is ignored.
    player.play().catch(() => {});
}
|
||||||
|
|
||||||
|
/**
 * Speak `text` for the current tour step.
 *
 * Tries the server-side ElevenLabs audio first and falls back to the
 * browser's Web Speech API when that is unavailable.
 *
 * @param {string} text  Narration text.
 * @returns {Promise<void>}
 */
async function speak(text) {
    if (_tourMuted) return;
    const ok = await speakElevenLabs(text);
    // Re-check mute after the await: the user may have muted while the
    // request was in flight, and speakWebSpeech() does not check it itself.
    if (ok || _tourMuted) return;
    speakWebSpeech(text);
}
|
||||||
|
|
||||||
/**
 * Silence both narration paths: the Web Speech utterance and the
 * ElevenLabs <audio> playback.
 */
function stopSpeak() {
    const hasWebSpeech = 'speechSynthesis' in window;
    if (hasWebSpeech) {
        try {
            window.speechSynthesis.cancel();
        } catch (_) {}
    }
    _tourUtter = null;

    const player = _tourAudio;
    if (!player) return;
    try {
        player.pause();
        player.currentTime = 0;
    } catch (_) {}
    _tourAudio = null;
}
|
||||||
|
|
||||||
function resolveSteps() {
|
function resolveSteps() {
|
||||||
|
|||||||
120
app/tour_audio.py
Normal file
120
app/tour_audio.py
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
"""Tour-Audio-Generator über ElevenLabs (#185 Phase 2).
|
||||||
|
|
||||||
|
Architektur:
|
||||||
|
- Pro Tour-Station ein Text-String. Wir hashen (text, voice_id, model_id) und
|
||||||
|
cachen die fertige MP3 im ``data/tour_audio/<hash>.mp3``-Cache.
|
||||||
|
- Beim ersten Abruf wird die ElevenLabs-Text-to-Speech-API aufgerufen,
|
||||||
|
die MP3 gespeichert, dann ausgeliefert.
|
||||||
|
- Folgeabrufe gehen direkt aus dem Cache. Kein Re-API-Call solange der
|
||||||
|
Text identisch bleibt.
|
||||||
|
|
||||||
|
ENV:
|
||||||
|
- ``ELEVENLABS_API_KEY`` — Pflicht; ohne fällt die Tour auf
|
||||||
|
browser-internes ``speechSynthesis`` zurück.
|
||||||
|
- ``ELEVENLABS_VOICE_ID`` — optional; Default Domi (AZnzlk1XvdvUeBnXmlld).
|
||||||
|
- ``ELEVENLABS_MODEL_ID`` — optional; Default ``eleven_multilingual_v2``.
|
||||||
|
|
||||||
|
Caching-Strategie: SHA-256(voice_id|model_id|text), gekürzt auf 32 Hex-Zeichen, als Dateiname.
|
||||||
|
Damit:
|
||||||
|
- Text-Edit → neuer Hash → frische Generierung.
|
||||||
|
- Voice-Wechsel → neuer Hash, alte Voice bleibt im Cache (kann manuell
|
||||||
|
weggeräumt werden).
|
||||||
|
"""
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from .config import settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DEFAULT_VOICE_ID = "AZnzlk1XvdvUeBnXmlld" # Domi
|
||||||
|
DEFAULT_MODEL_ID = "eleven_multilingual_v2"
|
||||||
|
ELEVENLABS_TTS_URL = "https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
|
||||||
|
|
||||||
|
CACHE_DIR = settings.data_dir / "tour_audio"
|
||||||
|
|
||||||
|
|
||||||
|
def _config() -> tuple[Optional[str], str, str]:
    """Read the ElevenLabs settings from the process environment.

    Returns:
        ``(api_key, voice_id, model_id)``. ``api_key`` is ``None`` when
        ``ELEVENLABS_API_KEY`` is unset or empty; voice and model fall back
        to the module defaults when their variables are unset or empty.
    """
    env = os.environ
    api_key = env.get("ELEVENLABS_API_KEY") or None
    voice_id = env.get("ELEVENLABS_VOICE_ID") or DEFAULT_VOICE_ID
    model_id = env.get("ELEVENLABS_MODEL_ID") or DEFAULT_MODEL_ID
    return api_key, voice_id, model_id
|
||||||
|
|
||||||
|
|
||||||
|
def is_available() -> bool:
    """Return True when an ElevenLabs API key is present in the environment."""
    api_key, _voice_id, _model_id = _config()
    return api_key is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _cache_key(text: str, voice_id: str, model_id: str) -> str:
    """Derive the cache file stem for a (text, voice, model) combination.

    Returns:
        The first 32 hex characters of the SHA-256 digest of
        ``"<voice_id>|<model_id>|<text>"`` encoded as UTF-8.
    """
    material = "|".join((voice_id, model_id, text))
    digest = hashlib.sha256(material.encode("utf-8"))
    return digest.hexdigest()[:32]
|
||||||
|
|
||||||
|
|
||||||
|
def _cache_path(text: str, voice_id: str, model_id: str) -> Path:
    """Return the cache file path for the given text, voice and model.

    Creates ``CACHE_DIR`` (including parents) if it does not exist yet.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    filename = _cache_key(text, voice_id, model_id) + ".mp3"
    return CACHE_DIR / filename
|
||||||
|
|
||||||
|
|
||||||
|
async def get_or_generate(text: str) -> Optional[bytes]:
    """Return the MP3 bytes for ``text``, from the file cache or freshly generated.

    The cache file is looked up first. On a miss the text is POSTed to the
    ElevenLabs text-to-speech endpoint and the result is stored in the cache
    before it is returned.

    Args:
        text: Text to synthesise; must be non-empty and at most 5000 characters.

    Returns:
        The MP3 payload, or ``None`` when no API key is configured, the text
        is empty or too long, the API answers with a non-200 status or an
        empty body, or the request raises. A failed cache write does not
        turn a successful generation into ``None``.
    """
    api_key, voice_id, model_id = _config()
    if not api_key:
        return None

    if not text or len(text) > 5000:
        logger.warning("Tour-Audio: Text leer oder zu lang (%d chars)", len(text))
        return None

    cache_file = _cache_path(text, voice_id, model_id)
    try:
        cached = cache_file.read_bytes()
    except FileNotFoundError:
        cached = b""
    except OSError:
        # An unreadable cache entry is treated as a miss instead of failing.
        logger.warning("Tour-Audio: Cache-Datei %s nicht lesbar", cache_file.name)
        cached = b""
    # A zero-byte file (e.g. left by an interrupted write) is not valid
    # audio; fall through and regenerate instead of serving it.
    if cached:
        return cached

    url = ELEVENLABS_TTS_URL.format(voice_id=voice_id)
    payload = {
        "text": text,
        "model_id": model_id,
        "voice_settings": {
            # For a warm, clear narrator voice: high stability (= calm),
            # medium similarity_boost (= natural, not over-polished).
            "stability": 0.55,
            "similarity_boost": 0.7,
            "style": 0.0,
            "use_speaker_boost": True,
        },
    }
    headers = {
        "xi-api-key": api_key,
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }

    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client:
            response = await client.post(url, headers=headers, json=payload)
        if response.status_code != 200:
            logger.warning(
                "ElevenLabs-TTS Status %d für voice=%s: %s",
                response.status_code, voice_id, response.text[:200],
            )
            return None
        audio = response.content
    except Exception:
        # Boundary to an external service: log and report "no audio".
        logger.exception("ElevenLabs-TTS-Aufruf fehlgeschlagen")
        return None

    if not audio:
        logger.warning("ElevenLabs-TTS lieferte leere Antwort für voice=%s", voice_id)
        return None

    # Write to a per-process temp file and rename it into place, so that
    # concurrent readers never observe a partially written MP3.
    tmp_file = cache_file.with_name(f"{cache_file.name}.{os.getpid()}.tmp")
    try:
        tmp_file.write_bytes(audio)
        os.replace(tmp_file, cache_file)
    except OSError:
        # Caching is an optimisation; the generated audio is still returned.
        logger.exception("Tour-Audio: Cache-Schreiben fehlgeschlagen (%s)", cache_file.name)
        try:
            tmp_file.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup of the temp file
    else:
        logger.info(
            "Tour-Audio cached: %s (%d bytes, voice=%s)",
            cache_file.name, len(audio), voice_id,
        )
    return audio
|
||||||
Loading…
Reference in New Issue
Block a user