feat(quellen): Semantische Volltextsuche über alle Programme (#235)
GET /api/quellen/search?q=&filter=current|all&top_k=&bundesland=&partei=
nutzt text-embedding-v4 für wortunscharfe Suche (Endungen + Synonyme).

Filter:
- filter=current: nur Programme mit gueltig_bis IS NULL (Default)
- filter=all: auch historische Programme

Response liefert pro Treffer name, partei, bundesland, gueltig_ab/bis,
seite, gekürztes Snippet, similarity, plus pdf_url mit Direkt-Sprung ins
hervorgehobene PDF (über /api/wahlprogramm-cite).

UI auf /quellen oben über der BL-Auflistung:
- Suchfeld + Submit
- Radio-Toggle "nur aktuelle Programme" / "auch historische"
- Treffer-Karten mit Partei-Badge, gültig-Pille (grün/grau), Seite + Relevanz-%, Snippet, Direktlink ins PDF
- Filter-Wechsel triggert automatischen Re-Run

Smoketest dev: "Klimaschutz" liefert 13 Treffer in aktuellen Programmen mit
korrekter Similarity-Sortierung; "Solidarität" mischt Wahl- und
Grundsatzprogramme.

Zugriff erfordert keinen Login (read-only).
This commit is contained in:
parent
18ea326e43
commit
27fd92c15f
78
app/main.py
78
app/main.py
@ -5,7 +5,8 @@ import uuid
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, File, Form, UploadFile, Request, BackgroundTasks, HTTPException, Depends
|
||||
from fastapi import FastAPI, File, Form, UploadFile, Request, BackgroundTasks, HTTPException, Depends, Query
|
||||
import urllib.parse
|
||||
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse, Response
|
||||
from pydantic import BaseModel
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
@ -2026,6 +2027,81 @@ async def quellen_page(request: Request, current_user: Optional[dict] = Depends(
|
||||
})
|
||||
|
||||
|
||||
def _quote_query_limited(value: str, max_len: int = 160) -> str:
    """Percent-encode *value* for a query string, capped at *max_len* chars.

    Slicing an already-encoded string can cut a ``%XX`` escape (or a
    multi-byte UTF-8 sequence such as ``%C3%A4``) in half and yield a
    malformed URL. This helper instead drops whole input characters from
    the end until the encoded form fits.
    """
    encoded = urllib.parse.quote_plus(value)
    if len(encoded) <= max_len:
        return encoded

    pieces = []
    used = 0
    for ch in value:
        piece = urllib.parse.quote_plus(ch)
        if used + len(piece) > max_len:
            break
        pieces.append(piece)
        used += len(piece)
    return "".join(pieces)


@app.get("/api/quellen/search")
async def quellen_search(
    request: Request,
    q: str = Query(..., min_length=2, max_length=200, description="Suchbegriff"),
    filter: str = Query("current", regex="^(current|all)$"),
    bundesland: Optional[str] = None,
    partei: Optional[str] = None,
    top_k: int = Query(20, ge=1, le=50),
):
    """Semantic full-text search across all indexed programmes.

    Uses the DashScope embeddings (text-embedding-v4): the match is fuzzy
    and also finds synonymous / related terms. Filters:

    - ``filter=current``: only current programmes (``gueltig_bis`` is None)
    - ``filter=all``: historic programmes as well
    - ``bundesland`` / ``partei``: optional narrowing

    Returns:
        JSON with ``query``, ``filter``, ``n_results`` and ``results``; each
        hit carries ``programm_id``, ``name``, ``partei``, ``bundesland``,
        ``typ``, ``wp``, ``seite``, ``text`` (snippet, at most 318 chars),
        ``gueltig_ab``, ``gueltig_bis``, ``similarity`` and ``pdf_url``.
        If the chunk lookup raises, a 500 response with an ``error`` key is
        returned instead.
    """
    from .embeddings import find_relevant_chunks
    from .programme import get_programm

    try:
        chunks = find_relevant_chunks(
            query=q,
            parteien=[partei] if partei else None,
            bundesland=bundesland,
            # Over-fetch: historic programmes are filtered out afterwards.
            top_k=top_k * 2,
            min_similarity=0.30,
        )
    except Exception:
        logger.exception("quellen_search: embedding failed")
        return JSONResponse({"error": "Suche fehlgeschlagen"}, status_code=500)

    # The query part of the PDF link is identical for every hit.
    quoted_q = _quote_query_limited(q, 160)

    results = []
    for c in chunks:
        pid = c.get("programm_id")
        prog = get_programm(pid) if pid else None
        if prog is None:
            continue
        if filter == "current" and prog.get("gueltig_bis") is not None:
            continue  # historic programme filtered out

        seite = c.get("seite") or 1
        text = (c.get("text") or "").strip()
        if len(text) > 320:
            text = text[:317].rstrip() + "…"

        results.append({
            "programm_id": pid,
            "name": prog.get("name", pid),
            "partei": prog.get("partei"),
            "bundesland": prog.get("bundesland"),
            "typ": prog.get("typ"),
            "wp": prog.get("wp"),
            "seite": seite,
            "text": text,
            "gueltig_ab": prog.get("gueltig_ab"),
            "gueltig_bis": prog.get("gueltig_bis"),
            # ``or 0.0`` also covers an explicit None, which float() rejects.
            "similarity": round(float(c.get("similarity") or 0.0), 4),
            "pdf_url": (
                f"/api/wahlprogramm-cite?pid={urllib.parse.quote_plus(str(pid))}"
                f"&seite={seite}&q={quoted_q}#page={seite}"
            ),
        })
        if len(results) >= top_k:
            break

    return JSONResponse({
        "query": q,
        "filter": filter,
        "n_results": len(results),
        "results": results,
    })
|
||||
|
||||
|
||||
@app.get("/api/wahlprogramm-cite")
|
||||
async def wahlprogramm_cite(
|
||||
request: Request,
|
||||
|
||||
@ -70,6 +70,81 @@
|
||||
padding-bottom: 4px;
|
||||
border-bottom: 2px solid var(--ecg-teal);
|
||||
}
|
||||
.search-box {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
flex-wrap: wrap;
|
||||
align-items: center;
|
||||
margin: 8px 0 4px;
|
||||
}
|
||||
.search-box input[type="text"] {
|
||||
flex: 1 1 240px;
|
||||
min-width: 200px;
|
||||
padding: 8px 10px;
|
||||
font-family: var(--font-mono);
|
||||
font-size: 13px;
|
||||
border: 1px solid var(--ecg-border);
|
||||
border-radius: 4px;
|
||||
background: var(--ecg-card-bg);
|
||||
color: var(--ecg-dark);
|
||||
}
|
||||
.search-box button {
|
||||
padding: 8px 16px;
|
||||
font-family: var(--font-mono);
|
||||
font-size: 12px;
|
||||
background: var(--ecg-teal);
|
||||
color: #fff;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
}
|
||||
.search-box button:disabled { opacity: 0.5; cursor: wait; }
|
||||
.search-filter {
|
||||
display: inline-flex;
|
||||
gap: 14px;
|
||||
font-size: 12px;
|
||||
font-family: var(--font-mono);
|
||||
}
|
||||
.search-filter label { cursor: pointer; }
|
||||
.search-results { margin-top: 12px; }
|
||||
.search-hit {
|
||||
padding: 10px 12px;
|
||||
margin-bottom: 8px;
|
||||
background: var(--ecg-card-bg);
|
||||
border: 1px solid var(--ecg-border);
|
||||
border-radius: 4px;
|
||||
border-left: 3px solid var(--ecg-teal);
|
||||
}
|
||||
.search-hit.historic { border-left-color: var(--ecg-dark); opacity: 0.85; }
|
||||
.search-hit-meta {
|
||||
font-family: var(--font-mono);
|
||||
font-size: 11px;
|
||||
opacity: 0.7;
|
||||
margin-top: 2px;
|
||||
margin-bottom: 6px;
|
||||
}
|
||||
.search-hit-text {
|
||||
font-size: 13px;
|
||||
line-height: 1.5;
|
||||
color: var(--ecg-dark);
|
||||
}
|
||||
.search-hit-actions {
|
||||
margin-top: 6px;
|
||||
font-size: 11px;
|
||||
font-family: var(--font-mono);
|
||||
}
|
||||
.search-hit-actions a { color: var(--ecg-teal); margin-right: 12px; }
|
||||
.search-status { font-size: 12px; opacity: 0.7; margin-top: 8px; font-family: var(--font-mono); }
|
||||
.gueltig-pill {
|
||||
display: inline-block;
|
||||
padding: 1px 6px;
|
||||
font-size: 10px;
|
||||
font-family: var(--font-mono);
|
||||
border-radius: 3px;
|
||||
margin-left: 4px;
|
||||
}
|
||||
.gueltig-pill.aktuell { background: var(--ecg-green); color: #fff; }
|
||||
.gueltig-pill.historisch { background: var(--ecg-dark); color: #fff; opacity: 0.7; }
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
@ -113,6 +188,27 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Volltextsuche (semantisch) -->
|
||||
<div class="v2-kasten outline-blue" style="margin-bottom:1.5rem;">
|
||||
<h4>Volltextsuche durch alle Programme</h4>
|
||||
<p style="font-size:12px;opacity:0.75;margin:4px 0 8px;">
|
||||
Semantische Suche über alle indizierten Wahl- und Grundsatzprogramme.
|
||||
Wortunscharf — Endungen sind egal, verwandte Begriffe werden ebenfalls
|
||||
gefunden.
|
||||
</p>
|
||||
<form id="quellen-search-form" class="search-box" onsubmit="return runSearch(event)">
|
||||
<input type="text" id="quellen-q" name="q" placeholder="z.B. Klimaschutz, soziale Gerechtigkeit, Mietpreisbremse"
|
||||
autocomplete="off" minlength="2" maxlength="200">
|
||||
<button type="submit" id="quellen-q-btn">Suchen</button>
|
||||
</form>
|
||||
<div class="search-filter" style="margin-top:6px;">
|
||||
<label><input type="radio" name="qfilter" value="current" checked> nur aktuelle Programme</label>
|
||||
<label><input type="radio" name="qfilter" value="all"> auch historische</label>
|
||||
</div>
|
||||
<div id="quellen-search-status" class="search-status"></div>
|
||||
<div id="quellen-search-results" class="search-results"></div>
|
||||
</div>
|
||||
|
||||
<!-- Wahlprogramme nach BL -->
|
||||
{% for bl_name, bl_progs in wahlprogramme_grouped %}
|
||||
<h2 class="section-h2">{{ bl_name }}</h2>
|
||||
@ -215,5 +311,83 @@ async function indexAll() {
|
||||
statusEl.textContent = 'Fehler: ' + e.message;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Escape a value for safe insertion into HTML text or a quoted attribute.
 *
 * null/undefined become the empty string; anything else is stringified.
 * The ampersand is replaced first so the entities produced by the later
 * replacements are not escaped a second time.
 */
function escHtml(s) {
    return String(s == null ? '' : s)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}
|
||||
|
||||
/**
 * Run the programme search for the current form state and render the hits.
 *
 * Reads the query from #quellen-q and the filter from the checked "qfilter"
 * radio, calls /api/quellen/search and writes status and result cards into
 * #quellen-search-status and #quellen-search-results.
 *
 * Because a filter change can start a new search while an older request is
 * still in flight, every call takes a sequence number; only the most recent
 * call is allowed to touch the UI.
 *
 * @param {Event} [ev] submit event; its default action is prevented.
 * @returns {Promise<boolean>} always resolves to false.
 */
async function runSearch(ev) {
    if (ev) ev.preventDefault();
    const q = document.getElementById('quellen-q').value.trim();
    const filter = (document.querySelector('input[name="qfilter"]:checked') || {}).value || 'current';
    const statusEl = document.getElementById('quellen-search-status');
    const resultsEl = document.getElementById('quellen-search-results');
    const btn = document.getElementById('quellen-q-btn');
    if (q.length < 2) {
        statusEl.textContent = 'Bitte mindestens 2 Zeichen eingeben.';
        return false;
    }

    // Tag this request so a slower, older response cannot overwrite a newer one.
    const seq = (runSearch._seq = (runSearch._seq || 0) + 1);
    const isStale = () => seq !== runSearch._seq;

    btn.disabled = true;
    statusEl.textContent = 'Suche läuft …';
    resultsEl.innerHTML = '';
    try {
        const params = new URLSearchParams({ q: q, filter: filter, top_k: '20' });
        const resp = await fetch('/api/quellen/search?' + params.toString());
        if (isStale()) return false;
        if (!resp.ok) {
            statusEl.textContent = 'Fehler: ' + resp.status + ' ' + resp.statusText;
            return false;
        }
        const data = await resp.json();
        if (isStale()) return false;
        if (!data.results || data.results.length === 0) {
            statusEl.textContent = 'Keine Treffer.';
            return false;
        }
        statusEl.textContent = data.n_results + ' Treffer'
            + (filter === 'current' ? ' in aktuellen Programmen' : ' (alle Programme)') + '.';
        const html = data.results.map(r => {
            // A missing end date (null or absent) marks a current programme.
            const aktuell = r.gueltig_bis == null;
            const cls = aktuell ? '' : ' historic';
            const pill = aktuell
                ? '<span class="gueltig-pill aktuell">aktuell</span>'
                : '<span class="gueltig-pill historisch">' + escHtml(r.gueltig_ab) + ' bis ' + escHtml(r.gueltig_bis) + '</span>';
            const sim = (r.similarity * 100).toFixed(0) + '%';
            const partei = escHtml(r.partei || '');
            const bl = escHtml(r.bundesland || '');
            const wp = r.wp ? ' · WP' + escHtml(r.wp) : '';
            const seite = escHtml(r.seite);
            // Badge class: lower-cased party name with umlauts transliterated.
            const badge = partei.toLowerCase().replace(/ü/g, 'ue').replace(/ä/g, 'ae').replace(/ö/g, 'oe');
            return '<div class="search-hit' + cls + '">'
                + '<div style="font-family:var(--font-display);font-size:13px;">'
                + '<span class="prog-badge badge-' + badge + '">' + partei + '</span> '
                + escHtml(r.name) + ' ' + pill
                + '</div>'
                + '<div class="search-hit-meta">'
                + 'Seite ' + seite + ' · ' + bl + wp + ' · Relevanz ' + sim
                + '</div>'
                + '<div class="search-hit-text">' + escHtml(r.text) + '</div>'
                + '<div class="search-hit-actions">'
                + '<a href="' + escHtml(r.pdf_url) + '" target="_blank" rel="noopener">→ Stelle im PDF anzeigen</a>'
                + '</div>'
                + '</div>';
        }).join('');
        resultsEl.innerHTML = html;
    } catch (e) {
        if (!isStale()) statusEl.textContent = 'Fehler: ' + e.message;
    } finally {
        // Only the latest request re-enables the button; a newer one is still running otherwise.
        if (!isStale()) btn.disabled = false;
    }
    return false;
}
|
||||
|
||||
// Re-run the search when the filter changes, provided a query is already entered.
document.addEventListener('DOMContentLoaded', function () {
    // Looks the input up at change time and re-runs only for queries of 2+ characters.
    function rerunIfQueryPresent() {
        const input = document.getElementById('quellen-q');
        const query = (input && input.value) || '';
        if (query.trim().length >= 2) runSearch();
    }

    for (const radio of document.querySelectorAll('input[name="qfilter"]')) {
        radio.addEventListener('change', rerunIfQueryPresent);
    }
});
|
||||
</script>
|
||||
{% endblock %}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user