feat(quellen): Semantische Volltextsuche über alle Programme (#235)
GET /api/quellen/search?q=&filter=current|all&top_k=&bundesland=&partei=
nutzt text-embedding-v4 für wortunscharfe Suche (Endungen + Synonyme).

Filter:
- filter=current: nur Programme mit gueltig_bis IS NULL (Default)
- filter=all: auch historische Programme

Response liefert pro Treffer name, partei, bundesland, gueltig_ab/bis,
seite, gekürztes Snippet, similarity, plus pdf_url mit Direkt-Sprung ins
highlightete PDF (über /api/wahlprogramm-cite).

UI auf /quellen oben über der BL-Auflistung:
- Suchfeld + Submit
- Radio-Toggle "nur aktuelle Programme" / "auch historische"
- Treffer-Karten mit Partei-Badge, gültig-Pille (grün/grau), Seite +
  Relevanz-%, Snippet, Direktlink ins PDF
- Filter-Wechsel triggert automatischen Re-Run

Smoketest dev: "Klimaschutz" liefert 13 Treffer in aktuellen Programmen
mit korrekter Similarity-Sortierung; "Solidarität" mischt Wahl- und
Grundsatzprogramme.

Zugriff erfordert keinen Login (read-only).
This commit is contained in:
parent
18ea326e43
commit
27fd92c15f
78
app/main.py
78
app/main.py
@ -5,7 +5,8 @@ import uuid
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from fastapi import FastAPI, File, Form, UploadFile, Request, BackgroundTasks, HTTPException, Depends
|
from fastapi import FastAPI, File, Form, UploadFile, Request, BackgroundTasks, HTTPException, Depends, Query
|
||||||
|
import urllib.parse
|
||||||
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse, Response
|
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse, Response
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from starlette.middleware.base import BaseHTTPMiddleware
|
from starlette.middleware.base import BaseHTTPMiddleware
|
||||||
@ -2026,6 +2027,81 @@ async def quellen_page(request: Request, current_user: Optional[dict] = Depends(
|
|||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _quote_query_truncated(text: str, max_len: int = 160) -> str:
    """Return *text* URL-encoded (``quote_plus``) in at most *max_len* chars.

    Truncation happens on whole source characters, so the result never ends
    in a partial ``%XX`` escape or a split multi-byte UTF-8 sequence. Slicing
    the already-encoded string can produce both.
    """
    encoded_parts = []
    used = 0
    for ch in text:
        enc = urllib.parse.quote_plus(ch)
        if used + len(enc) > max_len:
            break
        encoded_parts.append(enc)
        used += len(enc)
    return "".join(encoded_parts)


@app.get("/api/quellen/search")
async def quellen_search(
    request: Request,
    q: str = Query(..., min_length=2, max_length=200, description="Suchbegriff"),
    # NOTE: ``filter`` shadows the builtin, but the name is the public
    # query-string parameter and therefore part of the API contract.
    filter: str = Query("current", regex="^(current|all)$"),
    bundesland: Optional[str] = None,
    partei: Optional[str] = None,
    top_k: int = Query(20, ge=1, le=50),
):
    """Semantic full-text search across all indexed programmes.

    Delegates to ``find_relevant_chunks`` (embedding-based, per the original
    description DashScope ``text-embedding-v4``), then enriches each chunk
    with the metadata of its programme.

    Query parameters:
        q: Search term, 2 to 200 characters.
        filter: ``current`` keeps only programmes whose ``gueltig_bis`` is
            ``None``; ``all`` also returns historic programmes.
        bundesland / partei: Optional narrowing, passed to the chunk lookup.
        top_k: Maximum number of hits returned (1 to 50).

    Returns:
        JSON ``{query, filter, n_results, results}``. Each result carries
        ``programm_id, name, partei, bundesland, typ, wp, seite, text,
        gueltig_ab, gueltig_bis, similarity, pdf_url``. If the chunk lookup
        raises, the response is HTTP 500 with ``{"error": ...}``.
    """
    from .embeddings import find_relevant_chunks
    from .programme import get_programm

    try:
        chunks = find_relevant_chunks(
            query=q,
            parteien=[partei] if partei else None,
            bundesland=bundesland,
            # Over-fetch: some chunks are dropped below (historic programmes,
            # unknown programme ids).
            top_k=top_k * 2,
            min_similarity=0.30,
        )
    except Exception:
        logger.exception("quellen_search: embedding failed")
        return JSONResponse({"error": "Suche fehlgeschlagen"}, status_code=500)

    # Encode the query once; it is identical for every hit.
    q_param = _quote_query_truncated(q)

    # Several chunks usually belong to the same programme, so look each one
    # up only once per request.
    programme_cache = {}

    results = []
    # Presumably ``chunks`` arrives sorted by descending similarity; the
    # order is preserved as delivered. TODO confirm in find_relevant_chunks.
    for c in chunks:
        pid = c.get("programm_id")
        if not pid:
            continue
        if pid not in programme_cache:
            programme_cache[pid] = get_programm(pid)
        prog = programme_cache[pid]
        if prog is None:
            continue
        if filter == "current" and prog.get("gueltig_bis") is not None:
            continue  # historic programme filtered out

        seite = c.get("seite") or 1
        text = (c.get("text") or "").strip()
        if len(text) > 320:
            text = text[:317].rstrip() + "…"

        pid_param = urllib.parse.quote_plus(str(pid))
        results.append({
            "programm_id": pid,
            "name": prog.get("name", pid),
            "partei": prog.get("partei"),
            "bundesland": prog.get("bundesland"),
            "typ": prog.get("typ"),
            "wp": prog.get("wp"),
            "seite": seite,
            "text": text,
            "gueltig_ab": prog.get("gueltig_ab"),
            "gueltig_bis": prog.get("gueltig_bis"),
            "similarity": round(float(c.get("similarity", 0.0)), 4),
            "pdf_url": (
                f"/api/wahlprogramm-cite?pid={pid_param}&seite={seite}"
                f"&q={q_param}#page={seite}"
            ),
        })
        if len(results) >= top_k:
            break

    return JSONResponse({
        "query": q,
        "filter": filter,
        "n_results": len(results),
        "results": results,
    })
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/wahlprogramm-cite")
|
@app.get("/api/wahlprogramm-cite")
|
||||||
async def wahlprogramm_cite(
|
async def wahlprogramm_cite(
|
||||||
request: Request,
|
request: Request,
|
||||||
|
|||||||
@ -70,6 +70,81 @@
|
|||||||
padding-bottom: 4px;
|
padding-bottom: 4px;
|
||||||
border-bottom: 2px solid var(--ecg-teal);
|
border-bottom: 2px solid var(--ecg-teal);
|
||||||
}
|
}
|
||||||
|
.search-box {
|
||||||
|
display: flex;
|
||||||
|
gap: 8px;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
align-items: center;
|
||||||
|
margin: 8px 0 4px;
|
||||||
|
}
|
||||||
|
.search-box input[type="text"] {
|
||||||
|
flex: 1 1 240px;
|
||||||
|
min-width: 200px;
|
||||||
|
padding: 8px 10px;
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 13px;
|
||||||
|
border: 1px solid var(--ecg-border);
|
||||||
|
border-radius: 4px;
|
||||||
|
background: var(--ecg-card-bg);
|
||||||
|
color: var(--ecg-dark);
|
||||||
|
}
|
||||||
|
.search-box button {
|
||||||
|
padding: 8px 16px;
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 12px;
|
||||||
|
background: var(--ecg-teal);
|
||||||
|
color: #fff;
|
||||||
|
border: none;
|
||||||
|
border-radius: 4px;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
.search-box button:disabled { opacity: 0.5; cursor: wait; }
|
||||||
|
.search-filter {
|
||||||
|
display: inline-flex;
|
||||||
|
gap: 14px;
|
||||||
|
font-size: 12px;
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
}
|
||||||
|
.search-filter label { cursor: pointer; }
|
||||||
|
.search-results { margin-top: 12px; }
|
||||||
|
.search-hit {
|
||||||
|
padding: 10px 12px;
|
||||||
|
margin-bottom: 8px;
|
||||||
|
background: var(--ecg-card-bg);
|
||||||
|
border: 1px solid var(--ecg-border);
|
||||||
|
border-radius: 4px;
|
||||||
|
border-left: 3px solid var(--ecg-teal);
|
||||||
|
}
|
||||||
|
.search-hit.historic { border-left-color: var(--ecg-dark); opacity: 0.85; }
|
||||||
|
.search-hit-meta {
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 11px;
|
||||||
|
opacity: 0.7;
|
||||||
|
margin-top: 2px;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
}
|
||||||
|
.search-hit-text {
|
||||||
|
font-size: 13px;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: var(--ecg-dark);
|
||||||
|
}
|
||||||
|
.search-hit-actions {
|
||||||
|
margin-top: 6px;
|
||||||
|
font-size: 11px;
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
}
|
||||||
|
.search-hit-actions a { color: var(--ecg-teal); margin-right: 12px; }
|
||||||
|
.search-status { font-size: 12px; opacity: 0.7; margin-top: 8px; font-family: var(--font-mono); }
|
||||||
|
.gueltig-pill {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 1px 6px;
|
||||||
|
font-size: 10px;
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
border-radius: 3px;
|
||||||
|
margin-left: 4px;
|
||||||
|
}
|
||||||
|
.gueltig-pill.aktuell { background: var(--ecg-green); color: #fff; }
|
||||||
|
.gueltig-pill.historisch { background: var(--ecg-dark); color: #fff; opacity: 0.7; }
|
||||||
</style>
|
</style>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
||||||
@ -113,6 +188,27 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Volltextsuche (semantisch) -->
|
||||||
|
<div class="v2-kasten outline-blue" style="margin-bottom:1.5rem;">
|
||||||
|
<h4>Volltextsuche durch alle Programme</h4>
|
||||||
|
<p style="font-size:12px;opacity:0.75;margin:4px 0 8px;">
|
||||||
|
Semantische Suche über alle indizierten Wahl- und Grundsatzprogramme.
|
||||||
|
Wortunscharf — Endungen sind egal, verwandte Begriffe werden ebenfalls
|
||||||
|
gefunden.
|
||||||
|
</p>
|
||||||
|
<form id="quellen-search-form" class="search-box" onsubmit="return runSearch(event)">
|
||||||
|
<input type="text" id="quellen-q" name="q" placeholder="z.B. Klimaschutz, soziale Gerechtigkeit, Mietpreisbremse"
|
||||||
|
autocomplete="off" minlength="2" maxlength="200">
|
||||||
|
<button type="submit" id="quellen-q-btn">Suchen</button>
|
||||||
|
</form>
|
||||||
|
<div class="search-filter" style="margin-top:6px;">
|
||||||
|
<label><input type="radio" name="qfilter" value="current" checked> nur aktuelle Programme</label>
|
||||||
|
<label><input type="radio" name="qfilter" value="all"> auch historische</label>
|
||||||
|
</div>
|
||||||
|
<div id="quellen-search-status" class="search-status"></div>
|
||||||
|
<div id="quellen-search-results" class="search-results"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Wahlprogramme nach BL -->
|
<!-- Wahlprogramme nach BL -->
|
||||||
{% for bl_name, bl_progs in wahlprogramme_grouped %}
|
{% for bl_name, bl_progs in wahlprogramme_grouped %}
|
||||||
<h2 class="section-h2">{{ bl_name }}</h2>
|
<h2 class="section-h2">{{ bl_name }}</h2>
|
||||||
@ -215,5 +311,83 @@ async function indexAll() {
|
|||||||
statusEl.textContent = 'Fehler: ' + e.message;
|
statusEl.textContent = 'Fehler: ' + e.message;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Escape a value for safe insertion into HTML text or a quoted attribute.
 *
 * null and undefined become the empty string; every other value is
 * stringified first. The ampersand is replaced first so that the entities
 * produced by the later replacements are not escaped a second time.
 */
function escHtml(s) {
    return String(s == null ? '' : s)
        .replace(/&/g, '&amp;').replace(/</g, '&lt;')
        .replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&#39;');
}
|
||||||
|
|
||||||
|
// Sequence number of the most recently started search. A filter toggle can
// start a new request while an older one is still in flight; only the newest
// request may touch the DOM.
let quellenSearchSeq = 0;

/**
 * Run the semantic search against /api/quellen/search and render the hits.
 *
 * Reads the query from #quellen-q and the filter from the checked
 * input[name="qfilter"] radio, writes progress and errors to
 * #quellen-search-status and the hit cards to #quellen-search-results.
 *
 * @param {Event} [ev] Submit event; its default action is prevented when
 *     given. The function is also called without arguments.
 * @returns {Promise<boolean>} Always resolves to false.
 */
async function runSearch(ev) {
    if (ev) ev.preventDefault();
    const q = document.getElementById('quellen-q').value.trim();
    const filter = (document.querySelector('input[name="qfilter"]:checked') || {}).value || 'current';
    const statusEl = document.getElementById('quellen-search-status');
    const resultsEl = document.getElementById('quellen-search-results');
    const btn = document.getElementById('quellen-q-btn');
    if (q.length < 2) {
        statusEl.textContent = 'Bitte mindestens 2 Zeichen eingeben.';
        return false;
    }

    const seq = ++quellenSearchSeq;
    const isStale = () => seq !== quellenSearchSeq;

    btn.disabled = true;
    statusEl.textContent = 'Suche läuft …';
    resultsEl.innerHTML = '';
    try {
        const params = new URLSearchParams({ q: q, filter: filter, top_k: '20' });
        const resp = await fetch('/api/quellen/search?' + params.toString());
        if (isStale()) return false;
        if (!resp.ok) {
            statusEl.textContent = 'Fehler: ' + resp.status + ' ' + resp.statusText;
            return false;
        }
        const data = await resp.json();
        if (isStale()) return false;
        if (!data.results || data.results.length === 0) {
            statusEl.textContent = 'Keine Treffer.';
            return false;
        }
        statusEl.textContent = data.n_results + ' Treffer'
            + (filter === 'current' ? ' in aktuellen Programmen' : ' (alle Programme)') + '.';
        const html = data.results.map(r => {
            // Loose comparison: a missing field counts as "current" too.
            const aktuell = r.gueltig_bis == null;
            const cls = aktuell ? '' : ' historic';
            const pill = aktuell
                ? '<span class="gueltig-pill aktuell">aktuell</span>'
                : '<span class="gueltig-pill historisch">' + escHtml(r.gueltig_ab) + ' bis ' + escHtml(r.gueltig_bis) + '</span>';
            // Fall back to 0 so a non-numeric similarity never renders "NaN%".
            const sim = ((Number(r.similarity) || 0) * 100).toFixed(0) + '%';
            const partei = escHtml(r.partei || '');
            const bl = escHtml(r.bundesland || '');
            const wp = r.wp ? ' · WP' + escHtml(r.wp) : '';
            const seite = escHtml(r.seite);
            return '<div class="search-hit' + cls + '">'
                + '<div style="font-family:var(--font-display);font-size:13px;">'
                + '<span class="prog-badge badge-' + partei.toLowerCase().replace(/ü/g,'ue').replace(/ä/g,'ae').replace(/ö/g,'oe') + '">' + partei + '</span> '
                + escHtml(r.name) + ' ' + pill
                + '</div>'
                + '<div class="search-hit-meta">'
                + 'Seite ' + seite + ' · ' + bl + wp + ' · Relevanz ' + sim
                + '</div>'
                + '<div class="search-hit-text">' + escHtml(r.text) + '</div>'
                + '<div class="search-hit-actions">'
                + '<a href="' + escHtml(r.pdf_url) + '" target="_blank" rel="noopener">→ Stelle im PDF anzeigen</a>'
                + '</div>'
                + '</div>';
        }).join('');
        resultsEl.innerHTML = html;
    } catch (e) {
        if (!isStale()) statusEl.textContent = 'Fehler: ' + e.message;
    } finally {
        // A superseded request must not re-enable the button while the
        // newer request is still running.
        if (!isStale()) btn.disabled = false;
    }
    return false;
}
|
||||||
|
|
||||||
|
// Re-run the search when the filter radio changes, provided the search box
// already holds a query of at least two characters.
document.addEventListener('DOMContentLoaded', function () {
    function rerunIfQueryPresent() {
        const inputEl = document.getElementById('quellen-q') || {};
        const currentQuery = inputEl.value || '';
        if (currentQuery.trim().length >= 2) {
            runSearch();
        }
    }

    const radios = document.querySelectorAll('input[name="qfilter"]');
    for (const radio of radios) {
        radio.addEventListener('change', rerunIfQueryPresent);
    }
});
|
||||||
</script>
|
</script>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user