gwoe-antragspruefer/app/main.py

"""GWÖ-Antragsprüfer — FastAPI Webapp."""

import logging
import uuid
from pathlib import Path
from typing import Optional

from fastapi import FastAPI, File, Form, UploadFile, Request, BackgroundTasks, HTTPException, Depends
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse, Response
from starlette.middleware.base import BaseHTTPMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded

from .validators import (
    MAX_SEARCH_QUERY_LEN,
    validate_drucksache,
    validate_search_query,
)

# Structured logging for the whole app. uvicorn only registers its own
# root handler at startup; we set a neutral format for our modules early
# so that logger.exception() also works during boot.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

from .config import settings
from .database import (
    init_db, get_job, create_job, update_job,
    get_all_assessments, get_assessment, delete_assessment,
    upsert_assessment, import_json_assessments,
    search_assessments,
)
from .parlamente import get_adapter, ADAPTERS
from .bundeslaender import alle_bundeslaender
from .analyzer import analyze_antrag
from .auth import get_current_user, require_auth, keycloak_login_url, _is_auth_enabled


def _pick_best_title(llm_title: str, doc_title: Optional[str], drucksache: str) -> str:
    """Wähle den besten Titel aus LLM-Output und Adapter-Metadata.

    Priorität:
    1. doc_title, wenn ein echter Titel (nicht "Drucksache XX")
    2. llm_title, wenn nicht leer und nicht generisch
    3. Generischer Fallback "Drucksache XX"
    """
    generic_prefix = f"Drucksache {drucksache.split('/')[0]}"
    # doc_title gut? (nicht generisch, nicht leer)
    if doc_title and not doc_title.startswith("Drucksache ") and len(doc_title) > 5:
        return doc_title
    # LLM-Titel gut? (nicht generisch)
    if llm_title and not llm_title.startswith("Drucksache ") and len(llm_title) > 5:
        return llm_title
    # doc_title als Fallback (auch wenn generisch)
    return doc_title or llm_title or f"Drucksache {drucksache}"
from .report import generate_html_report, generate_pdf_report
from .embeddings import (
    init_embeddings_db, get_programme_info, get_indexing_status,
    index_programm, render_highlighted_page, PROGRAMME,
)

# The interactive docs and the OpenAPI schema are switched off here
# unconditionally (all three URLs are None) — there is no environment check.
app = FastAPI(
    title=settings.app_name,
    version=settings.app_version,
    docs_url=None,      # no Swagger UI at /docs
    redoc_url=None,     # no ReDoc at /redoc
    openapi_url=None,   # no schema at /openapi.json
)


# Rate limiter — guards the expensive POST endpoints (LLM calls + indexing)
# against resource exhaustion. Issue #57, finding #1 (HIGH). Uses the default
# in-memory storage; multiple workers would require switching to Redis, but
# as long as we run in a single container the default storage is sufficient.
# default_limits=[] means only endpoints decorated with @limiter.limit are limited.
limiter = Limiter(key_func=get_remote_address, default_limits=[])
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)


# Security headers middleware
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Stamp a fixed set of security-related headers on every response."""

    async def dispatch(self, request: Request, call_next):
        """Run the downstream handler, then add the headers to its response."""
        # CSP: same-origin by default, inline styles and scripts allowed,
        # images from self or data: URIs, and no framing by other pages.
        content_security_policy = (
            "default-src 'self'; "
            "style-src 'self' 'unsafe-inline'; "
            "script-src 'self' 'unsafe-inline'; "
            "img-src 'self' data:; "
            "frame-ancestors 'none';"
        )
        security_headers = (
            ("X-Content-Type-Options", "nosniff"),
            ("X-Frame-Options", "DENY"),
            ("X-XSS-Protection", "1; mode=block"),
            ("Referrer-Policy", "strict-origin-when-cross-origin"),
            ("Permissions-Policy", "geolocation=(), microphone=(), camera=()"),
            ("Content-Security-Policy", content_security_policy),
        )

        response = await call_next(request)
        for header_name, header_value in security_headers:
            response.headers[header_name] = header_value
        return response


app.add_middleware(SecurityHeadersMiddleware)


# Set up the data and report directories. mkdir is called without
# parents=True, so the parent directories must already exist.
settings.data_dir.mkdir(exist_ok=True)
settings.reports_dir.mkdir(exist_ok=True)

# Static files and templates live next to this module. The directories are
# created if missing before they are handed to StaticFiles / Jinja2Templates.
static_dir = Path(__file__).parent / "static"
templates_dir = Path(__file__).parent / "templates"
static_dir.mkdir(exist_ok=True)
templates_dir.mkdir(exist_ok=True)

app.mount("/static", StaticFiles(directory=static_dir), name="static")
templates = Jinja2Templates(directory=str(templates_dir))


@app.on_event("startup")
async def startup():
    """Initialise both databases and start the job-queue worker."""
    await init_db()
    init_embeddings_db()
    # Start the job-queue worker (#95). re_enqueue_pending() is awaited before
    # the worker starts — presumably it restores jobs left over from a
    # previous run; verify against .queue.
    from .queue import start_worker, re_enqueue_pending
    await re_enqueue_pending()
    start_worker()
    # JSON import disabled - all assessments now live in SQLite DB only
    # Legacy import would overwrite new v5 assessments with old format
    # count = await import_json_assessments(settings.data_dir / "assessments")
    # if count > 0:
    #     print(f"Imported {count} assessments from JSON files")


@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Render the landing page with the upload form."""
    # The synthetic "ALL" (nationwide) entry comes first, followed by the real
    # Bundesländer from the configuration. "ALL" is purely a frontend/API
    # convention and has no entry in bundeslaender.py.
    bundesweit = {"code": "ALL", "name": "🌍 Bundesweit", "active": True}
    laender = [
        {"code": land.code, "name": land.name, "active": land.aktiv}
        for land in alle_bundeslaender()
    ]

    # code → parlament_name, so the frontend can show the parliament name for
    # each Antrag without an extra backend call.
    parlament_names = {}
    for land in alle_bundeslaender():
        parlament_names[land.code] = land.parlament_name

    context = {
        "request": request,
        "app_name": settings.app_name,
        "bundeslaender": [bundesweit, *laender],
        "parlament_names": parlament_names,
    }
    return templates.TemplateResponse("index.html", context)


@app.post("/analyze")
@limiter.limit("10/minute")
async def start_analysis(
    request: Request,
    background_tasks: BackgroundTasks,
    text: Optional[str] = Form(None),
    file: Optional[UploadFile] = File(None),
    bundesland: str = Form("NRW"),
    model: str = Form("qwen-plus"),
    user: dict = Depends(require_auth),
):
    """Create an analysis job from pasted text or an uploaded PDF.

    If a PDF with a filename is uploaded, its extracted text replaces any
    ``text`` form value. The job is created in the database and the analysis
    is scheduled as a background task.

    Returns:
        JSON with ``job_id`` and ``status`` ("queued").

    Raises:
        HTTPException 400: neither text nor a usable PDF was supplied, the
            PDF could not be parsed, or no text could be extracted from it.
    """
    # An UploadFile without a filename is an empty file field, not an upload.
    has_upload = bool(file and file.filename)
    if not text and not has_upload:
        raise HTTPException(status_code=400, detail="Entweder Text oder PDF-Datei erforderlich")

    # Extract text from PDF if uploaded
    if has_upload:
        import fitz  # PyMuPDF
        pdf_bytes = await file.read()
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                text = "".join(page.get_text() for page in doc)
        except Exception as exc:
            logger.warning("Uploaded PDF could not be parsed: %s", exc)
            raise HTTPException(
                status_code=400,
                detail="PDF-Datei konnte nicht gelesen werden",
            ) from exc

    # E.g. a scanned PDF without a text layer: nothing to analyse.
    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="Kein Text zum Analysieren gefunden")

    # Create job (the first 500 characters are stored with it)
    job_id = str(uuid.uuid4())
    await create_job(job_id, text[:500], bundesland, model)

    # Start background analysis
    background_tasks.add_task(run_analysis, job_id, text, bundesland, model)

    return JSONResponse({"job_id": job_id, "status": "queued"})


async def run_analysis(job_id: str, text: str, bundesland: str, model: str):
    """Background task: analyse ``text`` and render the HTML and PDF reports.

    Sets the job to "processing", runs the LLM analysis, writes
    ``<job_id>.html`` and ``<job_id>.pdf`` into ``settings.reports_dir`` and
    marks the job "completed" with the serialized result and both paths.
    Any exception is logged with its traceback and the job is marked
    "failed" with the exception text; nothing is re-raised.
    """
    try:
        await update_job(job_id, status="processing")

        # Run LLM analysis
        assessment = await analyze_antrag(text, bundesland, model)

        # Generate reports
        html_path = settings.reports_dir / f"{job_id}.html"
        pdf_path = settings.reports_dir / f"{job_id}.pdf"

        await generate_html_report(assessment, html_path, bundesland=bundesland)
        await generate_pdf_report(assessment, pdf_path, bundesland=bundesland)

        await update_job(
            job_id,
            status="completed",
            result=assessment.model_dump_json(),
            html_path=str(html_path),
            pdf_path=str(pdf_path),
        )
    except Exception as e:
        # Log the full traceback (as run_drucksache_analysis does); otherwise
        # the only trace of the failure is the short message stored on the job.
        logger.exception("run_analysis failed for job_id=%s", job_id)
        await update_job(job_id, status="failed", error=str(e))


@app.get("/status/{job_id}")
async def get_status(job_id: str):
    """Return the status and creation time of a job, or 404 if it is unknown."""
    job = await get_job(job_id)
    if job:
        payload = {
            "job_id": job_id,
            "status": job["status"],
            "created_at": job["created_at"],
        }
        return JSONResponse(payload)
    raise HTTPException(status_code=404, detail="Job nicht gefunden")


@app.get("/result/{job_id}", response_class=HTMLResponse)
async def get_result(request: Request, job_id: str):
    """Serve the stored HTML report of a completed job.

    Responds 404 for an unknown job, 400 while the job is not completed, and
    500 when the report file recorded on the job no longer exists.
    """
    job = await get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job nicht gefunden")

    finished = job["status"] == "completed"
    if not finished:
        raise HTTPException(status_code=400, detail=f"Job noch nicht fertig: {job['status']}")

    html_path = Path(job["html_path"])
    if not html_path.exists():
        raise HTTPException(status_code=500, detail="Report nicht gefunden")

    return HTMLResponse(html_path.read_text())


@app.get("/result/{job_id}/pdf")
async def get_pdf(job_id: str):
    """Send the PDF report of a completed job as a download.

    Responds 404 for an unknown job, 400 while the job is not completed, and
    500 when the PDF recorded on the job no longer exists.
    """
    job = await get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job nicht gefunden")

    finished = job["status"] == "completed"
    if not finished:
        raise HTTPException(status_code=400, detail=f"Job noch nicht fertig: {job['status']}")

    pdf_path = Path(job["pdf_path"])
    if not pdf_path.exists():
        raise HTTPException(status_code=500, detail="PDF nicht gefunden")

    # The download name carries only the first eight characters of the job id.
    download_name = f"gwoe-bericht-{job_id[:8]}.pdf"
    return FileResponse(
        pdf_path,
        media_type="application/pdf",
        filename=download_name,
    )


# ─── Queue-Status (#95) ─────────────────────────────────────────────────────

@app.get("/api/queue/status")
async def queue_status():
    """Current queue state: waiting jobs and estimated wait time.

    The payload is whatever ``get_queue_status()`` from ``.queue`` returns.
    """
    # Imported inside the handler rather than at module import time.
    from .queue import get_queue_status
    return get_queue_status()


# ─── Auth-Endpoints (#43) ───────────────────────────────────────────────────

@app.get("/api/auth/me")
async def auth_me(user=Depends(get_current_user)):
    """Return the user info, or ``{"authenticated": False}`` when not logged in.

    The frontend calls this endpoint on load to decide whether the
    "Bewerten" action is enabled or greyed out.
    """
    if not user:
        return {"authenticated": False}
    # The user's own fields are merged in after the flag.
    return {"authenticated": True, **user}


@app.get("/api/auth/callback")
async def auth_callback(request: Request, code: str = "", state: str = ""):
    """OIDC authorization code → access token exchange.

    Keycloak redirects here after login with a ``?code=...`` parameter. The
    code is exchanged for an access token, which is set as a cookie.

    NOTE(review): ``state`` is accepted but never checked in this handler.

    Returns:
        A redirect to "/" when auth is disabled or no code was supplied;
        otherwise a small HTML page that sets the cookie and forwards to "/".

    Raises:
        HTTPException 502: the token endpoint could not be reached.
        HTTPException 401: the token endpoint rejected the code or returned
            no usable access token.
    """
    if not _is_auth_enabled() or not code:
        from fastapi.responses import RedirectResponse
        return RedirectResponse("/")

    from .auth import _keycloak_issuer
    token_url = f"{_keycloak_issuer()}/protocol/openid-connect/token"

    # Construct the same redirect_uri used for the auth request
    base = str(request.base_url).rstrip("/").replace("http://", "https://")
    redirect_uri = f"{base}/api/auth/callback"

    import httpx as _httpx
    try:
        async with _httpx.AsyncClient(timeout=10) as client:
            resp = await client.post(token_url, data={
                "grant_type": "authorization_code",
                "client_id": settings.keycloak_client_id,
                "code": code,
                "redirect_uri": redirect_uri,
            })
    except _httpx.HTTPError as exc:
        # Timeout / connection failure: report it as a gateway error instead
        # of letting it surface as an unhandled 500.
        logger.error("Token exchange request failed: %s", exc)
        raise HTTPException(status_code=502, detail="Login fehlgeschlagen") from exc

    if resp.status_code != 200:
        logger.error("Token exchange failed: %s %s", resp.status_code, resp.text[:200])
        raise HTTPException(status_code=401, detail="Login fehlgeschlagen")

    try:
        tokens = resp.json()
    except ValueError:
        tokens = None
    access_token = tokens.get("access_token") if isinstance(tokens, dict) else None
    if not access_token or not isinstance(access_token, str):
        # Never set an empty cookie and then claim the login succeeded.
        logger.error("Token exchange response contained no access token")
        raise HTTPException(status_code=401, detail="Login fehlgeschlagen")

    # Max-Age goes straight into the header, so force it to be an integer.
    try:
        expires_in = int(tokens.get("expires_in", 3600))
    except (TypeError, ValueError):
        expires_in = 3600

    # HTML response instead of RedirectResponse: it sets the cookie AND
    # redirects. A RedirectResponse with Set-Cookie is ignored by some
    # browsers on 307/302 (especially behind reverse proxies).
    return HTMLResponse(
        """<!DOCTYPE html><html><head>
        <meta http-equiv="refresh" content="0;url=/">
        </head><body><p>Anmeldung erfolgreich, Weiterleitung...</p></body></html>""",
        headers={
            "Set-Cookie": (
                f"access_token={access_token}; Path=/; Secure; HttpOnly; "
                f"SameSite=Lax; Max-Age={expires_in}"
            )
        },
    )


@app.get("/api/auth/login-url")
async def auth_login_url(request: Request, redirect: str = "/"):
    """Return the Keycloak login URL for the browser redirect.

    When auth is disabled the result is ``{"enabled": False, "url": ""}``.
    """
    if _is_auth_enabled():
        # redirect_uri must point at the callback endpoint, not at the target
        # page — the callback exchanges the code for a token.
        base = str(request.base_url).rstrip("/").replace("http://", "https://")
        callback_uri = f"{base}/api/auth/callback"
        return {"enabled": True, "url": keycloak_login_url(callback_uri)}
    return {"enabled": False, "url": ""}


# API: Load assessments from database
def _assessment_row_to_frontend(row) -> dict:
    """Map one assessment DB row (snake_case keys) to the frontend's key names.

    List-valued fields default to an empty list when the key is missing;
    all other fields default to None.
    """
    return {
        "drucksache": row.get("drucksache"),
        "title": row.get("title"),
        "fraktionen": row.get("fraktionen", []),
        "datum": row.get("datum"),
        "link": row.get("link"),
        "bundesland": row.get("bundesland"),
        "gwoeScore": row.get("gwoe_score"),
        "gwoeBegründung": row.get("gwoe_begruendung"),
        "gwoeMatrix": row.get("gwoe_matrix", []),
        "gwoeSchwerpunkt": row.get("gwoe_schwerpunkt", []),
        "wahlprogrammScores": row.get("wahlprogramm_scores", []),
        "verbesserungen": row.get("verbesserungen", []),
        "stärken": row.get("staerken", []),
        "schwächen": row.get("schwaechen", []),
        "empfehlung": row.get("empfehlung"),
        "empfehlungSymbol": row.get("empfehlung_symbol"),
        "verbesserungspotenzial": row.get("verbesserungspotenzial"),
        "themen": row.get("themen", []),
        "antragZusammenfassung": row.get("antrag_zusammenfassung"),
        "antragKernpunkte": row.get("antrag_kernpunkte", []),
        "updatedAt": row.get("updated_at"),
        "source": row.get("source"),
        "model": row.get("model"),
    }


@app.get("/api/assessments")
async def list_assessments(bundesland: Optional[str] = None):
    """Return assessments from database, optionally filtered by Bundesland.

    ``bundesland="ALL"`` and missing parameter both mean "no filter".
    """
    rows = await get_all_assessments(bundesland)
    # Convert DB format to frontend format
    return [_assessment_row_to_frontend(row) for row in rows]


# API: Get single assessment (use query param for drucksache with /)
@app.get("/api/assessment")
async def get_single_assessment(drucksache: str):
    """Get a single assessment by drucksache ID.

    The ID is passed through ``validate_drucksache`` first; an unknown ID
    yields HTTP 404.
    """
    drucksache = validate_drucksache(drucksache)
    row = await get_assessment(drucksache)
    if not row:
        raise HTTPException(status_code=404, detail="Assessment nicht gefunden")

    return _assessment_row_to_frontend(row)


# API: Delete assessment for re-analysis (#97)
@app.delete("/api/assessment/delete")
async def delete_assessment_endpoint(
    drucksache: str,
    user: dict = Depends(require_auth),
):
    """Delete an assessment so that it can be analysed again.

    Also removes the on-demand PDF that ``/api/assessment/pdf`` caches under
    ``settings.reports_dir``. That endpoint reuses an existing file without
    regenerating it, so a PDF left behind would be served for the
    re-analysed assessment.

    Raises:
        HTTPException 404: no assessment with this ID exists.
    """
    drucksache = validate_drucksache(drucksache)
    deleted = await delete_assessment(drucksache)
    if not deleted:
        raise HTTPException(status_code=404, detail="Assessment nicht gefunden")

    # Same file-name scheme as the on-demand PDF endpoint ("/" → "-").
    cached_pdf = settings.reports_dir / f"{drucksache.replace('/', '-')}.pdf"
    try:
        cached_pdf.unlink(missing_ok=True)
    except OSError as exc:
        # The assessment itself is gone; a leftover file is only worth a warning.
        logger.warning("Could not remove cached PDF %s: %s", cached_pdf, exc)

    return {"status": "deleted", "drucksache": drucksache}


# API: Generate PDF on demand for an assessment
@app.get("/api/assessment/pdf")
async def download_assessment_pdf(drucksache: str):
    """Generate (if needed) and download the PDF for an assessment.

    The PDF is cached as ``<drucksache with "/" replaced by "-">.pdf`` in
    ``settings.reports_dir`` and reused on later requests.

    Raises:
        HTTPException 404: no assessment with this ID exists.
        HTTPException 500: building the model or rendering the PDF failed.
    """
    from .models import Assessment

    drucksache = validate_drucksache(drucksache)
    row = await get_assessment(drucksache)
    if not row:
        raise HTTPException(status_code=404, detail="Assessment nicht gefunden")

    # Check if PDF already exists
    safe_name = drucksache.replace("/", "-")
    pdf_path = settings.reports_dir / f"{safe_name}.pdf"

    if not pdf_path.exists():
        # Convert DB row to Assessment model for report generation.
        # Scalar fields fall back to 0 / "" when the stored value is empty.
        assessment_data = {
            "drucksache": row.get("drucksache"),
            "title": row.get("title"),
            "fraktionen": row.get("fraktionen", []),
            "datum": row.get("datum"),
            "link": row.get("link"),
            "gwoe_score": row.get("gwoe_score") or 0,
            "gwoe_begruendung": row.get("gwoe_begruendung") or "",
            "gwoe_matrix": row.get("gwoe_matrix", []),
            "gwoe_schwerpunkt": row.get("gwoe_schwerpunkt", []),
            "wahlprogramm_scores": row.get("wahlprogramm_scores", []),
            "verbesserungen": row.get("verbesserungen", []),
            "staerken": row.get("staerken", []),
            "schwaechen": row.get("schwaechen", []),
            "empfehlung": row.get("empfehlung") or "",
            "empfehlung_symbol": row.get("empfehlung_symbol") or "",
            "verbesserungspotenzial": row.get("verbesserungspotenzial") or "",
            "themen": row.get("themen", []),
            "antrag_zusammenfassung": row.get("antrag_zusammenfassung") or "",
            "antrag_kernpunkte": row.get("antrag_kernpunkte", []),
        }

        try:
            assessment = Assessment(**assessment_data)
            await generate_pdf_report(
                assessment,
                pdf_path,
                bundesland=row.get("bundesland"),
            )
        except Exception as e:
            logger.exception("PDF generation failed for drucksache=%s", drucksache)
            # Remove any partially written file; the exists() check above
            # would otherwise serve it on the next request.
            try:
                pdf_path.unlink(missing_ok=True)
            except OSError as cleanup_error:
                logger.warning("Could not remove partial PDF %s: %s", pdf_path, cleanup_error)
            raise HTTPException(
                status_code=500,
                detail=f"PDF-Generierung fehlgeschlagen: {e}",
            ) from e

    return FileResponse(
        pdf_path,
        media_type="application/pdf",
        filename=f"gwoe-{safe_name}.pdf"
    )


# API: Search internal DB only
@app.get("/api/search")
async def search_internal(
    q: str,
    bundesland: str = "NRW",
    limit: int = 50
):
    """Search the internal assessments database only.

    Each hit reports the Bundesland stored on its own row and falls back to
    the requested ``bundesland`` only when the row carries none. This keeps
    a non-specific request value (e.g. "ALL") from being echoed as the
    Bundesland of every hit.

    Raises:
        HTTPException 400: ``limit`` is smaller than 1.
    """
    q = validate_search_query(q)
    # A non-positive limit would otherwise go to the database layer unchecked.
    if limit < 1:
        raise HTTPException(status_code=400, detail="limit muss mindestens 1 sein")

    db_results = await search_assessments(q, bundesland, limit)

    return [
        {
            "drucksache": row.get("drucksache"),
            "title": row.get("title"),
            "fraktionen": row.get("fraktionen", []),
            "datum": row.get("datum"),
            "link": row.get("link"),
            "bundesland": row.get("bundesland") or bundesland,
            "gwoeScore": row.get("gwoe_score"),
            "themen": row.get("themen", []),
            "status": "checked",
        }
        for row in db_results
    ]


# API: Search external parliament portal (Landtag)
@app.get("/api/search-landtag")
async def search_landtag(
    q: str,
    bundesland: str = "NRW",
    limit: int = 20
):
    """Search the external parliament portal of one Bundesland.

    Hits come back with status "unchecked" and no score, so the frontend can
    offer them for analysis ("Jetzt prüfen").

    A concrete Bundesland is required: the special "ALL" / Bundesweit mode
    cannot select a single Landtag adapter and is rejected with HTTP 400.
    An unsupported Bundesland or a failing search is reported as a dict with
    an ``error`` key rather than as an HTTP error.
    """
    q = validate_search_query(q)
    if bundesland == "ALL" or not bundesland:
        raise HTTPException(
            status_code=400,
            detail="Landtag-Suche benötigt ein konkretes Bundesland",
        )

    adapter = get_adapter(bundesland)
    if not adapter:
        return {"error": f"Bundesland {bundesland} noch nicht unterstützt"}

    try:
        hits = await adapter.search(q, limit)
        return [
            {
                "drucksache": hit.drucksache,
                "title": hit.title,
                "fraktionen": hit.fraktionen,
                "datum": hit.datum,
                "link": hit.link,
                "bundesland": bundesland,
                "typ": hit.typ,
                "gwoeScore": None,
                "status": "unchecked",
            }
            for hit in hits
        ]
    except Exception as e:
        logger.exception("Landtag search error for q=%r bundesland=%s", q, bundesland)
        return {"error": f"Suchfehler: {str(e)}"}


# API: Batch-Analyse (#44) — enqueued ungeprüfte Drucksachen eines BL
@app.post("/api/batch-analyze")
@limiter.limit("3/minute")
async def batch_analyze(
    request: Request,
    bundesland: str = Form(...),
    limit: int = Form(10),
    user: dict = Depends(require_auth),
):
    """Enqueue the newest not-yet-assessed Drucksachen of one Bundesland.

    Searches the Landtag portal with an empty query and enqueues up to
    ``limit`` documents that have no assessment in the database yet.
    Documents whose text cannot be downloaded are skipped. When the queue is
    full, the job just created is marked "rejected" and the batch stops.

    Returns:
        Summary dict with the number of enqueued and skipped documents and
        the list of enqueued jobs including their queue position.

    Raises:
        HTTPException 400: ``limit`` is outside 1-100 or the Bundesland has
            no adapter.
    """
    from .queue import enqueue, QueueFullError

    if limit < 1 or limit > 100:
        raise HTTPException(status_code=400, detail="limit muss 1-100 sein")

    adapter = get_adapter(bundesland)
    if not adapter:
        raise HTTPException(status_code=400, detail=f"Bundesland {bundesland} nicht unterstützt")

    # Fetch the newest Drucksachen from the Landtag (empty query = newest);
    # three times the limit, to allow for the type filter.
    drucksachen = await adapter.search("", limit=limit * 3)

    enqueued = []
    skipped = 0
    for doc in drucksachen:
        if len(enqueued) >= limit:
            break
        # Already assessed?
        if await get_assessment(doc.drucksache):
            skipped += 1
            continue
        # Download the text; one broken document must not abort the batch.
        try:
            text = await adapter.download_text(doc.drucksache)
        except Exception:
            logger.exception(
                "batch_analyze: download failed for drucksache=%s", doc.drucksache
            )
            continue
        if not text:
            continue
        # Enqueue
        job_id = str(uuid.uuid4())
        await create_job(job_id, text[:500], bundesland, "qwen-plus")
        try:
            position = await enqueue(
                job_id,
                run_drucksache_analysis,
                job_id, doc.drucksache, text, bundesland, "qwen-plus", doc,
            )
        except QueueFullError:
            # Same bookkeeping as analyze_drucksache: the job row exists but
            # will never run, so do not leave it in its initial state.
            await update_job(job_id, status="rejected", error="Queue voll")
            break
        enqueued.append({
            "drucksache": doc.drucksache,
            "title": doc.title,
            "job_id": job_id,
            "queue_position": position,
        })

    return {
        "status": "batch_enqueued",
        "bundesland": bundesland,
        "enqueued": len(enqueued),
        "skipped_existing": skipped,
        "jobs": enqueued,
    }


# API: Analyze a document from parliament portal
@app.post("/api/analyze-drucksache")
@limiter.limit("10/minute")
async def analyze_drucksache(
    request: Request,
    background_tasks: BackgroundTasks,
    drucksache: str = Form(...),
    bundesland: str = Form("NRW"),
    model: str = Form("qwen-plus"),
    user: dict = Depends(require_auth),
):
    """Fetch one document from the parliament portal and queue its analysis.

    Returns ``already_checked`` without doing anything when an assessment
    exists. Otherwise the text and metadata are downloaded, a job is created
    and handed to the queue (#95).

    Raises:
        HTTPException 400: the Bundesland has no adapter.
        HTTPException 404: the document text could not be downloaded.
        HTTPException 429: the queue is full; the job is marked "rejected".
    """
    drucksache = validate_drucksache(drucksache)

    # Nothing to do if it has been analysed before.
    if await get_assessment(drucksache):
        return {"status": "already_checked", "drucksache": drucksache}

    adapter = get_adapter(bundesland)
    if not adapter:
        raise HTTPException(status_code=400, detail=f"Bundesland {bundesland} nicht unterstützt")

    # Text first, metadata second.
    text = await adapter.download_text(drucksache)
    if not text:
        raise HTTPException(status_code=404, detail=f"Dokument {drucksache} nicht gefunden")
    metadata = await adapter.get_document(drucksache)

    from .queue import enqueue, QueueFullError
    job_id = str(uuid.uuid4())
    await create_job(job_id, text[:500], bundesland, model)

    task_args = (job_id, drucksache, text, bundesland, model, metadata)
    try:
        queue_position = await enqueue(job_id, run_drucksache_analysis, *task_args)
    except QueueFullError:
        await update_job(job_id, status="rejected", error="Queue voll")
        raise HTTPException(
            status_code=429,
            detail="Analyse-Queue ist voll. Bitte später erneut versuchen.",
            headers={"Retry-After": "60"},
        )
    else:
        return {
            "status": "queued",
            "job_id": job_id,
            "drucksache": drucksache,
            "queue_position": queue_position,
        }


async def run_drucksache_analysis(
    job_id: str,
    drucksache: str,
    text: str,
    bundesland: str,
    model: str,
    doc
):
    """Queue task: analyse one Drucksache, store the assessment, render reports.

    ``doc`` is the adapter's document metadata and may be falsy; its title,
    date and link are then replaced by fallbacks. Any exception is logged
    with its traceback and the job is marked "failed"; nothing is re-raised.
    """
    try:
        await update_job(job_id, status="processing")

        # LLM analysis
        result = await analyze_antrag(text, bundesland, model)

        # Record in the key format that upsert_assessment receives.
        record = {
            "drucksache": drucksache,
            # _pick_best_title prefers the adapter title when it is a real
            # title and falls back to the LLM title when the adapter only
            # supplies a generic "Drucksache XX" (as NRW's get_document does).
            "title": _pick_best_title(result.title, doc.title if doc else None, drucksache),
            "fraktionen": result.fraktionen,
            "datum": result.datum or (doc.datum if doc else ""),
            "link": doc.link if doc else "",
            "bundesland": bundesland,
            "gwoeScore": result.gwoe_score,
            "gwoeBegründung": result.gwoe_begruendung,
            "gwoeMatrix": [entry.model_dump() for entry in result.gwoe_matrix],
            "gwoeSchwerpunkt": result.gwoe_schwerpunkt,
            "wahlprogrammScores": [entry.model_dump() for entry in result.wahlprogramm_scores],
            "verbesserungen": [entry.model_dump() for entry in result.verbesserungen],
            "stärken": result.staerken,
            "schwächen": result.schwaechen,
            "empfehlung": result.empfehlung,
            "empfehlungSymbol": result.empfehlung_symbol,
            "verbesserungspotenzial": result.verbesserungspotenzial,
            "themen": result.themen,
            "antragZusammenfassung": result.antrag_zusammenfassung,
            "antragKernpunkte": result.antrag_kernpunkte,
            "source": "webapp",
            "model": model,
        }
        await upsert_assessment(record)

        # Reports are named after the job, not the Drucksache.
        reports_dir = settings.reports_dir
        html_path = reports_dir / f"{job_id}.html"
        pdf_path = reports_dir / f"{job_id}.pdf"
        await generate_html_report(result, html_path, bundesland=bundesland)
        await generate_pdf_report(result, pdf_path, bundesland=bundesland)

        await update_job(
            job_id,
            status="completed",
            result=result.model_dump_json(),
            html_path=str(html_path),
            pdf_path=str(pdf_path),
        )
    except Exception as e:
        # Full stack trace via logger.exception, NOT print — that way it ends
        # up in the structured container log, formatted by the logging framework.
        logger.exception("run_drucksache_analysis failed for drucksache=%s", drucksache)
        await update_job(job_id, status="failed", error=str(e))

# API: List available Bundesländer
@app.get("/api/bundeslaender")
async def list_bundeslaender():
    """List the available Bundesländer together with their status.

    The synthetic "ALL" / Bundesweit entry is always the first item so the
    frontend can render the list as-is. Every entry carries
    ``parlament_name`` so the detail view can show the source parliament
    without a further backend round-trip.
    """
    entries = [
        {
            "code": "ALL",
            "name": "🌍 Bundesweit",
            "parlament_name": None,
            "active": True,
        }
    ]
    for land in alle_bundeslaender():
        entries.append({
            "code": land.code,
            "name": land.name,
            "parlament_name": land.parlament_name,
            "active": land.aktiv,
        })
    return entries


# === Quellen / Programme ===

@app.get("/methodik", response_class=HTMLResponse)
async def methodik_page(request: Request):
    """Render the transparency / methodology page (#96).

    The template receives the number of registered adapters, the default
    LLM model name, programme and chunk counts taken from the indexing
    status, and the active Bundesländer sorted by name.
    """
    from .bundeslaender import aktive_bundeslaender

    bl_list = [
        {
            "code": bl.code,
            "name": bl.name,
            "doku_system": bl.doku_system,
        }
        for bl in aktive_bundeslaender()
    ]

    status = get_indexing_status()

    return templates.TemplateResponse("methodik.html", {
        "request": request,
        "app_name": settings.app_name,
        "adapter_count": len(ADAPTERS),
        "model_name": settings.llm_model_default,
        "programme_count": status.get("total", 0),
        # Sum of the per-programme chunk counts; a missing count is 0.
        "chunk_count": sum(p.get("chunks", 0) for p in status.get("programmes", [])),
        "bundeslaender": sorted(bl_list, key=lambda x: x["name"]),
    })


@app.get("/quellen", response_class=HTMLResponse)
async def quellen_page(request: Request):
    """Render the sources page listing all election and party programmes.

    Programmes of type "parteiprogramm" are collected separately; all others
    are grouped by Bundesland name and the groups are sorted alphabetically.
    """
    from .bundeslaender import BUNDESLAENDER
    programmes = get_programme_info()
    status = get_indexing_status()

    grundsatz = []
    by_bl: dict[str, list] = {}
    for prog in programmes:
        if prog["typ"] == "parteiprogramm":
            grundsatz.append(prog)
            continue
        # A missing or empty Bundesland goes into the "Sonstige" group; an
        # unknown code is used as the group name unchanged.
        code = prog.get("bundesland") or "Sonstige"
        group_name = BUNDESLAENDER[code].name if code in BUNDESLAENDER else code
        if group_name not in by_bl:
            by_bl[group_name] = []
        by_bl[group_name].append(prog)

    context = {
        "request": request,
        "app_name": settings.app_name,
        "programmes": programmes,
        "wahlprogramme_grouped": sorted(by_bl.items()),
        "grundsatzprogramme": grundsatz,
        "status": status,
    }
    return templates.TemplateResponse("quellen.html", context)


def _resolve_programme_id(pid: str, pdf: str) -> str:
    """Return the PROGRAMME key for a request, falling back to a PDF-name lookup.

    If ``pid`` is given (or no ``pdf`` name is supplied) it is returned
    unchanged. Otherwise the PDF file name is matched against the ``pdf``
    entry of every registered programme in two stages: exact match first,
    then with a trailing ``-YYYY`` year suffix stripped, because older
    assessments may carry hallucinated file names such as
    ``gruene-grundsatzprogramm-2020.pdf``. Returns ``pid`` (empty) when
    nothing matches.
    """
    if pid or not pdf:
        return pid

    import re

    candidates = [pdf]
    stripped = re.sub(r"-\d{4}\.pdf$", ".pdf", pdf)
    if stripped != pdf:
        candidates.append(stripped)

    for candidate in candidates:
        for key, info in PROGRAMME.items():
            if info.get("pdf") == candidate:
                return key
    return pid


def _reanalysis_wait_page(ds: str) -> str:
    """Build the HTML wait page shown while a Drucksache is re-analysed.

    ``ds`` comes straight from the query string, so it is HTML-escaped
    before being placed into the attribute and text positions below.
    """
    from html import escape

    safe_ds = escape(ds, quote=True)
    return f"""<!DOCTYPE html>
<html><head><meta charset="utf-8">
<meta http-equiv="refresh" content="15;url=/#assessment={safe_ds}">
<title>Wird neu analysiert…</title>
<style>body{{font-family:sans-serif;display:flex;justify-content:center;align-items:center;height:100vh;margin:0;background:#f5f5f5}}
.box{{text-align:center;padding:2rem;background:#fff;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,.1)}}
.spinner{{width:40px;height:40px;border:4px solid #ddd;border-top:4px solid #009da5;border-radius:50%;animation:spin 1s linear infinite;margin:1rem auto}}
@keyframes spin{{to{{transform:rotate(360deg)}}}}</style></head>
<body><div class="box">
<div class="spinner"></div>
<h2>Zitat nicht verifizierbar</h2>
<p>Der Antrag <strong>{safe_ds}</strong> wird mit der aktuellen Pipeline<br>
neu analysiert, um verifizierte Zitate zu erzeugen.</p>
<p style="color:#666;font-size:0.9rem">Automatische Weiterleitung in 15 Sekunden…</p>
</div></body></html>"""


@app.get("/api/wahlprogramm-cite")
async def wahlprogramm_cite(
    request: Request,
    background_tasks: BackgroundTasks,
    pid: str = "", pdf: str = "", seite: int = 1, q: str = "",
    ds: str = "", bl: str = "",
):
    """Serve one programme page as PDF with the cited snippet highlighted.

    Issue #47: clicking a citation source in the report should jump
    straight to the passage in the programme PDF with the quoted snippet
    visually marked. Instead of delivering the whole PDF, the page
    produced by ``render_highlighted_page`` is returned.

    Accepts ``pid`` (a PROGRAMME key) OR ``pdf`` (a file name such as
    ``spd-grundsatzprogramm.pdf``). The latter keeps pre-#47 URLs usable,
    where only the static path ``/static/referenzen/<pdf>#page=<N>`` was
    stored.

    If the quote could not be highlighted and both ``ds`` (Drucksache) and
    ``bl`` (Bundesland) are given, the stored assessment is replaced by a
    fresh background analysis and an HTML wait page is returned instead
    of the PDF.

    Security: ``pid`` must be a registered PROGRAMME key, which prevents
    path traversal and arbitrary file reads. ``seite`` is coerced to int
    by the framework and range-checked here.

    Raises:
        HTTPException: 404 for an unknown programme or an unavailable
            PDF/page, 400 for a page number outside 1..2000.
    """
    pid = _resolve_programme_id(pid, pdf)
    if pid not in PROGRAMME:
        raise HTTPException(status_code=404, detail="Unbekanntes Wahlprogramm")
    if seite < 1 or seite > 2000:
        raise HTTPException(status_code=400, detail="Ungültige Seitennummer")

    pdf_bytes, found_page, highlighted = render_highlighted_page(pid, seite, q)
    if pdf_bytes is None:
        raise HTTPException(
            status_code=404,
            detail="Wahlprogramm-PDF oder Seite nicht verfügbar",
        )

    # Issue #47: if the quote cannot be found in the PDF AND the Drucksache
    # is known, the assessment is probably a pre-#60 hallucination victim.
    # Trigger a re-analysis and show a wait page instead of a PDF without
    # highlights.
    # NOTE(review): this is a GET endpoint without auth or rate limit that
    # can delete an assessment and start an LLM run — consider guarding it.
    if not highlighted and q and ds and bl:
        existing = await get_assessment(ds)
        adapter = get_adapter(bl) if existing else None
        if adapter:
            # Fetch everything the re-analysis needs BEFORE touching the
            # database: if the download fails or yields no text, the old
            # assessment stays in place and no orphaned job is created.
            text = await adapter.download_text(ds)
            if text:
                doc = await adapter.get_document(ds)
                await delete_assessment(ds)
                job_id = str(uuid.uuid4())
                await create_job(job_id, f"Re-Analyse {ds} (Zitat nicht verifizierbar)", bl, "qwen-plus")
                background_tasks.add_task(
                    run_drucksache_analysis,
                    job_id, ds, text, bl, "qwen-plus", doc,
                )
                # HTML wait page with auto-redirect back to the assessment.
                return HTMLResponse(_reanalysis_wait_page(ds))

    info = PROGRAMME[pid]
    safe_name = info.get("pdf", f"{pid}.pdf")
    return Response(
        content=pdf_bytes,
        media_type="application/pdf",
        headers={
            "Content-Disposition": f'inline; filename="{safe_name}"',
            "Cache-Control": "public, max-age=3600",
            "X-Found-Page": str(found_page),
        },
    )


@app.get("/api/programme/thumbnail/{programm_id}")
async def programme_thumbnail(programm_id: str):
    """Return a PNG thumbnail (200 px wide) of a programme PDF's first page.

    The response carries ``Cache-Control: public, max-age=86400``.

    Raises:
        HTTPException: 404 if the programme is unknown, has no ``pdf``
            entry, or the file does not exist; 500 if rendering fails.
    """
    import fitz

    if programm_id not in PROGRAMME:
        raise HTTPException(status_code=404)
    pdf_name = PROGRAMME[programm_id].get("pdf")
    if not pdf_name:
        raise HTTPException(status_code=404)
    pdf_path = static_dir / "referenzen" / pdf_name
    if not pdf_path.exists():
        raise HTTPException(status_code=404)

    try:
        # The context manager closes the document even when rendering raises.
        with fitz.open(str(pdf_path)) as doc:
            page = doc[0]
            # Scale proportionally so the result is 200 px wide.
            zoom = 200 / page.rect.width
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            png_bytes = pix.tobytes("png")
    except Exception as exc:
        # Keep the generic 500 for the client, but leave a trace in the log.
        logger.exception("Thumbnail rendering failed for programme %s", programm_id)
        raise HTTPException(status_code=500) from exc

    return Response(
        content=png_bytes,
        media_type="image/png",
        headers={"Cache-Control": "public, max-age=86400"},
    )


@app.get("/api/programme")
async def list_programme():
    """Return the programme overview produced by ``get_programme_info``."""
    programme_overview = get_programme_info()
    return programme_overview


@app.get("/api/programme/status")
async def programme_status():
    """Return the indexing status reported by ``get_indexing_status``."""
    indexing_status = get_indexing_status()
    return indexing_status


@app.post("/api/programme/index")
@limiter.limit("3/minute")
async def index_programme(
    request: Request,
    background_tasks: BackgroundTasks,
    programm_id: Optional[str] = Form(None),
    all_programmes: bool = Form(False),
    user: dict = Depends(require_auth),
):
    """Schedule indexing of one or all programmes for semantic search.

    With ``all_programmes`` set, every registered programme is indexed one
    after another in a single background task; otherwise ``programm_id``
    must name a registered programme. The work runs after the response
    has been sent.

    Raises:
        HTTPException: 400 if neither a valid ``programm_id`` nor
            ``all_programmes`` is supplied.
    """
    pdf_dir = static_dir / "referenzen"

    if all_programmes:
        programme_ids = list(PROGRAMME)

        # Deliberately a plain function, not ``async def``: index_programm
        # is called synchronously here, and a coroutine background task
        # would run it on the event loop and stall all other requests for
        # the whole indexing run. A sync task is executed in the
        # threadpool, like the single-programme path below.
        def index_all_sequential() -> None:
            # One programme at a time to avoid DB locks; a failure is
            # logged and does not abort the remaining programmes.
            for prog_id in programme_ids:
                try:
                    index_programm(prog_id, pdf_dir)
                except Exception:
                    logger.exception("Error indexing programme %s", prog_id)

        background_tasks.add_task(index_all_sequential)
        return {"status": "indexing", "programmes": programme_ids}

    if programm_id and programm_id in PROGRAMME:
        background_tasks.add_task(index_programm, programm_id, pdf_dir)
        return {"status": "indexing", "programm_id": programm_id}

    raise HTTPException(status_code=400, detail="Ungültiges Programm")


# ─────────────────────────────────────────────────────────────────────────────
# Auswertungen #58 — Bundesland × Partei × Wahlperiode Aggregations-Sicht
# ─────────────────────────────────────────────────────────────────────────────


@app.get("/auswertungen", response_class=HTMLResponse)
async def auswertungen_page(request: Request):
    """Render the evaluations page; it loads the matrix endpoints via fetch()."""
    from .wahlperioden import all_wahlperioden

    context = {
        "request": request,
        "app_name": settings.app_name,
        "wahlperioden": sorted(all_wahlperioden()),
    }
    return templates.TemplateResponse("auswertungen.html", context)


@app.get("/api/auswertungen/matrix")
async def auswertungen_matrix(wahlperiode: Optional[str] = None):
    """Return the Bundesland × party matrix (count and average GWÖ score).

    ``wahlperiode`` is passed through to ``aggregate_matrix`` as its
    ``filter_wp`` argument.
    """
    from .auswertungen import aggregate_matrix

    matrix = aggregate_matrix(filter_wp=wahlperiode)
    return matrix


@app.get("/api/auswertungen/zeitreihe")
async def auswertungen_zeitreihe(bundesland: str, partei: str):
    """Return the score history of one (Bundesland, party) pair across all legislative periods."""
    from .auswertungen import aggregate_zeitreihe

    series = aggregate_zeitreihe(bundesland, partei)
    return series


@app.get("/api/auswertungen/export.csv")
async def auswertungen_export_csv():
    """Offer all assessments as a long-format CSV download (also covers #45)."""
    from .auswertungen import export_long_format

    csv_body = export_long_format()
    download_headers = {
        "Content-Disposition": 'attachment; filename="gwoe-assessments.csv"',
    }
    return Response(content=csv_body, media_type="text/csv", headers=download_headers)


# Health check
@app.get("/health")
async def health():
    """Report a static "ok" status together with the configured app version."""
    payload = {"status": "ok", "version": settings.app_version}
    return payload
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								"""GWÖ-Antragsprüfer — FastAPI Webapp."""
-												Phase A: Audit-Restbefunde #57.3/4/7 (Roadmap #59)

Drei verbleibende Audit-Befunde aus #57 in einem Patch:

- **#57.3 MEDIUM** Drucksache-Regex-Validation: neue
  app/validators.py mit validate_drucksache() als gemeinsamer
  Validation-Funnel. Pattern ^\d{1,3}/\d{1,7}([-(].{1,20})?$ deckt
  alle 10 aktiven Bundesländer (8/6390, 18/12345, 8/6390(neu),
  23/3700-A) ab und blockt Path-Traversal (../, /etc/passwd) plus
  Standard-Injection (;, <, &). Drei Endpoints durchgeschleust:
  /api/assessment, /api/assessment/pdf, /api/analyze-drucksache.

- **#57.4 MEDIUM** print() → logging.getLogger(__name__): main.py
  und analyzer.py auf strukturiertes Logging umgestellt. LLM-Inhalte
  werden NICHT mehr als Volltext geloggt — neue Helper
  _content_fingerprint() liefert nur "len=N sha1=XXXX", reicht zur
  Forensik ohne Antrag-Inhalte ins Container-Log zu leaken.
  basicConfig() mit ISO-Format setzt strukturiertes Logging früh,
  damit logger.exception() auch beim Boot greift.

- **#57.7 LOW-MED** Search-Query-Limit: validate_search_query() mit
  MAX_SEARCH_QUERY_LEN=200 schützt /api/search und /api/search-landtag
  vor 10-MB-Query-DoS. database._parse_search_query() loggt jetzt
  shlex.ValueError-Fallback statt ihn zu verschlucken (deckt Memory-
  Regel "stille excepts in Adaptern" ab).

Tests: neue tests/test_main_validators.py mit 22 Cases — Drucksache-
Whitelist-Roundtrip + Path-Traversal-Reject, Search-Query Längen-
Edge-Cases. 107 Unit-Tests grün (85 alt + 22 neu).

Validators in eigenem Modul (app/validators.py), damit Tests sie ohne
slowapi-Dependency direkt importieren können.

Refs: #57, #59 (Phase A)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:15:16 +02:00
+								import logging
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								import uuid
 								from pathlib import Path
 								from typing import Optional
-												#43 Keycloak SSO: JWT-Middleware + UI-Guiding

Auth-Schicht vorbereitet — Dev-Modus (KEYCLOAK_URL leer) lässt alles
durch, Prod-Modus (ENV gesetzt) validiert JWT gegen Keycloak-JWKS.

Backend (app/auth.py):
- JWKS-Cache mit 1h TTL (async httpx fetch)
- get_current_user: Optional, gibt User-Dict oder None
- require_auth: Pflicht, gibt User-Dict oder HTTP 401
- keycloak_login_url: Baut die OIDC-Login-URL
- _is_auth_enabled: prüft ob alle 3 ENV-Vars gesetzt sind

Abgesicherte POST-Endpoints:
- POST /analyze → Depends(require_auth)
- POST /api/analyze-drucksache → Depends(require_auth)
- POST /api/programme/index → Depends(require_auth)

Neue Endpoints:
- GET /api/auth/me → {authenticated, sub, email, name, roles} oder {authenticated: false}
- GET /api/auth/login-url → {enabled, url} für Keycloak-Redirect

Frontend (index.html):
- initAuth() beim DOMContentLoaded → prüft /api/auth/me
- "Anmelden"-Button im Header (neben "Quellen")
- "Jetzt prüfen"-Button: disabled + Tooltip "Nur nach Anmeldung
  verfügbar" wenn nicht eingeloggt; aktiv wenn eingeloggt
- currentUser-State steuert Button-Zustände

Dev-Modus: Solange KEYCLOAK_URL nicht gesetzt ist (lokale Dev, aktueller
Prod-Stand), sind alle Endpoints offen wie bisher. Kein Breaking Change.

Dependency: python-jose[cryptography]>=3.3.0 in requirements.txt.

Tests: 194/194 grün (auth.py hat keine Seiteneffekte im Import).

Refs: #43

											
										
										
											2026-04-10 14:28:57 +02:00
+								from fastapi import FastAPI, File, Form, UploadFile, Request, BackgroundTasks, HTTPException, Depends
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								from fastapi.responses import HTMLResponse, FileResponse, JSONResponse, Response
 								from starlette.middleware.base import BaseHTTPMiddleware
 								from fastapi.staticfiles import StaticFiles
 								from fastapi.templating import Jinja2Templates
-												Security hotfixes #1, #2, #6 from audit (#57)

Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape() bevor sie in
  die HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (war never used, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:45:43 +02:00
+								from slowapi import Limiter, _rate_limit_exceeded_handler
 								from slowapi.util import get_remote_address
 								from slowapi.errors import RateLimitExceeded
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
-												Phase A: Audit-Restbefunde #57.3/4/7 (Roadmap #59)

Drei verbleibende Audit-Befunde aus #57 in einem Patch:

- **#57.3 MEDIUM** Drucksache-Regex-Validation: neue
  app/validators.py mit validate_drucksache() als gemeinsamer
  Validation-Funnel. Pattern ^\d{1,3}/\d{1,7}([-(].{1,20})?$ deckt
  alle 10 aktiven Bundesländer (8/6390, 18/12345, 8/6390(neu),
  23/3700-A) ab und blockt Path-Traversal (../, /etc/passwd) plus
  Standard-Injection (;, <, &). Drei Endpoints durchgeschleust:
  /api/assessment, /api/assessment/pdf, /api/analyze-drucksache.

- **#57.4 MEDIUM** print() → logging.getLogger(__name__): main.py
  und analyzer.py auf strukturiertes Logging umgestellt. LLM-Inhalte
  werden NICHT mehr als Volltext geloggt — neue Helper
  _content_fingerprint() liefert nur "len=N sha1=XXXX", reicht zur
  Forensik ohne Antrag-Inhalte ins Container-Log zu leaken.
  basicConfig() mit ISO-Format setzt strukturiertes Logging früh,
  damit logger.exception() auch beim Boot greift.

- **#57.7 LOW-MED** Search-Query-Limit: validate_search_query() mit
  MAX_SEARCH_QUERY_LEN=200 schützt /api/search und /api/search-landtag
  vor 10-MB-Query-DoS. database._parse_search_query() loggt jetzt
  shlex.ValueError-Fallback statt ihn zu verschlucken (deckt Memory-
  Regel "stille excepts in Adaptern" ab).

Tests: neue tests/test_main_validators.py mit 22 Cases — Drucksache-
Whitelist-Roundtrip + Path-Traversal-Reject, Search-Query Längen-
Edge-Cases. 107 Unit-Tests grün (85 alt + 22 neu).

Validators in eigenem Modul (app/validators.py), damit Tests sie ohne
slowapi-Dependency direkt importieren können.

Refs: #57, #59 (Phase A)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:15:16 +02:00
+								from .validators import (
 								    MAX_SEARCH_QUERY_LEN,
 								    validate_drucksache,
 								    validate_search_query,
 								)
 								# Strukturiertes Logging für die ganze App. uvicorn registriert seinen
 								# eigenen Root-Handler erst beim Start; wir setzen ein neutrales Format
 								# für unsere Module früh, damit logger.exception() auch beim Boot greift.
 								logging.basicConfig(
 								    level=logging.INFO,
 								    format="%(asctime)s %(levelname)-7s %(name)s: %(message)s",
 								)
 								logger = logging.getLogger(__name__)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								from .config import settings
 								from .database import (
 								    init_db, get_job, create_job, update_job,
-												#47: Auto-Re-Analyse bei nicht-verifizierbaren Zitaten

Statt eine Nachricht "Textstelle nicht auffindbar" zu zeigen (was User
zurecht als Quatsch bezeichnet hat), erkennt der Cite-Endpoint jetzt
halluzinierte Zitate und triggert automatisch eine Re-Analyse:

Flow:
1. User klickt auf Zitat-Link
2. render_highlighted_page gibt (pdf, page, highlighted=False) zurück
3. Endpoint prüft: ds+bl Parameter vorhanden? Assessment in DB?
4. → Löscht altes Assessment, startet Re-Analyse als Background-Task
5. → Zeigt HTML-Warte-Seite mit Spinner und "Wird neu analysiert..."
6. → Auto-Redirect nach 15s zurück zum Assessment

Das neue Assessment hat durch reconstruct_zitate verifizierte Zitate,
die dann beim nächsten Klick korrekt gehighlighted werden.

Änderungen:
- embeddings.render_highlighted_page: Return-Typ (bytes, int, bool) —
  drittes Element ist True wenn Highlight gesetzt wurde
- database.delete_assessment: neue Funktion für die Re-Analyse
- main.py cite-Endpoint: akzeptiert ds= und bl= als optionale Params,
  triggert Re-Analyse bei highlighted=False + ds vorhanden
- Frontend: makeCiteUrl reicht ds+bl aus dem Assessment-Kontext mit
  durch in die Cite-URL
- Cache-Control auf 1h reduziert (war 24h, zu aggressiv für
  Assessments die sich durch Re-Analyse ändern)

Tests: 194/194 grün.

Refs: #47, #60

											
										
										
											2026-04-10 10:35:01 +02:00
+								    get_all_assessments, get_assessment, delete_assessment,
 								    upsert_assessment, import_json_assessments,
 								    search_assessments,
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								)
 								from .parlamente import get_adapter, ADAPTERS
-												Add central bundeslaender.py module with all 16 states (#7)

Introduces app/bundeslaender.py as the single source of truth for all
bundesland-specific data (parliament name, current legislative period,
upcoming elections, governing coalition, doku system, base URLs,
drucksache format, dokukratie scraper code, active flag, optional
remarks). Data reflects April 2026 state.

main.py::index() and /api/bundeslaender now derive their lists from
this module instead of hardcoding. Frontend dropdown now shows all 16
bundesländer (15 disabled with "(bald)" suffix); previously the
landing template showed only 4. NRW remains the only "aktiv" entry.

API behaviour change worth noting: the /api/bundeslaender endpoint
previously emitted code "ST" for Sachsen-Anhalt; it now emits "LSA"
to match the politically dominant abbreviation. No functional impact
because non-NRW bundesländer were inactive in both versions.

Foundation for #5 and #2; deliberately a no-op for NRW so it can ship
and rollback independently.

Resolves issue #7.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 14:17:54 +02:00
+								from .bundeslaender import alle_bundeslaender
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								from .analyzer import analyze_antrag
-												#43 Keycloak SSO: JWT-Middleware + UI-Guiding

Auth-Schicht vorbereitet — Dev-Modus (KEYCLOAK_URL leer) lässt alles
durch, Prod-Modus (ENV gesetzt) validiert JWT gegen Keycloak-JWKS.

Backend (app/auth.py):
- JWKS-Cache mit 1h TTL (async httpx fetch)
- get_current_user: Optional, gibt User-Dict oder None
- require_auth: Pflicht, gibt User-Dict oder HTTP 401
- keycloak_login_url: Baut die OIDC-Login-URL
- _is_auth_enabled: prüft ob alle 3 ENV-Vars gesetzt sind

Abgesicherte POST-Endpoints:
- POST /analyze → Depends(require_auth)
- POST /api/analyze-drucksache → Depends(require_auth)
- POST /api/programme/index → Depends(require_auth)

Neue Endpoints:
- GET /api/auth/me → {authenticated, sub, email, name, roles} oder {authenticated: false}
- GET /api/auth/login-url → {enabled, url} für Keycloak-Redirect

Frontend (index.html):
- initAuth() beim DOMContentLoaded → prüft /api/auth/me
- "Anmelden"-Button im Header (neben "Quellen")
- "Jetzt prüfen"-Button: disabled + Tooltip "Nur nach Anmeldung
  verfügbar" wenn nicht eingeloggt; aktiv wenn eingeloggt
- currentUser-State steuert Button-Zustände

Dev-Modus: Solange KEYCLOAK_URL nicht gesetzt ist (lokale Dev, aktueller
Prod-Stand), sind alle Endpoints offen wie bisher. Kein Breaking Change.

Dependency: python-jose[cryptography]>=3.3.0 in requirements.txt.

Tests: 194/194 grün (auth.py hat keine Seiteneffekte im Import).

Refs: #43

											
										
										
											2026-04-10 14:28:57 +02:00
+								from .auth import get_current_user, require_auth, keycloak_login_url, _is_auth_enabled
-												Fix: NRW-Titel + Regierungsfraktionen-Pflicht im LLM-Prompt

Bug 1 — NRW-Titel "Drucksache XX/YYYYY":
NRW's get_document machte nur HEAD-Request auf die PDF-URL und gab
title="Drucksache 18/18085" zurück — keinen echten Titel. Fix: nutzt
jetzt search(drucksache) um den echten Eintrag von OPAL zu holen.
Fallback: leerer Titel statt generischer, damit der LLM-Titel nicht
überschrieben wird. Plus _pick_best_title Helper: doc.title nur
übernehmen wenn es ein echter Titel ist (nicht "Drucksache XX").

Bug 2 — Nur Antragsteller im Passungsprofil, keine Regierungsfraktionen:
Der LLM ignorierte die "UND Regierungsfraktionen"-Anweisung im Prompt.
Fix: explizite PFLICHT-FRAKTIONEN-Zeile im User-Prompt:
"Du MUSST folgende Fraktionen in wahlprogrammScores bewerten: SPD, CDU, GRÜNE"
(dedupliziert aus fraktionen + regierungsfraktionen).

Tests: 194/194 grün.
Batch-Re-Analyse muss nochmal laufen mit den Fixes (21 bereits fertig,
15 noch offen — werden alle erneut benötigt weil die Titel/Fraktionen
in den neuen Assessments falsch sind).

											
										
										
											2026-04-10 16:05:57 +02:00
 								def _pick_best_title(llm_title: str, doc_title: Optional[str], drucksache: str) -> str:
 								    """Wähle den besten Titel aus LLM-Output und Adapter-Metadata.
 								    Priorität:
 								    1. doc_title, wenn ein echter Titel (nicht "Drucksache XX")
 								    2. llm_title, wenn nicht leer und nicht generisch
 								    3. Generischer Fallback "Drucksache XX"
 								    """
 								    generic_prefix = f"Drucksache {drucksache.split('/')[0]}"
 								    # doc_title gut? (nicht generisch, nicht leer)
 								    if doc_title and not doc_title.startswith("Drucksache ") and len(doc_title) > 5:
 								        return doc_title
 								    # LLM-Titel gut? (nicht generisch)
 								    if llm_title and not llm_title.startswith("Drucksache ") and len(llm_title) > 5:
 								        return llm_title
 								    # doc_title als Fallback (auch wenn generisch)
 								    return doc_title or llm_title or f"Drucksache {drucksache}"
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								from .report import generate_html_report, generate_pdf_report
 								from .embeddings import (
 								    init_embeddings_db, get_programme_info, get_indexing_status,
-												#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render

Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47

											
										
										
											2026-04-10 01:09:45 +02:00
+								    index_programm, render_highlighted_page, PROGRAMME,
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								)
 								# Application instance. Title and version come from the settings object;
 								# the three *_url=None arguments switch off the interactive docs pages
 								# and the OpenAPI schema endpoint.
 								app = FastAPI(
 								    title=settings.app_name,
 								    version=settings.app_version,
 								    docs_url=None,      # Disable /docs in production
 								    redoc_url=None,     # Disable /redoc in production
 								    openapi_url=None,   # Disable /openapi.json in production
 								)
-												Security hotfixes #1, #2, #6 from audit (#57)

Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape() bevor sie in
  die HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (war never used, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:45:43 +02:00
+								# Rate limiter — guards the expensive POST endpoints (LLM calls +
 								# indexing) against resource exhaustion. Issue #57 finding #1 (HIGH).
 								# Uses the default in-memory storage; running several workers would
 								# require switching to Redis, but as long as we run in a single
 								# container the default storage is sufficient.
 								# default_limits=[] means no global limit: only routes that carry an
 								# explicit @limiter.limit(...) decorator are throttled.
 								limiter = Limiter(key_func=get_remote_address, default_limits=[])
 								app.state.limiter = limiter
 								app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								# Security Headers Middleware
 								class SecurityHeadersMiddleware(BaseHTTPMiddleware):
 								    """Stamp a fixed set of security-related headers onto every response."""
 								    async def dispatch(self, request: Request, call_next):
 								        """Run the downstream handler, then overwrite the security headers.
 								        The headers are written after ``call_next`` returns, so any value a
 								        route set for the same header name is replaced.
 								        """
 								        hardening_headers = {
 								            "X-Content-Type-Options": "nosniff",
 								            "X-Frame-Options": "DENY",
 								            "X-XSS-Protection": "1; mode=block",
 								            "Referrer-Policy": "strict-origin-when-cross-origin",
 								            "Permissions-Policy": "geolocation=(), microphone=(), camera=()",
 								            # CSP: Allow self, inline styles (for templates), and PDF viewer
 								            "Content-Security-Policy": (
 								                "default-src 'self'; "
 								                "style-src 'self' 'unsafe-inline'; "
 								                "script-src 'self' 'unsafe-inline'; "
 								                "img-src 'self' data:; "
 								                "frame-ancestors 'none';"
 								            ),
 								        }
 								        outgoing = await call_next(request)
 								        for header_name, header_value in hardening_headers.items():
 								            outgoing.headers[header_name] = header_value
 								        return outgoing
 								# Register the security-header middleware on the application.
 								app.add_middleware(SecurityHeadersMiddleware)
 								# Setup directories
 								# exist_ok=True makes these calls a no-op when the directory is already
 								# there; parent directories are not created.
 								settings.data_dir.mkdir(exist_ok=True)
 								settings.reports_dir.mkdir(exist_ok=True)
 								# Static files and templates
 								# Both directories live next to this module and are created on demand
 								# before StaticFiles / Jinja2Templates are pointed at them.
 								static_dir = Path(__file__).parent / "static"
 								templates_dir = Path(__file__).parent / "templates"
 								static_dir.mkdir(exist_ok=True)
 								templates_dir.mkdir(exist_ok=True)
 								app.mount("/static", StaticFiles(directory=static_dir), name="static")
 								templates = Jinja2Templates(directory=str(templates_dir))
 								@app.on_event("startup")
 								async def startup():
 								    await init_db()
 								    init_embeddings_db()
-												#95 Job-Queue: SQLite-backed asyncio Worker mit Backpressure

FIFO-Queue für Analyse-Jobs — ersetzt FastAPI BackgroundTasks:

app/queue.py:
- asyncio.Queue mit MAX_QUEUE_SIZE=50
- Einzelner Worker-Coroutine (Concurrency=1, DashScope-freundlich)
- MIN_PAUSE_SECONDS=10 zwischen Jobs
- Exponentielles Backoff bei Serien-Fehlern (15s → 5min)
- get_queue_status() für den Status-Endpoint
- QueueFullError → HTTP 429 + Retry-After Header
- start_worker() als FastAPI-Startup-Task
- re_enqueue_pending() markiert Crash-Überlebende als 'stale'

main.py:
- POST /api/analyze-drucksache nutzt queue.enqueue() statt
  background_tasks.add_task()
- Response enthält queue_position
- GET /api/queue/status zeigt pending, max_size, processed,
  estimated_wait_seconds, worker_running
- Worker wird bei app.startup() gestartet

Tests: 201 passed, 5 skipped.

Refs: #95, #44 (Batch baut auf Queue auf)

											
										
										
											2026-04-10 17:24:34 +02:00
+								    # Job-Queue Worker starten (#95)
 								    from .queue import start_worker, re_enqueue_pending
 								    await re_enqueue_pending()
 								    start_worker()
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    # JSON import disabled - all assessments now live in SQLite DB only
 								    # Legacy import would overwrite new v5 assessments with old format
 								    # count = await import_json_assessments(settings.data_dir / "assessments")
 								    # if count > 0:
 								    #     print(f"Imported {count} assessments from JSON files")
 								@app.get("/", response_class=HTMLResponse)
 								async def index(request: Request):
 								    """Landing page with upload form."""
-												Bundesland filter & transparency: stringent split + visible source (#8)

Brings the Bundesland-Dropdown from a cosmetic header widget to a real
filter that propagates through every layer (Listing, internal search,
statistics, party/tag filters, upload mode), and at the same time makes
the source parliament visible in every place where assessments from
multiple bundesländer can be mixed.

Backend
- database.get_all_assessments(bundesland=None) — new optional filter,
  "ALL" treated as None.
- database.search_assessments — bug fix: previous `if bundesland:`
  branch incorrectly added a `WHERE bundesland='ALL'` clause; now
  guarded with `bundesland and bundesland != "ALL"`.
- main.list_assessments — accepts ?bundesland= query param, includes the
  bundesland field in the response so the frontend can render badges.
- main.get_single_assessment — also includes bundesland in the response
  so the detail header can show the source parliament.
- main.search_landtag — early HTTP 400 when bundesland is missing or
  "ALL"; the live Landtag adapter cannot serve a synthetic Bundesweit
  request.
- main.index() and main.list_bundeslaender — synthetic "🌍 Bundesweit"
  entry prepended to the bundesländer list (kept out of bundeslaender.py
  on purpose — ALL is not a real state). Both endpoints additionally
  expose a parlament_names map so the frontend can render the source
  parliament without an extra round-trip.

Report (PDF + HTML)
- generate_html_report / generate_pdf_report — new optional bundesland
  parameter. When set, the report header carries the parliament name
  ("Landtag von Sachsen-Anhalt", "Landtag Nordrhein-Westfalen", …)
  beside the title. Three call sites updated: run_analysis,
  run_drucksache_analysis, download_assessment_pdf.

Frontend (templates/index.html)
- Header dropdown gets the synthetic ALL entry as first option;
  initial currentBundesland is now 'ALL' (was 'NRW').
- localStorage persistence: changeBundesland writes, DOMContentLoaded
  reads and validates against the visible options.
- changeBundesland resets the score / party / tag filter state, syncs
  the upload-mode bundesland select, disables the Landtag-Suche button
  + tooltip when ALL, and toggles a data-mode attribute on
  .list-content (used by CSS to show/hide the per-item bundesland
  badge).
- loadAssessments now sends ?bundesland=… so the API does the actual
  filtering. updateStats renders an additional per-bundesland average
  block (Ø NRW: x · Ø LSA: y) when in ALL mode and the loaded list
  spans more than one bundesland.
- renderList prepends a small "bl-badge" beside the Drucksachen-Nummer.
  Hidden in single-bundesland mode via CSS selector to avoid clutter.
- showDetail header now shows the parliament name as its own line
  (.detail-parlament).
- searchLandtag has an early-out alert if currentBundesland === 'ALL',
  saving a network round-trip.
- Upload-Mode bundesland select now starts with a "— Bundesland wählen
  —" placeholder (no auto-default), and startAnalysis validates that a
  concrete bundesland was chosen.

CSS
- .bl-badge plus the .list-content[data-mode="single"] hide rule.
- .detail-parlament for the detail header line.
- .header-parlament for the PDF report header line.

Resolves #8.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:00:39 +02:00
+								    # Frontend-Liste: synthetischer "ALL"-Eintrag (Bundesweit) zuerst, dann
 								    # die echten Bundesländer aus der Konfig. Der "ALL"-Code ist eine reine
 								    # Frontend/API-Konvention, kein Eintrag in bundeslaender.py.
 								    bl_list = [{"code": "ALL", "name": "🌍 Bundesweit", "active": True}]
 								    bl_list.extend(
 								        {"code": bl.code, "name": bl.name, "active": bl.aktiv}
 								        for bl in alle_bundeslaender()
 								    )
 								    # Map code → parlament_name, damit das Frontend ohne extra Backend-Call
 								    # für jeden Antrag den Parlamentsnamen anzeigen kann.
 								    parlament_names = {
 								        bl.code: bl.parlament_name for bl in alle_bundeslaender()
 								    }
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    return templates.TemplateResponse("index.html", {
 								        "request": request,
 								        "app_name": settings.app_name,
-												Bundesland filter & transparency: stringent split + visible source (#8)

Brings the Bundesland-Dropdown from a cosmetic header widget to a real
filter that propagates through every layer (Listing, internal search,
statistics, party/tag filters, upload mode), and at the same time makes
the source parliament visible in every place where assessments from
multiple bundesländer can be mixed.

Backend
- database.get_all_assessments(bundesland=None) — new optional filter,
  "ALL" treated as None.
- database.search_assessments — bug fix: previous `if bundesland:`
  branch incorrectly added a `WHERE bundesland='ALL'` clause; now
  guarded with `bundesland and bundesland != "ALL"`.
- main.list_assessments — accepts ?bundesland= query param, includes the
  bundesland field in the response so the frontend can render badges.
- main.get_single_assessment — also includes bundesland in the response
  so the detail header can show the source parliament.
- main.search_landtag — early HTTP 400 when bundesland is missing or
  "ALL"; the live Landtag adapter cannot serve a synthetic Bundesweit
  request.
- main.index() and main.list_bundeslaender — synthetic "🌍 Bundesweit"
  entry prepended to the bundesländer list (kept out of bundeslaender.py
  on purpose — ALL is not a real state). Both endpoints additionally
  expose a parlament_names map so the frontend can render the source
  parliament without an extra round-trip.

Report (PDF + HTML)
- generate_html_report / generate_pdf_report — new optional bundesland
  parameter. When set, the report header carries the parliament name
  ("Landtag von Sachsen-Anhalt", "Landtag Nordrhein-Westfalen", …)
  beside the title. Three call sites updated: run_analysis,
  run_drucksache_analysis, download_assessment_pdf.

Frontend (templates/index.html)
- Header dropdown gets the synthetic ALL entry as first option;
  initial currentBundesland is now 'ALL' (was 'NRW').
- localStorage persistence: changeBundesland writes, DOMContentLoaded
  reads and validates against the visible options.
- changeBundesland resets the score / party / tag filter state, syncs
  the upload-mode bundesland select, disables the Landtag-Suche button
  + tooltip when ALL, and toggles a data-mode attribute on
  .list-content (used by CSS to show/hide the per-item bundesland
  badge).
- loadAssessments now sends ?bundesland=… so the API does the actual
  filtering. updateStats renders an additional per-bundesland average
  block (Ø NRW: x · Ø LSA: y) when in ALL mode and the loaded list
  spans more than one bundesland.
- renderList prepends a small "bl-badge" beside the Drucksachen-Nummer.
  Hidden in single-bundesland mode via CSS selector to avoid clutter.
- showDetail header now shows the parliament name as its own line
  (.detail-parlament).
- searchLandtag has an early-out alert if currentBundesland === 'ALL',
  saving a network round-trip.
- Upload-Mode bundesland select now starts with a "— Bundesland wählen
  —" placeholder (no auto-default), and startAnalysis validates that a
  concrete bundesland was chosen.

CSS
- .bl-badge plus the .list-content[data-mode="single"] hide rule.
- .detail-parlament for the detail header line.
- .header-parlament for the PDF report header line.

Resolves #8.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:00:39 +02:00
+								        "bundeslaender": bl_list,
 								        "parlament_names": parlament_names,
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    })
 								@app.post("/analyze")
-												Security hotfixes #1, #2, #6 from audit (#57)

Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape() bevor sie in
  die HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (war never used, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:45:43 +02:00
+								@limiter.limit("10/minute")
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								async def start_analysis(
-												Security hotfixes #1, #2, #6 from audit (#57)

Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape() bevor sie in
  die HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (war never used, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:45:43 +02:00
+								    request: Request,
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    background_tasks: BackgroundTasks,
 								    text: Optional[str] = Form(None),
 								    file: Optional[UploadFile] = File(None),
 								    bundesland: str = Form("NRW"),
 								    model: str = Form("qwen-plus"),
-												Fix SyntaxError: user=Depends nach Form-Params (Python positional-after-default)

											
										
										
											2026-04-10 14:30:54 +02:00
+								    user: dict = Depends(require_auth),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								):
 								    """Start analysis job."""
 								    if not text and not file:
 								        raise HTTPException(status_code=400, detail="Entweder Text oder PDF-Datei erforderlich")
 								    # Extract text from PDF if uploaded
 								    if file and file.filename:
 								        import fitz  # PyMuPDF
 								        pdf_bytes = await file.read()
 								        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
 								        text = ""
 								        for page in doc:
 								            text += page.get_text()
 								        doc.close()
 								    # Create job
 								    job_id = str(uuid.uuid4())
 								    await create_job(job_id, text[:500], bundesland, model)
 								    # Start background analysis
 								    background_tasks.add_task(run_analysis, job_id, text, bundesland, model)
 								    return JSONResponse({"job_id": job_id, "status": "queued"})
 								async def run_analysis(job_id: str, text: str, bundesland: str, model: str):
 								    """Background task for analysis."""
 								    try:
 								        await update_job(job_id, status="processing")
 								        # Run LLM analysis
 								        assessment = await analyze_antrag(text, bundesland, model)
 								        # Generate reports
 								        html_path = settings.reports_dir / f"{job_id}.html"
 								        pdf_path = settings.reports_dir / f"{job_id}.pdf"
-												Bundesland filter & transparency: stringent split + visible source (#8)

Brings the Bundesland-Dropdown from a cosmetic header widget to a real
filter that propagates through every layer (Listing, internal search,
statistics, party/tag filters, upload mode), and at the same time makes
the source parliament visible in every place where assessments from
multiple bundesländer can be mixed.

Backend
- database.get_all_assessments(bundesland=None) — new optional filter,
  "ALL" treated as None.
- database.search_assessments — bug fix: previous `if bundesland:`
  branch incorrectly added a `WHERE bundesland='ALL'` clause; now
  guarded with `bundesland and bundesland != "ALL"`.
- main.list_assessments — accepts ?bundesland= query param, includes the
  bundesland field in the response so the frontend can render badges.
- main.get_single_assessment — also includes bundesland in the response
  so the detail header can show the source parliament.
- main.search_landtag — early HTTP 400 when bundesland is missing or
  "ALL"; the live Landtag adapter cannot serve a synthetic Bundesweit
  request.
- main.index() and main.list_bundeslaender — synthetic "🌍 Bundesweit"
  entry prepended to the bundesländer list (kept out of bundeslaender.py
  on purpose — ALL is not a real state). Both endpoints additionally
  expose a parlament_names map so the frontend can render the source
  parliament without an extra round-trip.

Report (PDF + HTML)
- generate_html_report / generate_pdf_report — new optional bundesland
  parameter. When set, the report header carries the parliament name
  ("Landtag von Sachsen-Anhalt", "Landtag Nordrhein-Westfalen", …)
  beside the title. Three call sites updated: run_analysis,
  run_drucksache_analysis, download_assessment_pdf.

Frontend (templates/index.html)
- Header dropdown gets the synthetic ALL entry as first option;
  initial currentBundesland is now 'ALL' (was 'NRW').
- localStorage persistence: changeBundesland writes, DOMContentLoaded
  reads and validates against the visible options.
- changeBundesland resets the score / party / tag filter state, syncs
  the upload-mode bundesland select, disables the Landtag-Suche button
  + tooltip when ALL, and toggles a data-mode attribute on
  .list-content (used by CSS to show/hide the per-item bundesland
  badge).
- loadAssessments now sends ?bundesland=… so the API does the actual
  filtering. updateStats renders an additional per-bundesland average
  block (Ø NRW: x · Ø LSA: y) when in ALL mode and the loaded list
  spans more than one bundesland.
- renderList prepends a small "bl-badge" beside the Drucksachen-Nummer.
  Hidden in single-bundesland mode via CSS selector to avoid clutter.
- showDetail header now shows the parliament name as its own line
  (.detail-parlament).
- searchLandtag has an early-out alert if currentBundesland === 'ALL',
  saving a network round-trip.
- Upload-Mode bundesland select now starts with a "— Bundesland wählen
  —" placeholder (no auto-default), and startAnalysis validates that a
  concrete bundesland was chosen.

CSS
- .bl-badge plus the .list-content[data-mode="single"] hide rule.
- .detail-parlament for the detail header line.
- .header-parlament for the PDF report header line.

Resolves #8.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:00:39 +02:00
+								        await generate_html_report(assessment, html_path, bundesland=bundesland)
 								        await generate_pdf_report(assessment, pdf_path, bundesland=bundesland)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								        await update_job(
 								            job_id,
 								            status="completed",
 								            result=assessment.model_dump_json(),
 								            html_path=str(html_path),
 								            pdf_path=str(pdf_path),
 								        )
 								    except Exception as e:
 								        await update_job(job_id, status="failed", error=str(e))
 								@app.get("/status/{job_id}")
 								async def get_status(job_id: str):
 								    """Report the stored status and creation time of one analysis job.
 								    Raises:
 								        HTTPException: 404 when ``get_job`` returns nothing for ``job_id``.
 								    """
 								    job_record = await get_job(job_id)
 								    if not job_record:
 								        raise HTTPException(status_code=404, detail="Job nicht gefunden")
 								    payload = {
 								        "job_id": job_id,
 								        "status": job_record["status"],
 								        "created_at": job_record["created_at"],
 								    }
 								    return JSONResponse(payload)
 								@app.get("/result/{job_id}", response_class=HTMLResponse)
 								async def get_result(request: Request, job_id: str):
 								    """Serve the HTML report that was written for a completed job.
 								    Raises:
 								        HTTPException: 404 if the job is unknown, 400 if its status is not
 								            "completed", 500 if the recorded report file is missing on disk.
 								    """
 								    job_record = await get_job(job_id)
 								    if not job_record:
 								        raise HTTPException(status_code=404, detail="Job nicht gefunden")
 								    current_status = job_record["status"]
 								    if current_status != "completed":
 								        raise HTTPException(status_code=400, detail=f"Job noch nicht fertig: {current_status}")
 								    report_file = Path(job_record["html_path"])
 								    # Guard clause: a completed job whose file has vanished is a server error.
 								    if not report_file.exists():
 								        raise HTTPException(status_code=500, detail="Report nicht gefunden")
 								    return HTMLResponse(report_file.read_text())
 								@app.get("/result/{job_id}/pdf")
 								async def get_pdf(job_id: str):
 								    """Send the PDF report of a completed job as a file download.
 								    The download name is built from the first eight characters of the job id.
 								    Raises:
 								        HTTPException: 404 if the job is unknown, 400 if its status is not
 								            "completed", 500 if the recorded PDF file is missing on disk.
 								    """
 								    job_record = await get_job(job_id)
 								    if not job_record:
 								        raise HTTPException(status_code=404, detail="Job nicht gefunden")
 								    current_status = job_record["status"]
 								    if current_status != "completed":
 								        raise HTTPException(status_code=400, detail=f"Job noch nicht fertig: {current_status}")
 								    report_file = Path(job_record["pdf_path"])
 								    if not report_file.exists():
 								        raise HTTPException(status_code=500, detail="PDF nicht gefunden")
 								    download_name = f"gwoe-bericht-{job_id[:8]}.pdf"
 								    return FileResponse(
 								        report_file,
 								        media_type="application/pdf",
 								        filename=download_name,
 								    )
-												#95 Job-Queue: SQLite-backed asyncio Worker mit Backpressure

FIFO-Queue für Analyse-Jobs — ersetzt FastAPI BackgroundTasks:

app/queue.py:
- asyncio.Queue mit MAX_QUEUE_SIZE=50
- Einzelner Worker-Coroutine (Concurrency=1, DashScope-freundlich)
- MIN_PAUSE_SECONDS=10 zwischen Jobs
- Exponentielles Backoff bei Serien-Fehlern (15s → 5min)
- get_queue_status() für den Status-Endpoint
- QueueFullError → HTTP 429 + Retry-After Header
- start_worker() als FastAPI-Startup-Task
- re_enqueue_pending() markiert Crash-Überlebende als 'stale'

main.py:
- POST /api/analyze-drucksache nutzt queue.enqueue() statt
  background_tasks.add_task()
- Response enthält queue_position
- GET /api/queue/status zeigt pending, max_size, processed,
  estimated_wait_seconds, worker_running
- Worker wird bei app.startup() gestartet

Tests: 201 passed, 5 skipped.

Refs: #95, #44 (Batch baut auf Queue auf)

											
										
										
											2026-04-10 17:24:34 +02:00
+								# ─── Queue-Status (#95) ─────────────────────────────────────────────────────
 								@app.get("/api/queue/status")
 								async def queue_status():
 								    """Report the current queue state: waiting jobs and estimated wait time.
 								    The payload is whatever ``get_queue_status()`` returns, passed through
 								    unchanged.
 								    """
 								    # Function-scope import: the queue module is resolved when the
 								    # endpoint is called, not when this module is imported.
 								    from .queue import get_queue_status as _read_queue_status
 								    snapshot = _read_queue_status()
 								    return snapshot
-												#43 Keycloak SSO: JWT-Middleware + UI-Guiding

Auth-Schicht vorbereitet — Dev-Modus (KEYCLOAK_URL leer) lässt alles
durch, Prod-Modus (ENV gesetzt) validiert JWT gegen Keycloak-JWKS.

Backend (app/auth.py):
- JWKS-Cache mit 1h TTL (async httpx fetch)
- get_current_user: Optional, gibt User-Dict oder None
- require_auth: Pflicht, gibt User-Dict oder HTTP 401
- keycloak_login_url: Baut die OIDC-Login-URL
- _is_auth_enabled: prüft ob alle 3 ENV-Vars gesetzt sind

Abgesicherte POST-Endpoints:
- POST /analyze → Depends(require_auth)
- POST /api/analyze-drucksache → Depends(require_auth)
- POST /api/programme/index → Depends(require_auth)

Neue Endpoints:
- GET /api/auth/me → {authenticated, sub, email, name, roles} oder {authenticated: false}
- GET /api/auth/login-url → {enabled, url} für Keycloak-Redirect

Frontend (index.html):
- initAuth() beim DOMContentLoaded → prüft /api/auth/me
- "Anmelden"-Button im Header (neben "Quellen")
- "Jetzt prüfen"-Button: disabled + Tooltip "Nur nach Anmeldung
  verfügbar" wenn nicht eingeloggt; aktiv wenn eingeloggt
- currentUser-State steuert Button-Zustände

Dev-Modus: Solange KEYCLOAK_URL nicht gesetzt ist (lokale Dev, aktueller
Prod-Stand), sind alle Endpoints offen wie bisher. Kein Breaking Change.

Dependency: python-jose[cryptography]>=3.3.0 in requirements.txt.

Tests: 194/194 grün (auth.py hat keine Seiteneffekte im Import).

Refs: #43

											
										
										
											2026-04-10 14:28:57 +02:00
+								# ─── Auth-Endpoints (#43) ───────────────────────────────────────────────────
 								@app.get("/api/auth/me")
 								async def auth_me(user=Depends(get_current_user)):
 								    """Return the current user's info, or an unauthenticated marker.
 								    The frontend calls this endpoint on load to decide whether the
 								    "Bewerten" action is enabled or greyed out.
 								    """
 								    if not user:
 								        return {"authenticated": False}
 								    # User fields are merged last, so they win on a key clash.
 								    payload = {"authenticated": True}
 								    payload.update(user)
 								    return payload
-												Auth: OIDC Code→Token Exchange Callback + Cookie-basiertes Login

											
										
										
											2026-04-10 21:18:10 +02:00
+								@app.get("/api/auth/callback")
 								async def auth_callback(request: Request, code: str = "", state: str = ""):
 								    """Exchange the OIDC authorization code for an access token.
 								    Keycloak redirects the browser here after login with ``?code=...``.
 								    The code is traded for an access token at the token endpoint and the
 								    token is handed to the browser as an HttpOnly cookie.
 								    Args:
 								        request: Incoming request; its base URL is used to rebuild the
 								            redirect_uri.
 								        code: Authorization code from Keycloak. Empty means there is
 								            nothing to exchange.
 								        state: OIDC state parameter. Accepted but not evaluated here.
 								    Returns:
 								        A redirect to ``/`` when auth is disabled or no code was given,
 								        otherwise an HTML page that sets the cookie and forwards to ``/``.
 								    Raises:
 								        HTTPException: 502 when the token endpoint cannot be reached;
 								            401 when it rejects the code or its answer carries no
 								            usable access token.
 								    """
 								    if not _is_auth_enabled() or not code:
 								        from fastapi.responses import RedirectResponse
 								        return RedirectResponse("/")
 								    import httpx as _httpx
 								    from .auth import _keycloak_issuer
 								    # NOTE(review): `state` is never compared against a value issued when
 								    # the login was started, so this flow has no login-CSRF protection.
 								    token_url = f"{_keycloak_issuer()}/protocol/openid-connect/token"
 								    # Construct the same redirect_uri used for the auth request
 								    base = str(request.base_url).rstrip("/").replace("http://", "https://")
 								    redirect_uri = f"{base}/api/auth/callback"
 								    try:
 								        async with _httpx.AsyncClient(timeout=10) as client:
 								            resp = await client.post(token_url, data={
 								                "grant_type": "authorization_code",
 								                "client_id": settings.keycloak_client_id,
 								                "code": code,
 								                "redirect_uri": redirect_uri,
 								            })
 								    except _httpx.HTTPError as exc:
 								        # Timeouts / connection failures: report a gateway error instead
 								        # of letting the exception bubble up as an unexplained 500.
 								        logger.error("Token exchange request failed: %s", exc)
 								        raise HTTPException(status_code=502, detail="Login fehlgeschlagen") from exc
 								    if resp.status_code != 200:
 								        logger.error("Token exchange failed: %s %s", resp.status_code, resp.text[:200])
 								        raise HTTPException(status_code=401, detail="Login fehlgeschlagen")
 								    try:
 								        tokens = resp.json()
 								    except ValueError:
 								        tokens = None
 								    access_token = tokens.get("access_token") if isinstance(tokens, dict) else None
 								    if not access_token or not isinstance(access_token, str):
 								        # Never set an empty cookie and claim the login succeeded.
 								        logger.error("Token exchange returned no access_token")
 								        raise HTTPException(status_code=401, detail="Login fehlgeschlagen")
 								    try:
 								        expires_in = int(tokens.get("expires_in", 3600))
 								    except (TypeError, ValueError):
 								        expires_in = 3600
 								    # HTML response instead of RedirectResponse: sets the cookie AND
 								    # redirects. Some browsers ignore Set-Cookie on a 307/302 redirect
 								    # (especially behind reverse proxies).
 								    response = HTMLResponse(
 								        """<!DOCTYPE html><html><head>
 								        <meta http-equiv="refresh" content="0;url=/">
 								        </head><body><p>Anmeldung erfolgreich, Weiterleitung...</p></body></html>""",
 								    )
 								    response.set_cookie(
 								        "access_token",
 								        access_token,
 								        max_age=expires_in,
 								        path="/",
 								        secure=True,
 								        httponly=True,
 								        samesite="lax",
 								    )
 								    return response
-												#43 Keycloak SSO: JWT-Middleware + UI-Guiding

Auth-Schicht vorbereitet — Dev-Modus (KEYCLOAK_URL leer) lässt alles
durch, Prod-Modus (ENV gesetzt) validiert JWT gegen Keycloak-JWKS.

Backend (app/auth.py):
- JWKS-Cache mit 1h TTL (async httpx fetch)
- get_current_user: Optional, gibt User-Dict oder None
- require_auth: Pflicht, gibt User-Dict oder HTTP 401
- keycloak_login_url: Baut die OIDC-Login-URL
- _is_auth_enabled: prüft ob alle 3 ENV-Vars gesetzt sind

Abgesicherte POST-Endpoints:
- POST /analyze → Depends(require_auth)
- POST /api/analyze-drucksache → Depends(require_auth)
- POST /api/programme/index → Depends(require_auth)

Neue Endpoints:
- GET /api/auth/me → {authenticated, sub, email, name, roles} oder {authenticated: false}
- GET /api/auth/login-url → {enabled, url} für Keycloak-Redirect

Frontend (index.html):
- initAuth() beim DOMContentLoaded → prüft /api/auth/me
- "Anmelden"-Button im Header (neben "Quellen")
- "Jetzt prüfen"-Button: disabled + Tooltip "Nur nach Anmeldung
  verfügbar" wenn nicht eingeloggt; aktiv wenn eingeloggt
- currentUser-State steuert Button-Zustände

Dev-Modus: Solange KEYCLOAK_URL nicht gesetzt ist (lokale Dev, aktueller
Prod-Stand), sind alle Endpoints offen wie bisher. Kein Breaking Change.

Dependency: python-jose[cryptography]>=3.3.0 in requirements.txt.

Tests: 194/194 grün (auth.py hat keine Seiteneffekte im Import).

Refs: #43

											
										
										
											2026-04-10 14:28:57 +02:00
+								@app.get("/api/auth/login-url")
 								async def auth_login_url(request: Request, redirect: str = "/"):
 								    """Return the Keycloak login URL for the browser redirect.
 								    Answers ``{"enabled": False, "url": ""}`` while auth is disabled.
 								    The ``redirect`` parameter is accepted but not read by this handler.
 								    """
 								    if _is_auth_enabled():
 								        # redirect_uri has to point at the callback endpoint, not at the
 								        # target page: the callback trades the code for a token.
 								        origin = str(request.base_url).rstrip("/")
 								        origin = origin.replace("http://", "https://")
 								        callback_uri = f"{origin}/api/auth/callback"
 								        login_url = keycloak_login_url(callback_uri)
 								        return {"enabled": True, "url": login_url}
 								    return {"enabled": False, "url": ""}
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								# API: Load assessments from database
 								@app.get("/api/assessments")
 								async def list_assessments(bundesland: Optional[str] = None):
 								    """List stored assessments, optionally narrowed to one Bundesland.
 								    A missing parameter and ``bundesland="ALL"`` both mean "no filter".
 								    Each database row is re-keyed into the camelCase shape the frontend
 								    expects.
 								    """
 								    records = await get_all_assessments(bundesland)
 								    # DB column names (snake_case, ASCII) -> frontend keys
 								    return [
 								        {
 								            "drucksache": rec.get("drucksache"),
 								            "title": rec.get("title"),
 								            "fraktionen": rec.get("fraktionen", []),
 								            "datum": rec.get("datum"),
 								            "link": rec.get("link"),
 								            "bundesland": rec.get("bundesland"),
 								            "gwoeScore": rec.get("gwoe_score"),
 								            "gwoeBegründung": rec.get("gwoe_begruendung"),
 								            "gwoeMatrix": rec.get("gwoe_matrix", []),
 								            "gwoeSchwerpunkt": rec.get("gwoe_schwerpunkt", []),
 								            "wahlprogrammScores": rec.get("wahlprogramm_scores", []),
 								            "verbesserungen": rec.get("verbesserungen", []),
 								            "stärken": rec.get("staerken", []),
 								            "schwächen": rec.get("schwaechen", []),
 								            "empfehlung": rec.get("empfehlung"),
 								            "empfehlungSymbol": rec.get("empfehlung_symbol"),
 								            "verbesserungspotenzial": rec.get("verbesserungspotenzial"),
 								            "themen": rec.get("themen", []),
 								            "antragZusammenfassung": rec.get("antrag_zusammenfassung"),
 								            "antragKernpunkte": rec.get("antrag_kernpunkte", []),
 								            "updatedAt": rec.get("updated_at"),
 								            "source": rec.get("source"),
 								            "model": rec.get("model"),
 								        }
 								        for rec in records
 								    ]
 								# API: Get single assessment (use query param for drucksache with /)
 								@app.get("/api/assessment")
 								async def get_single_assessment(drucksache: str):
 								    """Get a single assessment by drucksache ID."""
-												Phase A: Audit-Restbefunde #57.3/4/7 (Roadmap #59)

Drei verbleibende Audit-Befunde aus #57 in einem Patch:

- **#57.3 MEDIUM** Drucksache-Regex-Validation: neue
  app/validators.py mit validate_drucksache() als gemeinsamer
  Validation-Funnel. Pattern ^\d{1,3}/\d{1,7}([-(].{1,20})?$ deckt
  alle 10 aktiven Bundesländer (8/6390, 18/12345, 8/6390(neu),
  23/3700-A) ab und blockt Path-Traversal (../, /etc/passwd) plus
  Standard-Injection (;, <, &). Drei Endpoints durchgeschleust:
  /api/assessment, /api/assessment/pdf, /api/analyze-drucksache.

- **#57.4 MEDIUM** print() → logging.getLogger(__name__): main.py
  und analyzer.py auf strukturiertes Logging umgestellt. LLM-Inhalte
  werden NICHT mehr als Volltext geloggt — neue Helper
  _content_fingerprint() liefert nur "len=N sha1=XXXX", reicht zur
  Forensik ohne Antrag-Inhalte ins Container-Log zu leaken.
  basicConfig() mit ISO-Format setzt strukturiertes Logging früh,
  damit logger.exception() auch beim Boot greift.

- **#57.7 LOW-MED** Search-Query-Limit: validate_search_query() mit
  MAX_SEARCH_QUERY_LEN=200 schützt /api/search und /api/search-landtag
  vor 10-MB-Query-DoS. database._parse_search_query() loggt jetzt
  shlex.ValueError-Fallback statt ihn zu verschlucken (deckt Memory-
  Regel "stille excepts in Adaptern" ab).

Tests: neue tests/test_main_validators.py mit 22 Cases — Drucksache-
Whitelist-Roundtrip + Path-Traversal-Reject, Search-Query Längen-
Edge-Cases. 107 Unit-Tests grün (85 alt + 22 neu).

Validators in eigenem Modul (app/validators.py), damit Tests sie ohne
slowapi-Dependency direkt importieren können.

Refs: #57, #59 (Phase A)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:15:16 +02:00
+								    drucksache = validate_drucksache(drucksache)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    row = await get_assessment(drucksache)
 								    if not row:
 								        raise HTTPException(status_code=404, detail="Assessment nicht gefunden")
 								    return {
 								        "drucksache": row.get("drucksache"),
 								        "title": row.get("title"),
 								        "fraktionen": row.get("fraktionen", []),
 								        "datum": row.get("datum"),
 								        "link": row.get("link"),
-												Bundesland filter & transparency: stringent split + visible source (#8)

Brings the Bundesland-Dropdown from a cosmetic header widget to a real
filter that propagates through every layer (Listing, internal search,
statistics, party/tag filters, upload mode), and at the same time makes
the source parliament visible in every place where assessments from
multiple bundesländer can be mixed.

Backend
- database.get_all_assessments(bundesland=None) — new optional filter,
  "ALL" treated as None.
- database.search_assessments — bug fix: previous `if bundesland:`
  branch incorrectly added a `WHERE bundesland='ALL'` clause; now
  guarded with `bundesland and bundesland != "ALL"`.
- main.list_assessments — accepts ?bundesland= query param, includes the
  bundesland field in the response so the frontend can render badges.
- main.get_single_assessment — also includes bundesland in the response
  so the detail header can show the source parliament.
- main.search_landtag — early HTTP 400 when bundesland is missing or
  "ALL"; the live Landtag adapter cannot serve a synthetic Bundesweit
  request.
- main.index() and main.list_bundeslaender — synthetic "🌍 Bundesweit"
  entry prepended to the bundesländer list (kept out of bundeslaender.py
  on purpose — ALL is not a real state). Both endpoints additionally
  expose a parlament_names map so the frontend can render the source
  parliament without an extra round-trip.

Report (PDF + HTML)
- generate_html_report / generate_pdf_report — new optional bundesland
  parameter. When set, the report header carries the parliament name
  ("Landtag von Sachsen-Anhalt", "Landtag Nordrhein-Westfalen", …)
  beside the title. Three call sites updated: run_analysis,
  run_drucksache_analysis, download_assessment_pdf.

Frontend (templates/index.html)
- Header dropdown gets the synthetic ALL entry as first option;
  initial currentBundesland is now 'ALL' (was 'NRW').
- localStorage persistence: changeBundesland writes, DOMContentLoaded
  reads and validates against the visible options.
- changeBundesland resets the score / party / tag filter state, syncs
  the upload-mode bundesland select, disables the Landtag-Suche button
  + tooltip when ALL, and toggles a data-mode attribute on
  .list-content (used by CSS to show/hide the per-item bundesland
  badge).
- loadAssessments now sends ?bundesland=… so the API does the actual
  filtering. updateStats renders an additional per-bundesland average
  block (Ø NRW: x · Ø LSA: y) when in ALL mode and the loaded list
  spans more than one bundesland.
- renderList prepends a small "bl-badge" beside the Drucksachen-Nummer.
  Hidden in single-bundesland mode via CSS selector to avoid clutter.
- showDetail header now shows the parliament name as its own line
  (.detail-parlament).
- searchLandtag has an early-out alert if currentBundesland === 'ALL',
  saving a network round-trip.
- Upload-Mode bundesland select now starts with a "— Bundesland wählen
  —" placeholder (no auto-default), and startAnalysis validates that a
  concrete bundesland was chosen.

CSS
- .bl-badge plus the .list-content[data-mode="single"] hide rule.
- .detail-parlament for the detail header line.
- .header-parlament for the PDF report header line.

Resolves #8.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:00:39 +02:00
+								        "bundesland": row.get("bundesland"),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								        "gwoeScore": row.get("gwoe_score"),
 								        "gwoeBegründung": row.get("gwoe_begruendung"),
 								        "gwoeMatrix": row.get("gwoe_matrix", []),
 								        "gwoeSchwerpunkt": row.get("gwoe_schwerpunkt", []),
 								        "wahlprogrammScores": row.get("wahlprogramm_scores", []),
 								        "verbesserungen": row.get("verbesserungen", []),
 								        "stärken": row.get("staerken", []),
 								        "schwächen": row.get("schwaechen", []),
 								        "empfehlung": row.get("empfehlung"),
 								        "empfehlungSymbol": row.get("empfehlung_symbol"),
 								        "verbesserungspotenzial": row.get("verbesserungspotenzial"),
 								        "themen": row.get("themen", []),
 								        "antragZusammenfassung": row.get("antrag_zusammenfassung"),
 								        "antragKernpunkte": row.get("antrag_kernpunkte", []),
-												#97 Neu bewerten: manueller Re-Analyse-Button + Bewertungsdatum

Fußzeile unter jedem Assessment-Detail jetzt mit:
- Bewertungsdatum ("Bewertet am DD.MM.YYYY") aus updated_at
- Quelle + Modell (batch-reanalyze / webapp, qwen-plus)
- "Neu bewerten"-Button (Auth-pflichtig, ausgegraut ohne Login)

Flow: Klick → DELETE /api/assessment/delete → POST /api/analyze-drucksache
→ Queue → pollAnalysis → Detail neu laden

Neuer DELETE-Endpoint /api/assessment/delete mit require_auth.

API-Response erweitert um updatedAt, source, model für beide
Endpoints (list + single assessment).

Tests: 206 passed.

Refs: #97

											
										
										
											2026-04-10 21:10:33 +02:00
+								        "updatedAt": row.get("updated_at"),
 								        "source": row.get("source"),
 								        "model": row.get("model"),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    }
-												#97 Neu bewerten: manueller Re-Analyse-Button + Bewertungsdatum

Fußzeile unter jedem Assessment-Detail jetzt mit:
- Bewertungsdatum ("Bewertet am DD.MM.YYYY") aus updated_at
- Quelle + Modell (batch-reanalyze / webapp, qwen-plus)
- "Neu bewerten"-Button (Auth-pflichtig, ausgegraut ohne Login)

Flow: Klick → DELETE /api/assessment/delete → POST /api/analyze-drucksache
→ Queue → pollAnalysis → Detail neu laden

Neuer DELETE-Endpoint /api/assessment/delete mit require_auth.

API-Response erweitert um updatedAt, source, model für beide
Endpoints (list + single assessment).

Tests: 206 passed.

Refs: #97

											
										
										
											2026-04-10 21:10:33 +02:00
+								# API: Delete assessment for re-analysis (#97)
 								@app.delete("/api/assessment/delete")
 								async def delete_assessment_endpoint(
 								    drucksache: str,
 								    user: dict = Depends(require_auth),
 								):
 								    """Delete a stored assessment so that it can be analysed again.
 								
 								    Args:
 								        drucksache: Document number; normalised via validate_drucksache().
 								        user: Value produced by the require_auth dependency. It is not
 								            read in the body; declaring it makes the dependency run.
 								
 								    Returns:
 								        A dict with "status" set to "deleted" and the validated number.
 								
 								    Raises:
 								        HTTPException: 404 when delete_assessment() returns a falsy value.
 								    """
 								    validated = validate_drucksache(drucksache)
 								    if await delete_assessment(validated):
 								        return {"status": "deleted", "drucksache": validated}
 								    raise HTTPException(status_code=404, detail="Assessment nicht gefunden")
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								# API: Generate PDF on demand for an assessment
 								@app.get("/api/assessment/pdf")
 								async def download_assessment_pdf(drucksache: str):
 								    """Return the PDF report for a stored assessment, creating it on demand.
 								
 								    The report file is looked up under settings.reports_dir; it is only
 								    generated when no file with the derived name exists yet.
 								
 								    Args:
 								        drucksache: Document number; normalised via validate_drucksache().
 								
 								    Returns:
 								        A FileResponse serving the PDF as "gwoe-<number>.pdf".
 								
 								    Raises:
 								        HTTPException: 404 when no assessment row is found, 500 when
 								            building the model or rendering the PDF fails.
 								    """
 								    from .models import Assessment
 								    drucksache = validate_drucksache(drucksache)
 								    row = await get_assessment(drucksache)
 								    if not row:
 								        raise HTTPException(status_code=404, detail="Assessment nicht gefunden")
 								    # The "/" of the document number is replaced to get a flat file name.
 								    safe_name = drucksache.replace("/", "-")
 								    pdf_path = settings.reports_dir / f"{safe_name}.pdf"
 								    if not pdf_path.exists():
 								        # Map the DB row onto the keyword arguments of the Assessment model.
 								        # Scalar fields fall back to 0 / "" when the stored value is falsy.
 								        assessment_data = {
 								            "drucksache": row.get("drucksache"),
 								            "title": row.get("title"),
 								            "fraktionen": row.get("fraktionen", []),
 								            "datum": row.get("datum"),
 								            "link": row.get("link"),
 								            "gwoe_score": row.get("gwoe_score") or 0,
 								            "gwoe_begruendung": row.get("gwoe_begruendung") or "",
 								            "gwoe_matrix": row.get("gwoe_matrix", []),
 								            "gwoe_schwerpunkt": row.get("gwoe_schwerpunkt", []),
 								            "wahlprogramm_scores": row.get("wahlprogramm_scores", []),
 								            "verbesserungen": row.get("verbesserungen", []),
 								            "staerken": row.get("staerken", []),
 								            "schwaechen": row.get("schwaechen", []),
 								            "empfehlung": row.get("empfehlung") or "",
 								            "empfehlung_symbol": row.get("empfehlung_symbol") or "",
 								            "verbesserungspotenzial": row.get("verbesserungspotenzial") or "",
 								            "themen": row.get("themen", []),
 								            "antrag_zusammenfassung": row.get("antrag_zusammenfassung") or "",
 								            "antrag_kernpunkte": row.get("antrag_kernpunkte", []),
 								        }
 								        try:
 								            assessment = Assessment(**assessment_data)
 								            await generate_pdf_report(
 								                assessment,
 								                pdf_path,
 								                bundesland=row.get("bundesland"),
 								            )
 								        except Exception as e:
 								            # Keep the traceback in the server log; the client only gets
 								            # the short message in the 500 response below.
 								            logger.exception("PDF generation failed for drucksache=%s", drucksache)
 								            raise HTTPException(
 								                status_code=500,
 								                detail=f"PDF-Generierung fehlgeschlagen: {e}",
 								            ) from e
 								    return FileResponse(
 								        pdf_path,
 								        media_type="application/pdf",
 								        filename=f"gwoe-{safe_name}.pdf"
 								    )
 								# API: Search internal DB only
 								@app.get("/api/search")
 								async def search_internal(
 								    q: str,
 								    bundesland: str = "NRW",
 								    limit: int = 50
 								):
 								    """Search the internal assessments database only.
 								
 								    Args:
 								        q: Search query; checked via validate_search_query().
 								        bundesland: Filter value handed to search_assessments().
 								        limit: Maximum number of rows, handed to search_assessments().
 								
 								    Returns:
 								        A list of dicts, one per database hit, each with status "checked".
 								    """
 								    q = validate_search_query(q)
 								    db_results = await search_assessments(q, bundesland, limit)
 								    return [
 								        {
 								            "drucksache": row.get("drucksache"),
 								            "title": row.get("title"),
 								            "fraktionen": row.get("fraktionen", []),
 								            "datum": row.get("datum"),
 								            "link": row.get("link"),
 								            # Report the hit's own bundesland when the row carries one, so
 								            # a cross-state request (e.g. "ALL") does not label every
 								            # result with the request value. Falls back to the request
 								            # value when the row has no bundesland.
 								            "bundesland": row.get("bundesland") or bundesland,
 								            "gwoeScore": row.get("gwoe_score"),
 								            "themen": row.get("themen", []),
 								            "status": "checked",
 								        }
 								        for row in db_results
 								    ]
 								# API: Search external parliament portal (Landtag)
 								@app.get("/api/search-landtag")
 								async def search_landtag(
 								    q: str,
 								    bundesland: str = "NRW",
 								    limit: int = 20
 								):
 								    """Query the external parliament portal of one Bundesland.
 								
 								    The hits are returned with status "unchecked" and no GWÖ score, so
 								    they can be analysed afterwards ("Jetzt prüfen").
 								
 								    A concrete Bundesland is required: an empty value or the special
 								    "ALL" / Bundesweit mode is rejected with HTTP 400 because no single
 								    Landtag adapter can be selected for it.
 								
 								    Returns:
 								        A list of result dicts, or a dict with an "error" key when no
 								        adapter is available or the adapter search raises.
 								    """
 								    q = validate_search_query(q)
 								    if bundesland == "ALL" or not bundesland:
 								        raise HTTPException(
 								            status_code=400,
 								            detail="Landtag-Suche benötigt ein konkretes Bundesland",
 								        )
 								    adapter = get_adapter(bundesland)
 								    if not adapter:
 								        return {"error": f"Bundesland {bundesland} noch nicht unterstützt"}
 								    try:
 								        hits = await adapter.search(q, limit)
 								        return [
 								            {
 								                "drucksache": hit.drucksache,
 								                "title": hit.title,
 								                "fraktionen": hit.fraktionen,
 								                "datum": hit.datum,
 								                "link": hit.link,
 								                "bundesland": bundesland,
 								                "typ": hit.typ,
 								                "gwoeScore": None,
 								                "status": "unchecked",
 								            }
 								            for hit in hits
 								        ]
 								    except Exception as e:
 								        # Adapter failures are logged with traceback and reported to the
 								        # caller as an error dict instead of an HTTP error status.
 								        logger.exception("Landtag search error for q=%r bundesland=%s", q, bundesland)
 								        return {"error": f"Suchfehler: {str(e)}"}
-												#44 Batch-Analyse: POST /api/batch-analyze

Neuer Endpoint der die neuesten ungeprüften Drucksachen eines BL
automatisch sucht, herunterlädt und in die Queue (#95) einreiht:

POST /api/batch-analyze
  bundesland=NRW  (Pflicht)
  limit=10        (1-100, default 10)

Flow:
1. adapter.search("", limit=limit*3) holt neueste Drucksachen
2. Pro Drucksache: check ob schon bewertet → skip
3. download_text → enqueue(run_drucksache_analysis)
4. Queue verarbeitet seriell mit 10s Pause (DashScope-freundlich)

Response:
{
  "status": "batch_enqueued",
  "enqueued": 7,
  "skipped_existing": 3,
  "jobs": [{"drucksache": "18/...", "title": "...", "queue_position": 1}, ...]
}

Rate-limited auf 3/min. Erfordert Auth (#43).
Bei voller Queue: enqueued nur soweit Platz, kein Error.

Tests: 201 passed.

Refs: #44, #95 (Queue-Basis)

											
										
										
											2026-04-10 17:26:05 +02:00
+								# API: Batch-Analyse (#44) — enqueued ungeprüfte Drucksachen eines BL
 								@app.post("/api/batch-analyze")
 								@limiter.limit("3/minute")
 								async def batch_analyze(
 								    request: Request,
 								    bundesland: str = Form(...),
 								    limit: int = Form(10),
 								    user: dict = Depends(require_auth),
 								):
 								    """Enqueue the newest not-yet-assessed Drucksachen of one Bundesland.
 								    Searches the parliament portal via the Bundesland's adapter, skips
 								    every Drucksache that already has an assessment in the DB, downloads
 								    the text of the rest and puts one analysis job per document on the queue.
 								    Args:
 								        request: Incoming request; not read here (presumably needed by the
 								            slowapi limiter decorator — see slowapi docs).
 								        bundesland: Key passed to get_adapter() to select the parliament.
 								        limit: Maximum number of jobs to enqueue (1-100).
 								        user: Result of the require_auth dependency; not read here.
 								    Returns:
 								        Dict with status "batch_enqueued", the bundesland, the number of
 								        enqueued jobs, the number of skipped (already assessed) documents
 								        and one entry per enqueued job including its queue position.
 								    Raises:
 								        HTTPException: 400 if limit is out of range or no adapter exists
 								            for the given bundesland.
 								    """
 								    from .queue import enqueue, QueueFullError
 								    if limit < 1 or limit > 100:
 								        raise HTTPException(status_code=400, detail="limit muss 1-100 sein")
 								    adapter = get_adapter(bundesland)
 								    if not adapter:
 								        raise HTTPException(status_code=400, detail=f"Bundesland {bundesland} nicht unterstützt")
 								    # Batch jobs always use this model; the endpoint has no model parameter.
 								    model = "qwen-plus"
 								    # Fetch the newest Drucksachen (empty query = newest motions).
 								    # Ask for 3x the limit because of the type filter.
 								    drucksachen = await adapter.search("", limit=limit * 3)
 								    enqueued = []
 								    skipped = 0
 								    for doc in drucksachen:
 								        if len(enqueued) >= limit:
 								            break
 								        # Already assessed? Then count it and move on.
 								        existing = await get_assessment(doc.drucksache)
 								        if existing:
 								            skipped += 1
 								            continue
 								        # Download the text; documents without text are silently passed over.
 								        text = await adapter.download_text(doc.drucksache)
 								        if not text:
 								            continue
 								        # Create the job row first, then try to put it on the queue.
 								        job_id = str(uuid.uuid4())
 								        await create_job(job_id, text[:500], bundesland, model)
 								        try:
 								            position = await enqueue(
 								                job_id,
 								                run_drucksache_analysis,
 								                job_id, doc.drucksache, text, bundesland, model, doc,
 								            )
 								        except QueueFullError:
 								            # The job row already exists but will never run: mark it as
 								            # rejected (same as /api/analyze-drucksache does) instead of
 								            # leaving it in its initial state. A full queue is not an
 								            # error for the batch — stop and report what was enqueued.
 								            await update_job(job_id, status="rejected", error="Queue voll")
 								            break
 								        enqueued.append({
 								            "drucksache": doc.drucksache,
 								            "title": doc.title,
 								            "job_id": job_id,
 								            "queue_position": position,
 								        })
 								    return {
 								        "status": "batch_enqueued",
 								        "bundesland": bundesland,
 								        "enqueued": len(enqueued),
 								        "skipped_existing": skipped,
 								        "jobs": enqueued,
 								    }
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								# API: Analyze a document from parliament portal
 								@app.post("/api/analyze-drucksache")
-												Security hotfixes #1, #2, #6 from audit (#57)

Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape() bevor sie in
  die HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (wurde nie verwendet, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:45:43 +02:00
+								@limiter.limit("10/minute")
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								async def analyze_drucksache(
-												Security hotfixes #1, #2, #6 from audit (#57)

Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape() bevor sie in
  die HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (wurde nie verwendet, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:45:43 +02:00
+								    request: Request,
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    background_tasks: BackgroundTasks,
 								    drucksache: str = Form(...),
 								    bundesland: str = Form("NRW"),
-												Fix SyntaxError: user=Depends nach Form-Params (Python positional-after-default)

											
										
										
											2026-04-10 14:30:54 +02:00
+								    model: str = Form("qwen-plus"),
 								    user: dict = Depends(require_auth),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								):
 								    """
 								    Download a document from parliament portal and analyze it.
 								    """
-												Phase A: Audit-Restbefunde #57.3/4/7 (Roadmap #59)

Drei verbleibende Audit-Befunde aus #57 in einem Patch:

- **#57.3 MEDIUM** Drucksache-Regex-Validation: neue
  app/validators.py mit validate_drucksache() als gemeinsamer
  Validation-Funnel. Pattern ^\d{1,3}/\d{1,7}([-(].{1,20})?$ deckt
  alle 10 aktiven Bundesländer (8/6390, 18/12345, 8/6390(neu),
  23/3700-A) ab und blockt Path-Traversal (../, /etc/passwd) plus
  Standard-Injection (;, <, &). Drei Endpoints durchgeschleust:
  /api/assessment, /api/assessment/pdf, /api/analyze-drucksache.

- **#57.4 MEDIUM** print() → logging.getLogger(__name__): main.py
  und analyzer.py auf strukturiertes Logging umgestellt. LLM-Inhalte
  werden NICHT mehr als Volltext geloggt — neue Helper
  _content_fingerprint() liefert nur "len=N sha1=XXXX", reicht zur
  Forensik ohne Antrag-Inhalte ins Container-Log zu leaken.
  basicConfig() mit ISO-Format setzt strukturiertes Logging früh,
  damit logger.exception() auch beim Boot greift.

- **#57.7 LOW-MED** Search-Query-Limit: validate_search_query() mit
  MAX_SEARCH_QUERY_LEN=200 schützt /api/search und /api/search-landtag
  vor 10-MB-Query-DoS. database._parse_search_query() loggt jetzt
  shlex.ValueError-Fallback statt ihn zu verschlucken (deckt Memory-
  Regel "stille excepts in Adaptern" ab).

Tests: neue tests/test_main_validators.py mit 22 Cases — Drucksache-
Whitelist-Roundtrip + Path-Traversal-Reject, Search-Query Längen-
Edge-Cases. 107 Unit-Tests grün (85 alt + 22 neu).

Validators in eigenem Modul (app/validators.py), damit Tests sie ohne
slowapi-Dependency direkt importieren können.

Refs: #57, #59 (Phase A)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:15:16 +02:00
+								    drucksache = validate_drucksache(drucksache)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    # Check if already analyzed
 								    existing = await get_assessment(drucksache)
 								    if existing:
 								        return {"status": "already_checked", "drucksache": drucksache}
 								    # Get adapter and download
 								    adapter = get_adapter(bundesland)
 								    if not adapter:
 								        raise HTTPException(status_code=400, detail=f"Bundesland {bundesland} nicht unterstützt")
 								    # Download text
 								    text = await adapter.download_text(drucksache)
 								    if not text:
 								        raise HTTPException(status_code=404, detail=f"Dokument {drucksache} nicht gefunden")
 								    # Get document metadata
 								    doc = await adapter.get_document(drucksache)
-												#95 Job-Queue: SQLite-backed asyncio Worker mit Backpressure

FIFO-Queue für Analyse-Jobs — ersetzt FastAPI BackgroundTasks:

app/queue.py:
- asyncio.Queue mit MAX_QUEUE_SIZE=50
- Einzelner Worker-Coroutine (Concurrency=1, DashScope-freundlich)
- MIN_PAUSE_SECONDS=10 zwischen Jobs
- Exponentielles Backoff bei Serien-Fehlern (15s → 5min)
- get_queue_status() für den Status-Endpoint
- QueueFullError → HTTP 429 + Retry-After Header
- start_worker() als FastAPI-Startup-Task
- re_enqueue_pending() markiert Crash-Überlebende als 'stale'

main.py:
- POST /api/analyze-drucksache nutzt queue.enqueue() statt
  background_tasks.add_task()
- Response enthält queue_position
- GET /api/queue/status zeigt pending, max_size, processed,
  estimated_wait_seconds, worker_running
- Worker wird bei app.startup() gestartet

Tests: 201 passed, 5 skipped.

Refs: #95, #44 (Batch baut auf Queue auf)

											
										
										
											2026-04-10 17:24:34 +02:00
+								    # Create job and enqueue (#95)
 								    from .queue import enqueue, QueueFullError
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    job_id = str(uuid.uuid4())
 								    await create_job(job_id, text[:500], bundesland, model)
-												#95 Job-Queue: SQLite-backed asyncio Worker mit Backpressure

FIFO-Queue für Analyse-Jobs — ersetzt FastAPI BackgroundTasks:

app/queue.py:
- asyncio.Queue mit MAX_QUEUE_SIZE=50
- Einzelner Worker-Coroutine (Concurrency=1, DashScope-freundlich)
- MIN_PAUSE_SECONDS=10 zwischen Jobs
- Exponentielles Backoff bei Serien-Fehlern (15s → 5min)
- get_queue_status() für den Status-Endpoint
- QueueFullError → HTTP 429 + Retry-After Header
- start_worker() als FastAPI-Startup-Task
- re_enqueue_pending() markiert Crash-Überlebende als 'stale'

main.py:
- POST /api/analyze-drucksache nutzt queue.enqueue() statt
  background_tasks.add_task()
- Response enthält queue_position
- GET /api/queue/status zeigt pending, max_size, processed,
  estimated_wait_seconds, worker_running
- Worker wird bei app.startup() gestartet

Tests: 201 passed, 5 skipped.

Refs: #95, #44 (Batch baut auf Queue auf)

											
										
										
											2026-04-10 17:24:34 +02:00
 								    try:
 								        position = await enqueue(
 								            job_id,
 								            run_drucksache_analysis,
 								            job_id, drucksache, text, bundesland, model, doc,
 								        )
 								    except QueueFullError:
 								        await update_job(job_id, status="rejected", error="Queue voll")
 								        raise HTTPException(
 								            status_code=429,
 								            detail="Analyse-Queue ist voll. Bitte später erneut versuchen.",
 								            headers={"Retry-After": "60"},
 								        )
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
-												#95 Job-Queue: SQLite-backed asyncio Worker mit Backpressure

FIFO-Queue für Analyse-Jobs — ersetzt FastAPI BackgroundTasks:

app/queue.py:
- asyncio.Queue mit MAX_QUEUE_SIZE=50
- Einzelner Worker-Coroutine (Concurrency=1, DashScope-freundlich)
- MIN_PAUSE_SECONDS=10 zwischen Jobs
- Exponentielles Backoff bei Serien-Fehlern (15s → 5min)
- get_queue_status() für den Status-Endpoint
- QueueFullError → HTTP 429 + Retry-After Header
- start_worker() als FastAPI-Startup-Task
- re_enqueue_pending() markiert Crash-Überlebende als 'stale'

main.py:
- POST /api/analyze-drucksache nutzt queue.enqueue() statt
  background_tasks.add_task()
- Response enthält queue_position
- GET /api/queue/status zeigt pending, max_size, processed,
  estimated_wait_seconds, worker_running
- Worker wird bei app.startup() gestartet

Tests: 201 passed, 5 skipped.

Refs: #95, #44 (Batch baut auf Queue auf)

											
										
										
											2026-04-10 17:24:34 +02:00
+								    return {
 								        "status": "queued",
 								        "job_id": job_id,
 								        "drucksache": drucksache,
 								        "queue_position": position,
 								    }
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
 								async def run_drucksache_analysis(
 								    job_id: str,
 								    drucksache: str,
 								    text: str,
 								    bundesland: str,
 								    model: str,
 								    doc
 								):
 								    """Background task for drucksache analysis."""
 								    try:
 								        await update_job(job_id, status="processing")
 								        # Run LLM analysis
 								        assessment = await analyze_antrag(text, bundesland, model)
 								        # Prepare data for DB
 								        assessment_data = {
 								            "drucksache": drucksache,
-												Fix: NRW-Titel + Regierungsfraktionen-Pflicht im LLM-Prompt

Bug 1 — NRW-Titel "Drucksache XX/YYYYY":
NRW's get_document machte nur HEAD-Request auf die PDF-URL und gab
title="Drucksache 18/18085" zurück — keinen echten Titel. Fix: nutzt
jetzt search(drucksache) um den echten Eintrag von OPAL zu holen.
Fallback: leerer Titel statt generischer, damit der LLM-Titel nicht
überschrieben wird. Plus _pick_best_title Helper: doc.title nur
übernehmen wenn es ein echter Titel ist (nicht "Drucksache XX").

Bug 2 — Nur Antragsteller im Passungsprofil, keine Regierungsfraktionen:
Der LLM ignorierte die "UND Regierungsfraktionen"-Anweisung im Prompt.
Fix: explizite PFLICHT-FRAKTIONEN-Zeile im User-Prompt:
"Du MUSST folgende Fraktionen in wahlprogrammScores bewerten: SPD, CDU, GRÜNE"
(dedupliziert aus fraktionen + regierungsfraktionen).

Tests: 194/194 grün.
Batch-Re-Analyse muss nochmal laufen mit den Fixes (21 bereits fertig,
15 noch offen — werden alle erneut benötigt weil die Titel/Fraktionen
in den neuen Assessments falsch sind).

											
										
										
											2026-04-10 16:05:57 +02:00
+								            # Titel-Priorität: LLM-generierter Titel > doc.title,
 								            # ABER nur wenn doc.title ein echter Titel ist (nicht "Drucksache XX",
 								            # wie NRW's get_document zurückgibt). Sonst überschreibt der
 								            # generische doc.title den echten LLM-Titel.
 								            "title": _pick_best_title(assessment.title, doc.title if doc else None, drucksache),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								            "fraktionen": assessment.fraktionen,
 								            "datum": assessment.datum or (doc.datum if doc else ""),
 								            "link": doc.link if doc else "",
 								            "bundesland": bundesland,
 								            "gwoeScore": assessment.gwoe_score,
 								            "gwoeBegründung": assessment.gwoe_begruendung,
 								            "gwoeMatrix": [m.model_dump() for m in assessment.gwoe_matrix],
 								            "gwoeSchwerpunkt": assessment.gwoe_schwerpunkt,
 								            "wahlprogrammScores": [w.model_dump() for w in assessment.wahlprogramm_scores],
 								            "verbesserungen": [v.model_dump() for v in assessment.verbesserungen],
 								            "stärken": assessment.staerken,
 								            "schwächen": assessment.schwaechen,
 								            "empfehlung": assessment.empfehlung,
 								            "empfehlungSymbol": assessment.empfehlung_symbol,
 								            "verbesserungspotenzial": assessment.verbesserungspotenzial,
 								            "themen": assessment.themen,
 								            "antragZusammenfassung": assessment.antrag_zusammenfassung,
 								            "antragKernpunkte": assessment.antrag_kernpunkte,
 								            "source": "webapp",
 								            "model": model,
 								        }
 								        # Save to DB
 								        await upsert_assessment(assessment_data)
 								        # Generate reports
 								        html_path = settings.reports_dir / f"{job_id}.html"
 								        pdf_path = settings.reports_dir / f"{job_id}.pdf"
-												Bundesland filter & transparency: stringent split + visible source (#8)

Brings the Bundesland-Dropdown from a cosmetic header widget to a real
filter that propagates through every layer (Listing, internal search,
statistics, party/tag filters, upload mode), and at the same time makes
the source parliament visible in every place where assessments from
multiple bundesländer can be mixed.

Backend
- database.get_all_assessments(bundesland=None) — new optional filter,
  "ALL" treated as None.
- database.search_assessments — bug fix: previous `if bundesland:`
  branch incorrectly added a `WHERE bundesland='ALL'` clause; now
  guarded with `bundesland and bundesland != "ALL"`.
- main.list_assessments — accepts ?bundesland= query param, includes the
  bundesland field in the response so the frontend can render badges.
- main.get_single_assessment — also includes bundesland in the response
  so the detail header can show the source parliament.
- main.search_landtag — early HTTP 400 when bundesland is missing or
  "ALL"; the live Landtag adapter cannot serve a synthetic Bundesweit
  request.
- main.index() and main.list_bundeslaender — synthetic "🌍 Bundesweit"
  entry prepended to the bundesländer list (kept out of bundeslaender.py
  on purpose — ALL is not a real state). Both endpoints additionally
  expose a parlament_names map so the frontend can render the source
  parliament without an extra round-trip.

Report (PDF + HTML)
- generate_html_report / generate_pdf_report — new optional bundesland
  parameter. When set, the report header carries the parliament name
  ("Landtag von Sachsen-Anhalt", "Landtag Nordrhein-Westfalen", …)
  beside the title. Three call sites updated: run_analysis,
  run_drucksache_analysis, download_assessment_pdf.

Frontend (templates/index.html)
- Header dropdown gets the synthetic ALL entry as first option;
  initial currentBundesland is now 'ALL' (was 'NRW').
- localStorage persistence: changeBundesland writes, DOMContentLoaded
  reads and validates against the visible options.
- changeBundesland resets the score / party / tag filter state, syncs
  the upload-mode bundesland select, disables the Landtag-Suche button
  + tooltip when ALL, and toggles a data-mode attribute on
  .list-content (used by CSS to show/hide the per-item bundesland
  badge).
- loadAssessments now sends ?bundesland=… so the API does the actual
  filtering. updateStats renders an additional per-bundesland average
  block (Ø NRW: x · Ø LSA: y) when in ALL mode and the loaded list
  spans more than one bundesland.
- renderList prepends a small "bl-badge" beside the Drucksachen-Nummer.
  Hidden in single-bundesland mode via CSS selector to avoid clutter.
- showDetail header now shows the parliament name as its own line
  (.detail-parlament).
- searchLandtag has an early-out alert if currentBundesland === 'ALL',
  saving a network round-trip.
- Upload-Mode bundesland select now starts with a "— Bundesland wählen
  —" placeholder (no auto-default), and startAnalysis validates that a
  concrete bundesland was chosen.

CSS
- .bl-badge plus the .list-content[data-mode="single"] hide rule.
- .detail-parlament for the detail header line.
- .header-parlament for the PDF report header line.

Resolves #8.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:00:39 +02:00
 								        await generate_html_report(assessment, html_path, bundesland=bundesland)
 								        await generate_pdf_report(assessment, pdf_path, bundesland=bundesland)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								        await update_job(
 								            job_id,
 								            status="completed",
 								            result=assessment.model_dump_json(),
 								            html_path=str(html_path),
 								            pdf_path=str(pdf_path),
 								        )
 								    except Exception as e:
-												Phase A: Audit-Restbefunde #57.3/4/7 (Roadmap #59)

Drei verbleibende Audit-Befunde aus #57 in einem Patch:

- **#57.3 MEDIUM** Drucksache-Regex-Validation: neue
  app/validators.py mit validate_drucksache() als gemeinsamer
  Validation-Funnel. Pattern ^\d{1,3}/\d{1,7}([-(].{1,20})?$ deckt
  alle 10 aktiven Bundesländer (8/6390, 18/12345, 8/6390(neu),
  23/3700-A) ab und blockt Path-Traversal (../, /etc/passwd) plus
  Standard-Injection (;, <, &). Drei Endpoints durchgeschleust:
  /api/assessment, /api/assessment/pdf, /api/analyze-drucksache.

- **#57.4 MEDIUM** print() → logging.getLogger(__name__): main.py
  und analyzer.py auf strukturiertes Logging umgestellt. LLM-Inhalte
  werden NICHT mehr als Volltext geloggt — neue Helper
  _content_fingerprint() liefert nur "len=N sha1=XXXX", reicht zur
  Forensik ohne Antrag-Inhalte ins Container-Log zu leaken.
  basicConfig() mit ISO-Format setzt strukturiertes Logging früh,
  damit logger.exception() auch beim Boot greift.

- **#57.7 LOW-MED** Search-Query-Limit: validate_search_query() mit
  MAX_SEARCH_QUERY_LEN=200 schützt /api/search und /api/search-landtag
  vor 10-MB-Query-DoS. database._parse_search_query() loggt jetzt
  shlex.ValueError-Fallback statt ihn zu verschlucken (deckt Memory-
  Regel "stille excepts in Adaptern" ab).

Tests: neue tests/test_main_validators.py mit 22 Cases — Drucksache-
Whitelist-Roundtrip + Path-Traversal-Reject, Search-Query Längen-
Edge-Cases. 107 Unit-Tests grün (85 alt + 22 neu).

Validators in eigenem Modul (app/validators.py), damit Tests sie ohne
slowapi-Dependency direkt importieren können.

Refs: #57, #59 (Phase A)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:15:16 +02:00
+								        # Volltext-Stack via logger.exception, NICHT via print — landet so im
 								        # strukturierten Container-Log und wird vom logging-Framework formatiert
 								        logger.exception("run_drucksache_analysis failed for drucksache=%s", drucksache)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								        await update_job(job_id, status="failed", error=str(e))
 								# API: List available Bundesländer
 								@app.get("/api/bundeslaender")
 								async def list_bundeslaender():
-												Bundesland filter & transparency: stringent split + visible source (#8)

Brings the Bundesland-Dropdown from a cosmetic header widget to a real
filter that propagates through every layer (Listing, internal search,
statistics, party/tag filters, upload mode), and at the same time makes
the source parliament visible in every place where assessments from
multiple bundesländer can be mixed.

Backend
- database.get_all_assessments(bundesland=None) — new optional filter,
  "ALL" treated as None.
- database.search_assessments — bug fix: previous `if bundesland:`
  branch incorrectly added a `WHERE bundesland='ALL'` clause; now
  guarded with `bundesland and bundesland != "ALL"`.
- main.list_assessments — accepts ?bundesland= query param, includes the
  bundesland field in the response so the frontend can render badges.
- main.get_single_assessment — also includes bundesland in the response
  so the detail header can show the source parliament.
- main.search_landtag — early HTTP 400 when bundesland is missing or
  "ALL"; the live Landtag adapter cannot serve a synthetic Bundesweit
  request.
- main.index() and main.list_bundeslaender — synthetic "🌍 Bundesweit"
  entry prepended to the bundesländer list (kept out of bundeslaender.py
  on purpose — ALL is not a real state). Both endpoints additionally
  expose a parlament_names map so the frontend can render the source
  parliament without an extra round-trip.

Report (PDF + HTML)
- generate_html_report / generate_pdf_report — new optional bundesland
  parameter. When set, the report header carries the parliament name
  ("Landtag von Sachsen-Anhalt", "Landtag Nordrhein-Westfalen", …)
  beside the title. Three call sites updated: run_analysis,
  run_drucksache_analysis, download_assessment_pdf.

Frontend (templates/index.html)
- Header dropdown gets the synthetic ALL entry as first option;
  initial currentBundesland is now 'ALL' (was 'NRW').
- localStorage persistence: changeBundesland writes, DOMContentLoaded
  reads and validates against the visible options.
- changeBundesland resets the score / party / tag filter state, syncs
  the upload-mode bundesland select, disables the Landtag-Suche button
  + tooltip when ALL, and toggles a data-mode attribute on
  .list-content (used by CSS to show/hide the per-item bundesland
  badge).
- loadAssessments now sends ?bundesland=… so the API does the actual
  filtering. updateStats renders an additional per-bundesland average
  block (Ø NRW: x · Ø LSA: y) when in ALL mode and the loaded list
  spans more than one bundesland.
- renderList prepends a small "bl-badge" beside the Drucksachen-Nummer.
  Hidden in single-bundesland mode via CSS selector to avoid clutter.
- showDetail header now shows the parliament name as its own line
  (.detail-parlament).
- searchLandtag has an early-out alert if currentBundesland === 'ALL',
  saving a network round-trip.
- Upload-Mode bundesland select now starts with a "— Bundesland wählen
  —" placeholder (no auto-default), and startAnalysis validates that a
  concrete bundesland was chosen.

CSS
- .bl-badge plus the .list-content[data-mode="single"] hide rule.
- .detail-parlament for the detail header line.
- .header-parlament for the PDF report header line.

Resolves #8.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-04-07 23:00:39 +02:00
+								    """List available bundesländer with their status.
 								    Includes the synthetic "ALL" / Bundesweit entry as the first item so
 								    that the frontend can render it directly. ``parlament_name`` is added
 								    so the detail view can show the source parliament without an extra
 								    backend round-trip.
 								    """
 								    out = [{
 								        "code": "ALL",
 								        "name": "🌍 Bundesweit",
 								        "parlament_name": None,
 								        "active": True,
 								    }]
 								    out.extend({
 								        "code": bl.code,
 								        "name": bl.name,
 								        "parlament_name": bl.parlament_name,
 								        "active": bl.aktiv,
 								    } for bl in alle_bundeslaender())
 								    return out
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
 								# === Quellen / Programme ===
-												#96 Methodik-/Transparenz-Seite unter /methodik

Neue Seite für Endnutzer-Transparenz über die Bewertungsmethodik:

- GWÖ-Matrix 2.0 Erklärung mit interaktivem 5×5-Grid
- Analyse-Pipeline als 5-Schritt-Visualisierung (Download → Embedding
  → LLM → Verifikation → Darstellung)
- Wahlprogramm-Vergleich: Erklärung des Retrieval + Top-K + Verifikation
- Qualitätssicherung: Sub-D Property-Tests, server-seitige Quellen-
  Rekonstruktion, automatische Neu-Analyse
- Einschränkungen: KI-Bias, keine juristische Bewertung, nur indexierte
  Programme, kein Abstimmungsverhalten
- Datenquellen: dynamische Tabelle aller angebundenen Parlamente aus
  ADAPTERS + bundeslaender.py
- Technische Details aufklappbar (details/summary) für Interessierte,
  Haupttext verständlich für Nicht-Techniker
- Links zu Quellen-Seite, Adapter-Matrix, ADRs

In Hauptnavigation verlinkt (neben Quellen + Auswertungen).
Template-Variablen: adapter_count, model_name, programme_count,
chunk_count, bundeslaender — alles dynamisch aus dem Backend.

Tests: 194/194 grün.

Refs: #96

											
										
										
											2026-04-10 16:14:38 +02:00
@app.get("/methodik", response_class=HTMLResponse)
async def methodik_page(request: Request):
    """Render the methodology / transparency page (#96).

    Gathers the active Bundesländer, the number of registered parliament
    adapters, the default LLM model name and the programme indexing status,
    and hands them to the ``methodik.html`` template.

    Args:
        request: The incoming request; passed through to the template
            context under the ``request`` key.

    Returns:
        The rendered ``methodik.html`` template response.
    """
    # Function-scope imports, as in the original block (only the names that
    # are actually used are imported).
    from .bundeslaender import aktive_bundeslaender
    from .embeddings import get_indexing_status

    # One template row per active Bundesland, ordered by display name.
    bundeslaender = sorted(
        (
            {"code": bl.code, "name": bl.name, "doku_system": bl.doku_system}
            for bl in aktive_bundeslaender()
        ),
        key=lambda entry: entry["name"],
    )

    # NOTE(review): status is read as a dict with an optional "total" count
    # and an optional "programmes" list of dicts carrying "chunks" — confirm
    # against embeddings.get_indexing_status.
    status = get_indexing_status()
    return templates.TemplateResponse("methodik.html", {
        "request": request,
        "app_name": settings.app_name,
        "adapter_count": len(ADAPTERS),
        "model_name": settings.llm_model_default,
        "programme_count": status.get("total", 0),
        "chunk_count": sum(p.get("chunks", 0) for p in status.get("programmes", [])),
        "bundeslaender": bundeslaender,
    })
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								@app.get("/quellen", response_class=HTMLResponse)
 								async def quellen_page(request: Request):
-												Quellen-Seite: Programme nach Bundesland gruppiert statt einer langen Liste

											
										
										
											2026-04-10 19:10:18 +02:00
+								    """Quellen-Seite mit allen Wahl- und Parteiprogrammen, nach BL gruppiert."""
 								    from .bundeslaender import BUNDESLAENDER
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    programmes = get_programme_info()
 								    status = get_indexing_status()
-												Quellen-Seite: Programme nach Bundesland gruppiert statt einer langen Liste

											
										
										
											2026-04-10 19:10:18 +02:00
 								    # Wahlprogramme nach Bundesland gruppieren
 								    by_bl: dict[str, list] = {}
 								    grundsatz = []
 								    for prog in programmes:
 								        if prog["typ"] == "parteiprogramm":
 								            grundsatz.append(prog)
 								        else:
 								            bl = prog.get("bundesland") or "Sonstige"
 								            bl_name = BUNDESLAENDER[bl].name if bl in BUNDESLAENDER else bl
 								            by_bl.setdefault(bl_name, []).append(prog)
 								    # Sortieren: alphabetisch nach BL-Name
 								    wahlprogramme_grouped = sorted(by_bl.items())
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    return templates.TemplateResponse("quellen.html", {
 								        "request": request,
 								        "app_name": settings.app_name,
 								        "programmes": programmes,
-												Quellen-Seite: Programme nach Bundesland gruppiert statt einer langen Liste

											
										
										
											2026-04-10 19:10:18 +02:00
+								        "wahlprogramme_grouped": wahlprogramme_grouped,
 								        "grundsatzprogramme": grundsatz,
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								        "status": status,
 								    })
-												#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render

Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47

											
										
										
											2026-04-10 01:09:45 +02:00
+								@app.get("/api/wahlprogramm-cite")
-												#47: Auto-Re-Analyse bei nicht-verifizierbaren Zitaten

Statt eine Nachricht "Textstelle nicht auffindbar" zu zeigen (was User
zu Recht als Quatsch bezeichnet hat), erkennt der Cite-Endpoint jetzt
halluzinierte Zitate und triggert automatisch eine Re-Analyse:

Flow:
1. User klickt auf Zitat-Link
2. render_highlighted_page gibt (pdf, page, highlighted=False) zurück
3. Endpoint prüft: ds+bl Parameter vorhanden? Assessment in DB?
4. → Löscht altes Assessment, startet Re-Analyse als Background-Task
5. → Zeigt HTML-Warte-Seite mit Spinner und "Wird neu analysiert..."
6. → Auto-Redirect nach 15s zurück zum Assessment

Das neue Assessment hat durch reconstruct_zitate verifizierte Zitate,
die dann beim nächsten Klick korrekt gehighlighted werden.

Änderungen:
- embeddings.render_highlighted_page: Return-Typ (bytes, int, bool) —
  drittes Element ist True wenn Highlight gesetzt wurde
- database.delete_assessment: neue Funktion für die Re-Analyse
- main.py cite-Endpoint: akzeptiert ds= und bl= als optionale Params,
  triggert Re-Analyse bei highlighted=False + ds vorhanden
- Frontend: makeCiteUrl reicht ds+bl aus dem Assessment-Kontext mit
  durch in die Cite-URL
- Cache-Control auf 1h reduziert (war 24h, zu aggressiv für
  Assessments die sich durch Re-Analyse ändern)

Tests: 194/194 grün.

Refs: #47, #60

											
										
										
											2026-04-10 10:35:01 +02:00
+								async def wahlprogramm_cite(
 								    request: Request,
 								    background_tasks: BackgroundTasks,
 								    pid: str = "", pdf: str = "", seite: int = 1, q: str = "",
 								    ds: str = "", bl: str = "",
 								):
-												#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render

Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47

											
										
										
											2026-04-10 01:09:45 +02:00
+								    """Render eine Wahlprogramm-Seite mit gelb hervorgehobener Zitat-Stelle.
 								    Issue #47: Klick auf eine Zitat-Quelle im Report soll direkt zur
 								    Stelle im Wahlprogramm-PDF springen, mit dem zitierten Snippet
 								    visuell markiert. Statt das ganze PDF auszuliefern (Browser scrollt
 								    auf #page=N und Leser muss von Hand suchen), liefern wir hier ein
 								    1-Seiten-PDF mit ``add_highlight_annot``-Annotation auf den per
 								    ``page.search_for`` gefundenen Bounding-Boxes.
-												#47 Fix: Highlighting retroaktiv für alle bestehenden Assessments

Problem: Alle Assessments in der Prod-DB haben Pre-#47-URLs
(/static/referenzen/X.pdf#page=N). Die _chunk_pdf_url-Änderung wirkt
nur auf NEUE Analysen, die noch nicht stattgefunden haben.

Fix (zwei Seiten):

1. Endpoint /api/wahlprogramm-cite akzeptiert jetzt auch pdf=<filename>
   als Alternative zu pid=<programm_id>. Reverse-Lookup über PROGRAMME-
   Registry: pdf-Filename → programm_id. Damit können die statischen
   URLs aus Pre-#47-Assessments trotzdem an den Cite-Endpoint geleitet
   werden.

2. Frontend: neue JS-Funktion makeCiteUrl(z) die JEDE Zitat-URL on-the-
   fly umschreibt:
   - /static/referenzen/X.pdf#page=N + z.text
     → /api/wahlprogramm-cite?pdf=X.pdf&seite=N&q=<urlencoded text>
   - /api/wahlprogramm-cite?... → durchreichen (schon Cite-URL)
   - Fallback: URL unverändert

   Funktioniert retroaktiv für ALLE ~31 Assessments in der DB, ohne
   Re-Analyse. Sobald ein User auf ein Zitat klickt, wird die Seite
   des Wahlprogramms mit gelber Markierung gerendert.

Tests: 194/194 grün.

Refs: #47

											
										
										
											2026-04-10 09:57:58 +02:00
+								    Akzeptiert ``pid`` (PROGRAMME-Key) ODER ``pdf`` (Dateiname wie
 								    ``spd-grundsatzprogramm.pdf``). Letzterer ermöglicht die retroaktive
 								    Nutzung von Pre-#47-URLs im Frontend, wo nur der statische Pfad
 								    ``/static/referenzen/<pdf>#page=<N>`` gespeichert ist.
-												#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render

Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47

											
										
										
											2026-04-10 01:09:45 +02:00
+								    Security: ``pid`` muss ein registrierter PROGRAMME-Key sein —
 								    verhindert Path-Traversal und arbiträren File-Read aus dem
 								    referenzen-Verzeichnis. ``seite`` wird per Pydantic-Coercion
 								    auf int gezwungen. ``q`` ist auf 200 Zeichen begrenzt im Renderer.
 								    """
-												#47 Fix: Highlighting retroaktiv für alle bestehenden Assessments

Problem: Alle Assessments in der Prod-DB haben Pre-#47-URLs
(/static/referenzen/X.pdf#page=N). Die _chunk_pdf_url-Änderung wirkt
nur auf NEUE Analysen, die noch nicht stattgefunden haben.

Fix (zwei Seiten):

1. Endpoint /api/wahlprogramm-cite akzeptiert jetzt auch pdf=<filename>
   als Alternative zu pid=<programm_id>. Reverse-Lookup über PROGRAMME-
   Registry: pdf-Filename → programm_id. Damit können die statischen
   URLs aus Pre-#47-Assessments trotzdem an den Cite-Endpoint geleitet
   werden.

2. Frontend: neue JS-Funktion makeCiteUrl(z) die JEDE Zitat-URL on-the-
   fly umschreibt:
   - /static/referenzen/X.pdf#page=N + z.text
     → /api/wahlprogramm-cite?pdf=X.pdf&seite=N&q=<urlencoded text>
   - /api/wahlprogramm-cite?... → durchreichen (schon Cite-URL)
   - Fallback: URL unverändert

   Funktioniert retroaktiv für ALLE ~31 Assessments in der DB, ohne
   Re-Analyse. Sobald ein User auf ein Zitat klickt, wird die Seite
   des Wahlprogramms mit gelber Markierung gerendert.

Tests: 194/194 grün.

Refs: #47

											
										
										
											2026-04-10 09:57:58 +02:00
+								    # Reverse-Lookup: pdf-Filename → programm_id, falls nur pdf angegeben.
-												#47 Fix: Highlighting für falsche Seitenzahlen + Year-Suffix-Matching

Zwei Bugs aus User-Test:

1. "Unbekanntes Wahlprogramm" bei Klick auf Grünes Grundsatzprogramm:
   Pre-#60 Assessments haben halluzinierte Dateinamen wie
   "gruene-grundsatzprogramm-2020.pdf" statt "gruene-grundsatzprogramm.pdf".
   Fix: Year-Suffix-Stripping im Reverse-Lookup (X-YYYY.pdf → X.pdf).

2. "Eine Seite, aber kein Highlighting": Pre-#60 Assessments haben oft
   falsche Seitennummern. search_for findet nichts auf der falschen Seite.
   Fix: wenn die angegebene Seite leer ist, ALLE Seiten durchsuchen und
   die erste mit einem Treffer nehmen. So funktioniert Highlighting auch
   bei halluzinierten Seitenzahlen retroaktiv. Performance: ~50ms pro PDF
   (Grundsatzprogramme haben ~100-160 Seiten), akzeptabel für on-demand.

Tests: 194/194 grün.

Refs: #47

											
										
										
											2026-04-10 10:08:02 +02:00
+								    # Zwei Stufen: exakter Match, dann fuzzy (Year-Suffix-Stripping), weil
 								    # Pre-#47 Assessments halluzinierte Dateinamen haben können, z.B.
 								    # "gruene-grundsatzprogramm-2020.pdf" statt "gruene-grundsatzprogramm.pdf".
-												#47 Fix: Highlighting retroaktiv für alle bestehenden Assessments

Problem: Alle Assessments in der Prod-DB haben Pre-#47-URLs
(/static/referenzen/X.pdf#page=N). Die _chunk_pdf_url-Änderung wirkt
nur auf NEUE Analysen, die noch nicht stattgefunden haben.

Fix (zwei Seiten):

1. Endpoint /api/wahlprogramm-cite akzeptiert jetzt auch pdf=<filename>
   als Alternative zu pid=<programm_id>. Reverse-Lookup über PROGRAMME-
   Registry: pdf-Filename → programm_id. Damit können die statischen
   URLs aus Pre-#47-Assessments trotzdem an den Cite-Endpoint geleitet
   werden.

2. Frontend: neue JS-Funktion makeCiteUrl(z) die JEDE Zitat-URL on-the-
   fly umschreibt:
   - /static/referenzen/X.pdf#page=N + z.text
     → /api/wahlprogramm-cite?pdf=X.pdf&seite=N&q=<urlencoded text>
   - /api/wahlprogramm-cite?... → durchreichen (schon Cite-URL)
   - Fallback: URL unverändert

   Funktioniert retroaktiv für ALLE ~31 Assessments in der DB, ohne
   Re-Analyse. Sobald ein User auf ein Zitat klickt, wird die Seite
   des Wahlprogramms mit gelber Markierung gerendert.

Tests: 194/194 grün.

Refs: #47

											
										
										
											2026-04-10 09:57:58 +02:00
+								    if not pid and pdf:
-												#47 Fix: Highlighting für falsche Seitenzahlen + Year-Suffix-Matching

Zwei Bugs aus User-Test:

1. "Unbekanntes Wahlprogramm" bei Klick auf Grünes Grundsatzprogramm:
   Pre-#60 Assessments haben halluzinierte Dateinamen wie
   "gruene-grundsatzprogramm-2020.pdf" statt "gruene-grundsatzprogramm.pdf".
   Fix: Year-Suffix-Stripping im Reverse-Lookup (X-YYYY.pdf → X.pdf).

2. "Eine Seite, aber kein Highlighting": Pre-#60 Assessments haben oft
   falsche Seitennummern. search_for findet nichts auf der falschen Seite.
   Fix: wenn die angegebene Seite leer ist, ALLE Seiten durchsuchen und
   die erste mit einem Treffer nehmen. So funktioniert Highlighting auch
   bei halluzinierten Seitenzahlen retroaktiv. Performance: ~50ms pro PDF
   (Grundsatzprogramme haben ~100-160 Seiten), akzeptabel für on-demand.

Tests: 194/194 grün.

Refs: #47

											
										
										
											2026-04-10 10:08:02 +02:00
+								        # Stage 1: exakt
-												#47 Fix: Highlighting retroaktiv für alle bestehenden Assessments

Problem: Alle Assessments in der Prod-DB haben Pre-#47-URLs
(/static/referenzen/X.pdf#page=N). Die _chunk_pdf_url-Änderung wirkt
nur auf NEUE Analysen, die noch nicht stattgefunden haben.

Fix (zwei Seiten):

1. Endpoint /api/wahlprogramm-cite akzeptiert jetzt auch pdf=<filename>
   als Alternative zu pid=<programm_id>. Reverse-Lookup über PROGRAMME-
   Registry: pdf-Filename → programm_id. Damit können die statischen
   URLs aus Pre-#47-Assessments trotzdem an den Cite-Endpoint geleitet
   werden.

2. Frontend: neue JS-Funktion makeCiteUrl(z) die JEDE Zitat-URL on-the-
   fly umschreibt:
   - /static/referenzen/X.pdf#page=N + z.text
     → /api/wahlprogramm-cite?pdf=X.pdf&seite=N&q=<urlencoded text>
   - /api/wahlprogramm-cite?... → durchreichen (schon Cite-URL)
   - Fallback: URL unverändert

   Funktioniert retroaktiv für ALLE ~31 Assessments in der DB, ohne
   Re-Analyse. Sobald ein User auf ein Zitat klickt, wird die Seite
   des Wahlprogramms mit gelber Markierung gerendert.

Tests: 194/194 grün.

Refs: #47

											
										
										
											2026-04-10 09:57:58 +02:00
+								        for p, info in PROGRAMME.items():
 								            if info.get("pdf") == pdf:
 								                pid = p
 								                break
-												#47 Fix: Highlighting für falsche Seitenzahlen + Year-Suffix-Matching

Zwei Bugs aus User-Test:

1. "Unbekanntes Wahlprogramm" bei Klick auf Grünes Grundsatzprogramm:
   Pre-#60 Assessments haben halluzinierte Dateinamen wie
   "gruene-grundsatzprogramm-2020.pdf" statt "gruene-grundsatzprogramm.pdf".
   Fix: Year-Suffix-Stripping im Reverse-Lookup (X-YYYY.pdf → X.pdf).

2. "Eine Seite, aber kein Highlighting": Pre-#60 Assessments haben oft
   falsche Seitennummern. search_for findet nichts auf der falschen Seite.
   Fix: wenn die angegebene Seite leer ist, ALLE Seiten durchsuchen und
   die erste mit einem Treffer nehmen. So funktioniert Highlighting auch
   bei halluzinierten Seitenzahlen retroaktiv. Performance: ~50ms pro PDF
   (Grundsatzprogramme haben ~100-160 Seiten), akzeptabel für on-demand.

Tests: 194/194 grün.

Refs: #47

											
										
										
											2026-04-10 10:08:02 +02:00
+								        # Stage 2: Year-Suffix stripping (z.B. "X-2020.pdf" → "X.pdf")
 								        if not pid:
 								            import re
 								            stripped = re.sub(r"-\d{4}\.pdf$", ".pdf", pdf)
 								            if stripped != pdf:
 								                for p, info in PROGRAMME.items():
 								                    if info.get("pdf") == stripped:
 								                        pid = p
 								                        break
-												#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render

Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47

											
										
										
											2026-04-10 01:09:45 +02:00
+								    if pid not in PROGRAMME:
 								        raise HTTPException(status_code=404, detail="Unbekanntes Wahlprogramm")
 								    if seite < 1 or seite > 2000:
 								        raise HTTPException(status_code=400, detail="Ungültige Seitennummer")
-												#47: Auto-Re-Analyse bei nicht-verifizierbaren Zitaten

Statt eine Nachricht "Textstelle nicht auffindbar" zu zeigen (was User
zu Recht als Quatsch bezeichnet hat), erkennt der Cite-Endpoint jetzt
halluzinierte Zitate und triggert automatisch eine Re-Analyse:

Flow:
1. User klickt auf Zitat-Link
2. render_highlighted_page gibt (pdf, page, highlighted=False) zurück
3. Endpoint prüft: ds+bl Parameter vorhanden? Assessment in DB?
4. → Löscht altes Assessment, startet Re-Analyse als Background-Task
5. → Zeigt HTML-Warte-Seite mit Spinner und "Wird neu analysiert..."
6. → Auto-Redirect nach 15s zurück zum Assessment

Das neue Assessment hat durch reconstruct_zitate verifizierte Zitate,
die dann beim nächsten Klick korrekt gehighlighted werden.

Änderungen:
- embeddings.render_highlighted_page: Return-Typ (bytes, int, bool) —
  drittes Element ist True wenn Highlight gesetzt wurde
- database.delete_assessment: neue Funktion für die Re-Analyse
- main.py cite-Endpoint: akzeptiert ds= und bl= als optionale Params,
  triggert Re-Analyse bei highlighted=False + ds vorhanden
- Frontend: makeCiteUrl reicht ds+bl aus dem Assessment-Kontext mit
  durch in die Cite-URL
- Cache-Control auf 1h reduziert (war 24h, zu aggressiv für
  Assessments die sich durch Re-Analyse ändern)

Tests: 194/194 grün.

Refs: #47, #60

											
										
										
											2026-04-10 10:35:01 +02:00
+								    pdf_bytes, found_page, highlighted = render_highlighted_page(pid, seite, q)
-												#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render

Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47

											
										
										
											2026-04-10 01:09:45 +02:00
+								    if pdf_bytes is None:
 								        raise HTTPException(
 								            status_code=404,
 								            detail="Wahlprogramm-PDF oder Seite nicht verfügbar",
 								        )
-												#47: Auto-Re-Analyse bei nicht-verifizierbaren Zitaten

Statt eine Nachricht "Textstelle nicht auffindbar" zu zeigen (was User
zurecht als Quatsch bezeichnet hat), erkennt der Cite-Endpoint jetzt
halluzinierte Zitate und triggert automatisch eine Re-Analyse:

Flow:
1. User klickt auf Zitat-Link
2. render_highlighted_page gibt (pdf, page, highlighted=False) zurück
3. Endpoint prüft: ds+bl Parameter vorhanden? Assessment in DB?
4. → Löscht altes Assessment, startet Re-Analyse als Background-Task
5. → Zeigt HTML-Warte-Seite mit Spinner und "Wird neu analysiert..."
6. → Auto-Redirect nach 15s zurück zum Assessment

Das neue Assessment hat durch reconstruct_zitate verifizierte Zitate,
die dann beim nächsten Klick korrekt gehighlighted werden.

Änderungen:
- embeddings.render_highlighted_page: Return-Typ (bytes, int, bool) —
  drittes Element ist True wenn Highlight gesetzt wurde
- database.delete_assessment: neue Funktion für die Re-Analyse
- main.py cite-Endpoint: akzeptiert ds= und bl= als optionale Params,
  triggert Re-Analyse bei highlighted=False + ds vorhanden
- Frontend: makeCiteUrl reicht ds+bl aus dem Assessment-Kontext mit
  durch in die Cite-URL
- Cache-Control auf 1h reduziert (war 24h, zu aggressiv für
  Assessments die sich durch Re-Analyse ändern)

Tests: 194/194 grün.

Refs: #47, #60

											
										
										
											2026-04-10 10:35:01 +02:00
+								    # Issue #47: Wenn das Zitat nicht im PDF auffindbar ist UND wir die
 								    # Drucksache kennen, ist das Assessment wahrscheinlich ein Pre-#60-
 								    # Halluzinations-Opfer. Automatische Re-Analyse triggern und dem
 								    # User eine Warte-Seite zeigen statt ein PDF ohne Highlights.
 								    if not highlighted and q and ds and bl:
 								        existing = await get_assessment(ds)
 								        if existing:
 								            adapter = get_adapter(bl)
 								            if adapter:
 								                # Altes Assessment löschen und neu analysieren
 								                await delete_assessment(ds)
 								                job_id = str(uuid.uuid4())
 								                await create_job(job_id, f"Re-Analyse {ds} (Zitat nicht verifizierbar)", bl, "qwen-plus")
 								                text = await adapter.download_text(ds)
 								                if text:
 								                    doc = await adapter.get_document(ds)
 								                    background_tasks.add_task(
 								                        run_drucksache_analysis,
 								                        job_id, ds, text, bl, "qwen-plus", doc,
 								                    )
 								                    # HTML-Warte-Seite mit Auto-Redirect zurück zum Assessment
 								                    return HTMLResponse(f"""<!DOCTYPE html>
 								<html><head><meta charset="utf-8">
 								<meta http-equiv="refresh" content="15;url=/#assessment={ds}">
 								<title>Wird neu analysiert…</title>
 								<style>body{{font-family:sans-serif;display:flex;justify-content:center;align-items:center;height:100vh;margin:0;background:#f5f5f5}}
 								.box{{text-align:center;padding:2rem;background:#fff;border-radius:8px;box-shadow:0 2px 8px rgba(0,0,0,.1)}}
 								.spinner{{width:40px;height:40px;border:4px solid #ddd;border-top:4px solid #009da5;border-radius:50%;animation:spin 1s linear infinite;margin:1rem auto}}
 								@keyframes spin{{to{{transform:rotate(360deg)}}}}</style></head>
 								<body><div class="box">
 								<div class="spinner"></div>
 								<h2>Zitat nicht verifizierbar</h2>
 								<p>Der Antrag <strong>{ds}</strong> wird mit der aktuellen Pipeline<br>
 								neu analysiert, um verifizierte Zitate zu erzeugen.</p>
 								<p style="color:#666;font-size:0.9rem">Automatische Weiterleitung in 15 Sekunden…</p>
 								</div></body></html>""")
-												#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render

Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47

											
										
										
											2026-04-10 01:09:45 +02:00
+								    info = PROGRAMME[pid]
 								    safe_name = info.get("pdf", f"{pid}.pdf")
 								    return Response(
 								        content=pdf_bytes,
 								        media_type="application/pdf",
 								        headers={
 								            "Content-Disposition": f'inline; filename="{safe_name}"',
-												#47: Auto-Re-Analyse bei nicht-verifizierbaren Zitaten

Statt eine Nachricht "Textstelle nicht auffindbar" zu zeigen (was User
zurecht als Quatsch bezeichnet hat), erkennt der Cite-Endpoint jetzt
halluzinierte Zitate und triggert automatisch eine Re-Analyse:

Flow:
1. User klickt auf Zitat-Link
2. render_highlighted_page gibt (pdf, page, highlighted=False) zurück
3. Endpoint prüft: ds+bl Parameter vorhanden? Assessment in DB?
4. → Löscht altes Assessment, startet Re-Analyse als Background-Task
5. → Zeigt HTML-Warte-Seite mit Spinner und "Wird neu analysiert..."
6. → Auto-Redirect nach 15s zurück zum Assessment

Das neue Assessment hat durch reconstruct_zitate verifizierte Zitate,
die dann beim nächsten Klick korrekt gehighlighted werden.

Änderungen:
- embeddings.render_highlighted_page: Return-Typ (bytes, int, bool) —
  drittes Element ist True wenn Highlight gesetzt wurde
- database.delete_assessment: neue Funktion für die Re-Analyse
- main.py cite-Endpoint: akzeptiert ds= und bl= als optionale Params,
  triggert Re-Analyse bei highlighted=False + ds vorhanden
- Frontend: makeCiteUrl reicht ds+bl aus dem Assessment-Kontext mit
  durch in die Cite-URL
- Cache-Control auf 1h reduziert (war 24h, zu aggressiv für
  Assessments die sich durch Re-Analyse ändern)

Tests: 194/194 grün.

Refs: #47, #60

											
										
										
											2026-04-10 10:35:01 +02:00
+								            "Cache-Control": "public, max-age=3600",
-												#47: Volles PDF mit Highlight statt 1-Seiten-Extract

User-Feedback: "Kontext geht verloren wenn nur 1 Seite kommt".

Änderung: render_highlighted_page liefert jetzt das GESAMTE Wahlprogramm-
PDF mit gelber Highlight-Annotation auf der Fundstelle, statt eines
1-Seiten-Auszugs. Der Browser öffnet das vollständige Programm.

Frontend hängt #page=N an die URL → Browser scrollt direkt zur
Fundstelle. found_page wird als X-Found-Page Header mitgeliefert,
falls der Text auf einer anderen Seite als angefordert gefunden wurde
(Pre-#60 halluzinierte Seitennummern).

Return-Typ geändert: (bytes, int) statt bytes — zweiter Wert ist die
1-indexed Seitennummer wo der Treffer tatsächlich liegt.

Tests angepasst: Tuple-Unpacking, Size-Check entfernt (volles PDF ist
größer als 1-Seiten-Extract, der alte Vergleich war obsolet).

Refs: #47

											
										
										
											2026-04-10 10:16:00 +02:00
+								            "X-Found-Page": str(found_page),
-												#47 PDF Zitat-Highlighting via PyMuPDF Single-Page-Render

Klick auf eine Zitat-Quelle im Report öffnet jetzt eine 1-Seiten-PDF-
Variante des Wahlprogramms mit gelb markiertem Snippet, statt nur zum
Page-Anchor zu springen und den Leser selbst suchen zu lassen.

Implementation:

embeddings.render_highlighted_page(programm_id, seite, query)
- Validiert programm_id gegen PROGRAMME (Path-Traversal-Schutz)
- Lädt das volle Wahlprogramm-PDF, extrahiert via insert_pdf nur die
  angeforderte Seite in einen neuen Document → kleinere Response
- search_for(query[:200]) → Bounding-Boxes aller Treffer
- Fallback: 5-Wort-Anker wenn Volltext-Match leer (LLM-Truncation,
  identisch zu find_chunk_for_text/Sub-D-Logik)
- add_highlight_annot mit gelber stroke-Color (1.0, 0.93, 0.0)
- Returns serialisierte PDF-Bytes oder None

embeddings._chunk_pdf_url
- Wenn chunk["text"] vorhanden: emittiert /api/wahlprogramm-cite-URL
  mit pid=, seite=, q=urlencoded(text[:200])
- Sonst: alter statischer /static/referenzen/X.pdf#page=N (Pre-#47
  rückwärts-kompatibel)
- text wird auf 200 Zeichen abgeschnitten, sonst blasen
  500-Zeichen-Snippets jedes Assessment-JSON auf

main.py /api/wahlprogramm-cite Endpoint
- Validiert pid gegen PROGRAMME registry
- seite: 1 ≤ n ≤ 2000
- Response: application/pdf, Cache-Control max-age=86400
- 404 bei unknown pid oder fehlendem PDF, 400 bei seite out of range

Reconstruct-Pipeline (Issue #60 Option B) zieht das automatisch durch:
reconstruct_zitate ruft _chunk_pdf_url(matched_chunk) auf, der jetzt
bevorzugt die Cite-URL emittiert. Keine Änderung an reconstruct_zitate
selbst nötig.

Tests: 194/194 grün (185 + 9 neue):

- TestChunkPdfUrl: 4 Cases (cite vs static, unknown prog, 200-char-truncate)
- TestRenderHighlightedPage: 5 Cases (unknown pid, invalid seite, valid
  render, empty query, query-not-found-falls-back-zu-leerem-Highlight)
- Plus Bridge im Test-Stub: pymupdf-as-fitz Shim falls eine
  third-party "fitz" das Pkg shadowt (kommt auf älteren Dev-Setups vor)

Refs: #47

											
										
										
											2026-04-10 01:09:45 +02:00
+								        },
 								    )
-												Quellen-Seite: PDF-Thumbnails der ersten Seite + Thumbnail-API-Endpoint

											
										
										
											2026-04-10 18:40:13 +02:00
+								@app.get("/api/programme/thumbnail/{programm_id}")
 								async def programme_thumbnail(programm_id: str):
 								    """Return a PNG thumbnail of the first page of a Wahlprogramm PDF.
 								    The page is scaled proportionally to a width of 200 px and served with
 								    a 24h Cache-Control header (used as preview on the sources page).
 								    Raises:
 								        HTTPException: 404 if ``programm_id`` is not in PROGRAMME or the PDF
 								            file does not exist; 500 if opening or rendering the PDF fails.
 								    """
 								    import fitz
 								    if programm_id not in PROGRAMME:
 								        raise HTTPException(status_code=404)
 								    info = PROGRAMME[programm_id]
 								    pdf_path = static_dir / "referenzen" / info["pdf"]
 								    if not pdf_path.exists():
 								        raise HTTPException(status_code=404)
 								    try:
 								        doc = fitz.open(str(pdf_path))
 								        try:
 								            page = doc[0]
 								            # 200 px wide, height scaled proportionally
 								            zoom = 200 / page.rect.width
 								            mat = fitz.Matrix(zoom, zoom)
 								            pix = page.get_pixmap(matrix=mat)
 								            png_bytes = pix.tobytes("png")
 								        finally:
 								            # Release the document even when rendering fails, so a broken
 								            # PDF does not leak an open handle on every request.
 								            doc.close()
 								    except Exception:
 								        # Keep the 500 contract for the client, but leave a trace in the
 								        # log instead of swallowing the cause silently.
 								        logger.exception("Thumbnail rendering failed for programme %s", programm_id)
 								        raise HTTPException(status_code=500)
 								    return Response(
 								        content=png_bytes,
 								        media_type="image/png",
 								        headers={"Cache-Control": "public, max-age=86400"},
 								    )
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								@app.get("/api/programme")
 								async def list_programme():
 								    """Return whatever get_programme_info() reports for the available programmes."""
 								    programme_info = get_programme_info()
 								    return programme_info
 								@app.get("/api/programme/status")
 								async def programme_status():
 								    """Return the indexing status of all programmes via get_indexing_status()."""
 								    indexing_status = get_indexing_status()
 								    return indexing_status
 								@app.post("/api/programme/index")
-												Security hotfixes #1, #2, #6 from audit (#57)

Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape() bevor sie in
  die HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (war never used, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:45:43 +02:00
+								@limiter.limit("3/minute")
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								async def index_programme(
-												Security hotfixes #1, #2, #6 from audit (#57)

Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape() bevor sie in
  die HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (war never used, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 10:45:43 +02:00
+								    request: Request,
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								    background_tasks: BackgroundTasks,
 								    programm_id: str = Form(None),
 								    all_programmes: bool = Form(False),
-												Fix SyntaxError: user=Depends nach Form-Params (Python positional-after-default)

											
										
										
											2026-04-10 14:30:54 +02:00
+								    user: dict = Depends(require_auth),
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								):
 								    """Index programme(s) for semantic search."""
 								    pdf_dir = static_dir / "referenzen"
 								    if all_programmes:
 								        # Index sequentially to avoid DB locks
 								        async def index_all_sequential():
 								            for prog_id in PROGRAMME.keys():
 								                try:
 								                    index_programm(prog_id, pdf_dir)
-												Phase A: Audit-Restbefunde #57.3/4/7 (Roadmap #59)

Drei verbleibende Audit-Befunde aus #57 in einem Patch:

- **#57.3 MEDIUM** Drucksache-Regex-Validation: neue
  app/validators.py mit validate_drucksache() als gemeinsamer
  Validation-Funnel. Pattern ^\d{1,3}/\d{1,7}([-(].{1,20})?$ deckt
  alle 10 aktiven Bundesländer (8/6390, 18/12345, 8/6390(neu),
  23/3700-A) ab und blockt Path-Traversal (../, /etc/passwd) plus
  Standard-Injection (;, <, &). Drei Endpoints durchgeschleust:
  /api/assessment, /api/assessment/pdf, /api/analyze-drucksache.

- **#57.4 MEDIUM** print() → logging.getLogger(__name__): main.py
  und analyzer.py auf strukturiertes Logging umgestellt. LLM-Inhalte
  werden NICHT mehr als Volltext geloggt — neue Helper
  _content_fingerprint() liefert nur "len=N sha1=XXXX", reicht zur
  Forensik ohne Antrag-Inhalte ins Container-Log zu leaken.
  basicConfig() mit ISO-Format setzt strukturiertes Logging früh,
  damit logger.exception() auch beim Boot greift.

- **#57.7 LOW-MED** Search-Query-Limit: validate_search_query() mit
  MAX_SEARCH_QUERY_LEN=200 schützt /api/search und /api/search-landtag
  vor 10-MB-Query-DoS. database._parse_search_query() loggt jetzt
  shlex.ValueError-Fallback statt ihn zu verschlucken (deckt Memory-
  Regel "stille excepts in Adaptern" ab).

Tests: neue tests/test_main_validators.py mit 22 Cases — Drucksache-
Whitelist-Roundtrip + Path-Traversal-Reject, Search-Query Längen-
Edge-Cases. 107 Unit-Tests grün (85 alt + 22 neu).

Validators in eigenem Modul (app/validators.py), damit Tests sie ohne
slowapi-Dependency direkt importieren können.

Refs: #57, #59 (Phase A)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:15:16 +02:00
+								                except Exception:
 								                    logger.exception("Error indexing programme %s", prog_id)
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								        background_tasks.add_task(index_all_sequential)
 								        return {"status": "indexing", "programmes": list(PROGRAMME.keys())}
 								    if programm_id and programm_id in PROGRAMME:
 								        background_tasks.add_task(index_programm, programm_id, pdf_dir)
 								        return {"status": "indexing", "programm_id": programm_id}
 								    raise HTTPException(status_code=400, detail="Ungültiges Programm")
-												Phase C: Auswertungen-Dashboard #58 + CSV-Export #45 (Roadmap #59)

Drei-dimensionale Aggregations-Sicht über Bundesland × Partei ×
Wahlperiode mit minimalem Frontend.

Backend (`app/auswertungen.py`):

- `aggregate_matrix(filter_wp=None)` — 2D-Matrix Bundesland × Partei mit
  (n, Ø-Score) pro Zelle, optional gefiltert nach Wahlperiode
- `aggregate_zeitreihe(bundesland, partei)` — Score-Verlauf einer
  (BL, Partei)-Kombination über alle bekannten WPs
- `export_long_format()` — Long-Format-CSV-Export für externe Tools
  (deckt #45 vollständig ab)
- Partei-Auflösung läuft strikt durch `normalize_partei()` aus #55 —
  damit wird BB-`FREIE WÄHLER` korrekt als `BVB-FW` aggregiert und
  NICHT mit dem RP-FW zusammengezählt

Wahlperioden-Helper (`app/wahlperioden.py`):

- `wahlperiode_for(datum, bundesland)` mappt ein ISO-Datum + BL auf eine
  Kennung wie `"NRW-WP18"` oder `"MV-WP7"` (Vorgänger-WP). Single Source
  of Truth ist `BUNDESLAENDER[bl].wahlperiode_start`
- `all_wahlperioden()` für UI-Filter-Dropdowns

Endpoints in `app/main.py`:

- `GET /auswertungen` — HTML-Seite (neues Template)
- `GET /api/auswertungen/matrix?wahlperiode=NRW-WP18` — JSON-Matrix
- `GET /api/auswertungen/zeitreihe?bundesland=MV&partei=CDU` — JSON-Verlauf
- `GET /api/auswertungen/export.csv` — CSV-Download

Frontend (`app/templates/auswertungen.html`):

- Statisches Template mit Vanilla-JS, kein Build-Step
- Wahlperioden-Dropdown + Reload-Button + CSV-Export-Button
- Matrix-Tabelle mit Score-Color-Coding (rot ≤ 3, gelb 3-6, grün > 6)
- Sticky-Bundesland-Spalte für horizontales Scrolling

Tests (`tests/test_auswertungen.py`):

- 19 Cases mit in-memory SQLite-Fixture
- Verifiziert WP-Mapping, Matrix-Aggregation, Koalitions-Counting,
  WP-Filter-Korrektheit, BVB-FW-Disambiguierung in der Matrix,
  CSV-Long-Format
- 176 Unit-Tests grün (157 alt + 19 neu)

Refs: #58, #45, #59 (Phase C)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

											
										
										
											2026-04-09 11:25:57 +02:00
+								# ─────────────────────────────────────────────────────────────────────────────
 								# Auswertungen #58 — Bundesland × Partei × Wahlperiode Aggregations-Sicht
 								# ─────────────────────────────────────────────────────────────────────────────
 								@app.get("/auswertungen", response_class=HTMLResponse)
 								async def auswertungen_page(request: Request):
 								    """Render the static evaluations page; it loads the matrix endpoints via fetch()."""
 								    from .wahlperioden import all_wahlperioden
 								    # Template context: app name plus the sorted list of known Wahlperioden.
 								    context = {
 								        "request": request,
 								        "app_name": settings.app_name,
 								        "wahlperioden": sorted(all_wahlperioden()),
 								    }
 								    return templates.TemplateResponse("auswertungen.html", context)
 								@app.get("/api/auswertungen/matrix")
 								async def auswertungen_matrix(wahlperiode: Optional[str] = None):
 								    """Return the Bundesland x Partei matrix (count + mean GWÖ score).
 								    ``wahlperiode`` is passed through to aggregate_matrix as ``filter_wp``.
 								    """
 								    from .auswertungen import aggregate_matrix
 								    matrix = aggregate_matrix(filter_wp=wahlperiode)
 								    return matrix
 								@app.get("/api/auswertungen/zeitreihe")
 								async def auswertungen_zeitreihe(bundesland: str, partei: str):
 								    """Return the score history of one (Bundesland, Partei) pair across all Wahlperioden."""
 								    from .auswertungen import aggregate_zeitreihe
 								    zeitreihe = aggregate_zeitreihe(bundesland, partei)
 								    return zeitreihe
 								@app.get("/api/auswertungen/export.csv")
 								async def auswertungen_export_csv():
 								    """Serve the long-format CSV export of all assessments as a download (covers #45)."""
 								    from .auswertungen import export_long_format
 								    # The attachment disposition makes browsers save the file instead of rendering it.
 								    download_headers = {"Content-Disposition": 'attachment; filename="gwoe-assessments.csv"'}
 								    return Response(
 								        content=export_long_format(),
 								        media_type="text/csv",
 								        headers=download_headers,
 								    )
-												Initial commit: GWÖ-Antragsprüfer v1.0

Features:
- GWÖ-Matrix 2.0 Analyse für NRW-Landtagsanträge
- Verbesserungsvorschläge im Redline-Format (Original/Vorschlag/Begründung)
- Wahlprogramm- und Parteiprogrammtreue-Bewertung
- Landtag-Suche via OPAL-API
- Tag-Wolke mit Multi-Select Filter
- Partei-Filter mit Durchschnittswerten
- PDF-Report-Generierung
- Security Headers (CSP, X-Frame-Options, etc.)
- Persistente SQLite-DB via Docker Volumes

Tech Stack:
- FastAPI + Jinja2
- Qwen LLM via DashScope API
- SQLite + aiosqlite
- WeasyPrint für PDF
- Docker Compose mit Traefik

											
										
										
											2026-03-28 22:30:24 +01:00
+								# Health check
 								@app.get("/health")
 								async def health():
 								    """Health-check endpoint: report status "ok" together with the configured app version."""
 								    payload = {"status": "ok", "version": settings.app_version}
 								    return payload