From 9c70b463acb241052996db17ec7f20c70bcf8892 Mon Sep 17 00:00:00 2001
From: Dotty Dotter <dotty@Mac.wideopen.space>
Date: Thu, 9 Apr 2026 11:15:16 +0200
Subject: [PATCH] Phase A: Audit-Restbefunde #57.3/4/7 (Roadmap #59)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drei verbleibende Audit-Befunde aus #57 in einem Patch:

- **#57.3 MEDIUM** Drucksache-Regex-Validation: neue
  app/validators.py mit validate_drucksache() als gemeinsamer
  Validation-Funnel. Pattern ^\d{1,3}/\d{1,7}([-(].{1,20})?$ deckt
  alle 10 aktiven Bundesländer (8/6390, 18/12345, 8/6390(neu),
  23/3700-A) ab und blockt Path-Traversal (../, /etc/passwd) plus
  Standard-Injection (;, <, &). Drei Endpoints durchgeschleust:
  /api/assessment, /api/assessment/pdf, /api/analyze-drucksache.

- **#57.4 MEDIUM** print() → logging.getLogger(__name__): main.py
  und analyzer.py auf strukturiertes Logging umgestellt. LLM-Inhalte
  werden NICHT mehr als Volltext geloggt — neue Helper
  _content_fingerprint() liefert nur "len=N sha1=XXXX", reicht zur
  Forensik ohne Antrag-Inhalte ins Container-Log zu leaken.
  basicConfig() mit ISO-Format setzt strukturiertes Logging früh,
  damit logger.exception() auch beim Boot greift.

- **#57.7 LOW-MED** Search-Query-Limit: validate_search_query() mit
  MAX_SEARCH_QUERY_LEN=200 schützt /api/search und /api/search-landtag
  vor 10-MB-Query-DoS. database._parse_search_query() loggt jetzt
  shlex.ValueError-Fallback statt ihn zu verschlucken (deckt Memory-
  Regel "stille excepts in Adaptern" ab).

Tests: neue tests/test_main_validators.py mit 22 Cases — Drucksache-
Whitelist-Roundtrip + Path-Traversal-Reject, Search-Query Längen-
Edge-Cases. 107 Unit-Tests grün (85 alt + 22 neu).

Validators in eigenem Modul (app/validators.py), damit Tests sie ohne
slowapi-Dependency direkt importieren können.

Refs: #57, #59 (Phase A)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 app/analyzer.py               | 33 ++++++++++---
 app/database.py               | 10 +++-
 app/main.py                   | 35 +++++++++++---
 app/validators.py             | 48 ++++++++++++++++++
 tests/test_main_validators.py | 91 +++++++++++++++++++++++++++++++++++
 5 files changed, 202 insertions(+), 15 deletions(-)
 create mode 100644 app/validators.py
 create mode 100644 tests/test_main_validators.py

diff --git a/app/analyzer.py b/app/analyzer.py
index e9279b0..6f0dc67 100644
--- a/app/analyzer.py
+++ b/app/analyzer.py
@@ -1,6 +1,8 @@
 """LLM-based analysis of parliamentary motions against GWÖ matrix."""
 
+import hashlib
 import json
+import logging
 import re
 from pathlib import Path
 
@@ -16,6 +18,19 @@ from .wahlprogramme import (
 )
 from .embeddings import get_relevant_quotes_for_antrag, format_quotes_for_prompt, EMBEDDINGS_DB
 
+logger = logging.getLogger(__name__)
+
+
+def _content_fingerprint(content: str) -> str:
+    """Cheap, log-safe identifier for an LLM response: length + first 8 chars
+    of SHA-1. Lets us correlate retries without ever leaking the LLM's
+    actual output (which may contain sensitive Antrags-Inhalte). Issue
+    #57 Befund #4."""
+    if not content:
+        return "len=0"
+    h = hashlib.sha1(content.encode("utf-8", errors="replace")).hexdigest()[:8]
+    return f"len={len(content)} sha1={h}"
+
 # Load context files
 KONTEXT_DIR = Path(__file__).parent / "kontext"
 
@@ -231,8 +246,8 @@ async def analyze_antrag(text: str, bundesland: str = "NRW", model: str = "qwen-
                 text, fraktionen, bundesland=bundesland, top_k_per_partei=2,
             )
             quotes_context = format_quotes_for_prompt(semantic_quotes)
-        except Exception as e:
-            print(f"Semantic search failed: {e}, falling back to keyword search")
+        except Exception:
+            logger.exception("Semantic search failed, falling back to keyword search")
             quotes = find_relevant_quotes(text, fraktionen, bundesland=bundesland)
             quotes_context = format_quote_for_prompt(quotes)
     else:
@@ -305,11 +320,17 @@ Ausgabe als reines JSON ohne Markdown-Codeblöcke."""
             return Assessment.model_validate(data)
         except json.JSONDecodeError as e:
             last_error = e
-            print(f"JSON parse error on attempt {attempt + 1}/{max_retries}: {e}")
+            logger.warning(
+                "LLM JSON parse error attempt %d/%d (%s) — content %s",
+                attempt + 1, max_retries, e, _content_fingerprint(content),
+            )
             if attempt < max_retries - 1:
-                print(f"Retrying with higher temperature...")
                 continue
             else:
-                # Log the problematic content for debugging
-                print(f"Failed JSON content (first 500 chars): {content[:500]}")
+                # Letzter Fehlversuch — Fingerprint reicht zur Forensik;
+                # Volltext darf nicht ins Log, weil er Antrag-Inhalte enthält
+                logger.error(
+                    "LLM JSON parsing exhausted retries, content %s",
+                    _content_fingerprint(content),
+                )
                 raise
diff --git a/app/database.py b/app/database.py
index ef590c1..6f17556 100644
--- a/app/database.py
+++ b/app/database.py
@@ -274,12 +274,18 @@ def _parse_search_query(query: str) -> tuple[list[str], bool]:
         return ([exact.lower()], True)
     
     # Extract quoted phrases and regular terms
+    import logging
     import shlex
     try:
         parts = shlex.split(query)
-    except ValueError:
+    except ValueError as e:
+        # Unbalanced quote — fall back to whitespace split, but log so we
+        # notice patterns of malformed queries (Issue #57 Befund #7).
+        logging.getLogger(__name__).warning(
+            "shlex.split failed on search query (%s), falling back to whitespace split", e
+        )
         parts = query.split()
-    
+
     return ([p.lower() for p in parts], False)
 
 
diff --git a/app/main.py b/app/main.py
index 123f21a..c8a675e 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,5 +1,6 @@
 """GWÖ-Antragsprüfer — FastAPI Webapp."""
 
+import logging
 import uuid
 from pathlib import Path
 from typing import Optional
@@ -13,6 +14,21 @@ from slowapi import Limiter, _rate_limit_exceeded_handler
 from slowapi.util import get_remote_address
 from slowapi.errors import RateLimitExceeded
 
+from .validators import (
+    MAX_SEARCH_QUERY_LEN,
+    validate_drucksache,
+    validate_search_query,
+)
+
+# Strukturiertes Logging für die ganze App. uvicorn registriert seinen
+# eigenen Root-Handler erst beim Start; wir setzen ein neutrales Format
+# für unsere Module früh, damit logger.exception() auch beim Boot greift.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)-7s %(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
+
 from .config import settings
 from .database import (
     init_db, get_job, create_job, update_job,
@@ -268,6 +284,7 @@ async def list_assessments(bundesland: Optional[str] = None):
 @app.get("/api/assessment")
 async def get_single_assessment(drucksache: str):
     """Get a single assessment by drucksache ID."""
+    drucksache = validate_drucksache(drucksache)
     row = await get_assessment(drucksache)
     if not row:
         raise HTTPException(status_code=404, detail="Assessment nicht gefunden")
@@ -301,7 +318,8 @@ async def get_single_assessment(drucksache: str):
 async def download_assessment_pdf(drucksache: str):
     """Generate and download PDF for an assessment."""
     from .models import Assessment
-    
+
+    drucksache = validate_drucksache(drucksache)
     row = await get_assessment(drucksache)
     if not row:
         raise HTTPException(status_code=404, detail="Assessment nicht gefunden")
@@ -361,6 +379,7 @@ async def search_internal(
     """
     Search internal assessments database only.
     """
+    q = validate_search_query(q)
     db_results = await search_assessments(q, bundesland, limit)
     
     results = []
@@ -394,6 +413,7 @@ async def search_landtag(
     Requires a concrete Bundesland — the special "ALL" / Bundesweit mode
     cannot pick a single Landtag adapter and is rejected with HTTP 400.
     """
+    q = validate_search_query(q)
     if not bundesland or bundesland == "ALL":
         raise HTTPException(
             status_code=400,
@@ -420,7 +440,7 @@ async def search_landtag(
             })
         return results
     except Exception as e:
-        print(f"Landtag search error: {e}")
+        logger.exception("Landtag search error for q=%r bundesland=%s", q, bundesland)
         return {"error": f"Suchfehler: {str(e)}"}
 
 
@@ -437,6 +457,7 @@ async def analyze_drucksache(
     """
     Download a document from parliament portal and analyze it.
     """
+    drucksache = validate_drucksache(drucksache)
     # Check if already analyzed
     existing = await get_assessment(drucksache)
     if existing:
@@ -527,9 +548,9 @@ async def run_drucksache_analysis(
             pdf_path=str(pdf_path),
         )
     except Exception as e:
-        import traceback
-        print(f"ERROR in run_drucksache_analysis for {drucksache}: {e}")
-        print(traceback.format_exc())
+        # Volltext-Stack via logger.exception, NICHT via print — landet so im
+        # strukturierten Container-Log und wird vom logging-Framework formatiert
+        logger.exception("run_drucksache_analysis failed for drucksache=%s", drucksache)
         await update_job(job_id, status="failed", error=str(e))
 
 
@@ -603,8 +624,8 @@ async def index_programme(
             for prog_id in PROGRAMME.keys():
                 try:
                     index_programm(prog_id, pdf_dir)
-                except Exception as e:
-                    print(f"Error indexing {prog_id}: {e}")
+                except Exception:
+                    logger.exception("Error indexing programme %s", prog_id)
         background_tasks.add_task(index_all_sequential)
         return {"status": "indexing", "programmes": list(PROGRAMME.keys())}
     
diff --git a/app/validators.py b/app/validators.py
new file mode 100644
index 0000000..530fbb9
--- /dev/null
+++ b/app/validators.py
@@ -0,0 +1,48 @@
+"""Input-Validators für die FastAPI-Endpoints.
+
+Eigenes Modul, damit die Unit-Tests die Helper direkt importieren können
+ohne den vollen FastAPI-/slowapi-Stack aus app.main mitzuziehen. Issue
+#57 Befunde #3 (Path-Traversal via drucksache) und #7 (Search-Query-DoS).
+"""
+from __future__ import annotations
+
+import re
+
+from fastapi import HTTPException
+
+
+# Drucksache-Format: erlaubt sind alle bisher in den 10 aktiven Bundesländern
+# beobachteten Schreibweisen — z.B. "8/6390", "18/12345", "8/6390(neu)",
+# "23/3700-A". Restriktiv genug, um Path-Traversal (../, /etc/passwd) und
+# trivial-injection (?, ;, <, >, &, =) abzufangen.
+_DRUCKSACHE_RE = re.compile(r"^\d{1,3}/\d{1,7}([-(].{1,20})?$")
+
+
+def validate_drucksache(drucksache: str) -> str:
+    """Lehnt jede Drucksache-ID ab, die nicht dem erwarteten Format
+    entspricht. Gemeinsamer Validation-Funnel für alle Endpoints, die
+    ``drucksache`` als Parameter nehmen. Issue #57 Befund #3.
+    """
+    if not drucksache or not _DRUCKSACHE_RE.match(drucksache):
+        raise HTTPException(
+            status_code=400,
+            detail=f"Ungültige Drucksache-ID: {drucksache!r}",
+        )
+    return drucksache
+
+
+# Längen-Limit für freie Search-Queries — verhindert, dass jemand mit einem
+# 10-MB-Query die SQL-LIKE-Pattern oder den HTTP-Adapter aushungert.
+# Issue #57 Befund #7.
+MAX_SEARCH_QUERY_LEN = 200
+
+
+def validate_search_query(q: str) -> str:
+    if q is None:
+        raise HTTPException(status_code=400, detail="Suchbegriff fehlt")
+    if len(q) > MAX_SEARCH_QUERY_LEN:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Suchbegriff zu lang (max {MAX_SEARCH_QUERY_LEN} Zeichen, war {len(q)})",
+        )
+    return q
diff --git a/tests/test_main_validators.py b/tests/test_main_validators.py
new file mode 100644
index 0000000..2215108
--- /dev/null
+++ b/tests/test_main_validators.py
@@ -0,0 +1,91 @@
+"""Tests für die FastAPI-Validation-Helper aus app.main.
+
+Issue #57 Befund #3 (Path-Traversal via drucksache) und #7 (Query-DoS).
+Wir importieren die Helper direkt — keine TestClient-Roundtrip nötig,
+weil die Funktionen reine Validierung sind und die FastAPI-Integration
+in app.main den richtigen Aufrufpfad bereits vorgibt.
+"""
+from __future__ import annotations
+
+import pytest
+from fastapi import HTTPException
+
+from app.validators import (
+    MAX_SEARCH_QUERY_LEN,
+    validate_drucksache,
+    validate_search_query,
+)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# validate_drucksache (Befund #3 — Path-Traversal-Härtung)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+class TestValidateDrucksache:
+    @pytest.mark.parametrize(
+        "valid",
+        [
+            "8/6390",
+            "18/12345",
+            "23/3700",
+            "20/4309",
+            "8/6390(neu)",
+            "23/3700-A",
+        ],
+    )
+    def test_accepts_known_real_formats(self, valid):
+        assert validate_drucksache(valid) == valid
+
+    @pytest.mark.parametrize(
+        "evil",
+        [
+            "../../etc/passwd",
+            "8/6390/../../../etc/shadow",
+            "8/6390;rm -rf /",
+            "8/6390?bundesland=NRW",
+            "<script>alert(1)</script>",
+            "8/6390 OR 1=1",
+            "",
+            "/",
+            "8//6390",
+            "abc/def",
+            "8/abc",
+        ],
+    )
+    def test_rejects_path_traversal_and_injection(self, evil):
+        with pytest.raises(HTTPException) as exc:
+            validate_drucksache(evil)
+        assert exc.value.status_code == 400
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# validate_search_query (Befund #7 — Query-Length-DoS)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+class TestValidateSearchQuery:
+    def test_short_query_passes(self):
+        assert validate_search_query("Klimaschutz") == "Klimaschutz"
+
+    def test_at_limit_passes(self):
+        q = "x" * MAX_SEARCH_QUERY_LEN
+        assert validate_search_query(q) == q
+
+    def test_over_limit_rejected(self):
+        q = "x" * (MAX_SEARCH_QUERY_LEN + 1)
+        with pytest.raises(HTTPException) as exc:
+            validate_search_query(q)
+        assert exc.value.status_code == 400
+        assert "zu lang" in exc.value.detail.lower()
+
+    def test_none_query_rejected(self):
+        with pytest.raises(HTTPException) as exc:
+            validate_search_query(None)  # type: ignore[arg-type]
+        assert exc.value.status_code == 400
+
+    def test_empty_string_passes(self):
+        # Eine leere Query ist semantisch ok (Frontend nutzt sie für
+        # "alle Drucksachen") — die Längen-Validierung soll nur den
+        # absurd großen Fall blocken, nicht den leeren.
+        assert validate_search_query("") == ""