From 1a82f8294c65db150831d63640538823e8d18f5b Mon Sep 17 00:00:00 2001
From: Dotty Dotter <dotty@Mac.wideopen.space>
Date: Fri, 10 Apr 2026 17:05:12 +0200
Subject: [PATCH] =?UTF-8?q?#57=20Security:=20print()=20=E2=86=92=20logger.?=
 =?UTF-8?q?exception=20f=C3=BCr=20alle=20Module?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Befund #4 aus dem Security-Audit (PII/LLM-Content im Container-Log):
Die letzten 10 print()-Aufrufe in app/{report,embeddings,parlamente}.py
wurden durch strukturiertes Logging ersetzt
(logger.info/warning/error/exception).

Betroffen:
- report.py: 2× print in _append_original_antrag → logger.exception
- embeddings.py: 3× print in index_programm → logger.warning/info/exception
- parlamente.py: 5× print in NRWAdapter → logger.error/exception

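logger.exception statt print(f"...{e}"): Der Stack-Trace wird automatisch
angehängt; die Log-Message selbst enthält weder LLM-Content noch
Antrags-Details als Volltext, sondern nur Kontext-Parameter wie die
Drucksache-ID. Hinweis: Der Exception-Text erscheint weiterhin im
angehängten Traceback.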

Audit-Status nach diesem Commit: alle 7 adressierbaren Befunde aus #57
sind gefixt (1 Rate-Limit, 2/6 XSS/XXE, 3 Path-Traversal, 4 PII-Log,
5 CSRF via Auth, 7 Search-DoS). Befund 8 (Secrets als ENV) ist
akzeptiertes Risiko für Single-Server-Docker.

Tests: 201 passed, 5 skipped.
---
 app/embeddings.py |  9 ++++++---
 app/parlamente.py | 10 +++++-----
 app/report.py     |  7 +++++--
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/app/embeddings.py b/app/embeddings.py
index b0b006c..bdcf821 100644
--- a/app/embeddings.py
+++ b/app/embeddings.py
@@ -1,7 +1,10 @@
 """Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings."""
 
 import json
+import logging
 import re
+
+logger = logging.getLogger(__name__)
 import sqlite3
 import urllib.parse
 from pathlib import Path
@@ -349,7 +352,7 @@ def index_programm(programm_id: str, pdf_dir: Path) -> int:
     pdf_path = pdf_dir / info["pdf"]
     
     if not pdf_path.exists():
-        print(f"PDF not found: {pdf_path}")
+        logger.warning("PDF not found: %s", pdf_path)
         return 0
     
     conn = sqlite3.connect(EMBEDDINGS_DB)
@@ -386,13 +389,13 @@ def index_programm(programm_id: str, pdf_dir: Path) -> int:
                 ))
                 total_chunks += 1
             except Exception as e:
-                print(f"Error embedding chunk: {e}")
+                logger.exception("Error embedding chunk")
                 continue
     
     conn.commit()
     conn.close()
     
-    print(f"Indexed {total_chunks} chunks from {programm_id}")
+    logger.info("Indexed %d chunks from %s", total_chunks, programm_id)
     return total_chunks
 
 
diff --git a/app/parlamente.py b/app/parlamente.py
index c9ffae2..c744e6f 100644
--- a/app/parlamente.py
+++ b/app/parlamente.py
@@ -109,7 +109,7 @@ class NRWAdapter(ParlamentAdapter):
                 # First, get the page to establish session
                 initial = await client.get(self.search_url)
                 if initial.status_code != 200:
-                    print(f"NRW search initial request failed: {initial.status_code}")
+                    logger.error("NRW search initial request failed: %s", initial.status_code)
                     return []
                 
                 # Parse for webflow token from pagination links
@@ -161,7 +161,7 @@ class NRWAdapter(ParlamentAdapter):
                 )
                 
                 if search_resp.status_code != 200:
-                    print(f"NRW search request failed: {search_resp.status_code}")
+                    logger.error("NRW search request failed: %s", search_resp.status_code)
                     return []
                 
                 # Parse results
@@ -246,11 +246,11 @@ class NRWAdapter(ParlamentAdapter):
                             results.append(doc)
                             
                     except Exception as e:
-                        print(f"Error parsing item: {e}")
+                        logger.exception("NRW error parsing item")
                         continue
                 
             except Exception as e:
-                print(f"NRW search error: {e}")
+                logger.exception("NRW search error")
         
         return results
     
@@ -312,7 +312,7 @@ class NRWAdapter(ParlamentAdapter):
                 
                 return text
             except Exception as e:
-                print(f"Error downloading {drucksache}: {e}")
+                logger.exception("NRW download error for %s", drucksache)
                 return None
 
 
diff --git a/app/report.py b/app/report.py
index b95c21d..2f99288 100644
--- a/app/report.py
+++ b/app/report.py
@@ -8,11 +8,14 @@ issue #57 (audit findings #2 and #6). The ``_e`` helper is the single
 funnel through which all LLM strings must pass on their way into the HTML.
 """
 
+import logging
 import subprocess
 from html import escape as _e
 from pathlib import Path
 from typing import Optional
 
+logger = logging.getLogger(__name__)
+
 from .models import Assessment, MATRIX_LABELS, EMPFEHLUNG_CONFIG
 from .bundeslaender import BUNDESLAENDER
 
@@ -544,7 +547,7 @@ async def _append_original_antrag(
                     finally:
                         src_doc.close()
                 except Exception as e:
-                    print(f"_append_original_antrag: PDF-Parse-Fehler für {assessment.drucksache}: {e}")
+                    logger.exception("_append_original_antrag: PDF-Parse-Fehler für %s", assessment.drucksache)
 
             # PyMuPDF refuses to overwrite the source file in non-incremental
             # mode — write to a sibling temp file and atomically replace.
@@ -559,7 +562,7 @@ async def _append_original_antrag(
         tmp_path.replace(report_path)
     except Exception as e:
         # Hard failure — leave the original report file untouched.
-        print(f"_append_original_antrag: Konnte Report nicht erweitern für {assessment.drucksache}: {e}")
+        logger.exception("_append_original_antrag: Konnte Report nicht erweitern für %s", assessment.drucksache)
 
 
 def _insert_divider_page(