From 1a82f8294c65db150831d63640538823e8d18f5b Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Fri, 10 Apr 2026 17:05:12 +0200 Subject: [PATCH] =?UTF-8?q?#57=20Security:=20print()=20=E2=86=92=20logger.?= =?UTF-8?q?exception=20f=C3=BCr=20alle=20Module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Befund #4 aus dem Security-Audit (PII/LLM-Content im Container-Log): Die letzten 10 print()-Aufrufe in app/{report,embeddings,parlamente}.py durch strukturiertes Logging (logger.warning/error/exception/info) ersetzt. Betroffen: - report.py: 2× print in _append_original_antrag → logger.exception - embeddings.py: 3× print in index_programm → logger.warning/info/exception - parlamente.py: 5× print in NRWAdapter → logger.error/exception logger.exception statt print+traceback: Stack-Trace wird automatisch angehängt, ohne den LLM-Content oder Antrags-Details als Volltext zu leaken (nur die Drucksache-ID als Kontext-Parameter). Audit-Status nach diesem Commit: alle 7 adressierbaren Befunde aus #57 sind gefixt (1 Rate-Limit, 2/6 XSS/XXE, 3 Path-Traversal, 4 PII-Log, 5 CSRF via Auth, 7 Search-DoS). Befund 8 (Secrets als ENV) ist akzeptiertes Risiko für Single-Server-Docker. Tests: 201 passed, 5 skipped. 
--- app/embeddings.py | 9 ++++++--- app/parlamente.py | 10 +++++----- app/report.py | 7 +++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/app/embeddings.py b/app/embeddings.py index b0b006c..bdcf821 100644 --- a/app/embeddings.py +++ b/app/embeddings.py @@ -1,7 +1,10 @@ """Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings.""" import json +import logging import re + +logger = logging.getLogger(__name__) import sqlite3 import urllib.parse from pathlib import Path @@ -349,7 +352,7 @@ def index_programm(programm_id: str, pdf_dir: Path) -> int: pdf_path = pdf_dir / info["pdf"] if not pdf_path.exists(): - print(f"PDF not found: {pdf_path}") + logger.warning("PDF not found: %s", pdf_path) return 0 conn = sqlite3.connect(EMBEDDINGS_DB) @@ -386,13 +389,13 @@ def index_programm(programm_id: str, pdf_dir: Path) -> int: )) total_chunks += 1 except Exception as e: - print(f"Error embedding chunk: {e}") + logger.exception("Error embedding chunk") continue conn.commit() conn.close() - print(f"Indexed {total_chunks} chunks from {programm_id}") + logger.info("Indexed %d chunks from %s", total_chunks, programm_id) return total_chunks diff --git a/app/parlamente.py b/app/parlamente.py index c9ffae2..c744e6f 100644 --- a/app/parlamente.py +++ b/app/parlamente.py @@ -109,7 +109,7 @@ class NRWAdapter(ParlamentAdapter): # First, get the page to establish session initial = await client.get(self.search_url) if initial.status_code != 200: - print(f"NRW search initial request failed: {initial.status_code}") + logger.error("NRW search initial request failed: %s", initial.status_code) return [] # Parse for webflow token from pagination links @@ -161,7 +161,7 @@ class NRWAdapter(ParlamentAdapter): ) if search_resp.status_code != 200: - print(f"NRW search request failed: {search_resp.status_code}") + logger.error("NRW search request failed: %s", search_resp.status_code) return [] # Parse results @@ -246,11 +246,11 @@ class 
NRWAdapter(ParlamentAdapter): results.append(doc) except Exception as e: - print(f"Error parsing item: {e}") + logger.exception("NRW error parsing item") continue except Exception as e: - print(f"NRW search error: {e}") + logger.exception("NRW search error") return results @@ -312,7 +312,7 @@ class NRWAdapter(ParlamentAdapter): return text except Exception as e: - print(f"Error downloading {drucksache}: {e}") + logger.exception("NRW download error for %s", drucksache) return None diff --git a/app/report.py b/app/report.py index b95c21d..2f99288 100644 --- a/app/report.py +++ b/app/report.py @@ -8,11 +8,14 @@ issue #57 (audit findings #2 and #6). The ``_e`` helper is the single funnel through which all LLM strings must pass on their way into the HTML. """ +import logging import subprocess from html import escape as _e from pathlib import Path from typing import Optional +logger = logging.getLogger(__name__) + from .models import Assessment, MATRIX_LABELS, EMPFEHLUNG_CONFIG from .bundeslaender import BUNDESLAENDER @@ -544,7 +547,7 @@ async def _append_original_antrag( finally: src_doc.close() except Exception as e: - print(f"_append_original_antrag: PDF-Parse-Fehler für {assessment.drucksache}: {e}") + logger.exception("_append_original_antrag: PDF-Parse-Fehler für %s", assessment.drucksache) # PyMuPDF refuses to overwrite the source file in non-incremental # mode — write to a sibling temp file and atomically replace. @@ -559,7 +562,7 @@ async def _append_original_antrag( tmp_path.replace(report_path) except Exception as e: # Hard failure — leave the original report file untouched. - print(f"_append_original_antrag: Konnte Report nicht erweitern für {assessment.drucksache}: {e}") + logger.exception("_append_original_antrag: Konnte Report nicht erweitern für %s", assessment.drucksache) def _insert_divider_page(