#57 Security: print() → logger.exception für alle Module

Befund #4 aus dem Security-Audit (PII/LLM-Content im Container-Log):
Die letzten 10 print()-Aufrufe in app/{report,embeddings,parlamente}.py
wurden durch strukturiertes Logging (logger.warning/error/exception/info) ersetzt.

Betroffen:
- report.py: 2× print in _append_original_antrag → logger.exception
- embeddings.py: 3× print in index_programm → logger.warning/info/exception
- parlamente.py: 5× print in NRWAdapter → logger.error/exception

logger.exception statt print+traceback: Stack-Trace wird automatisch
angehängt, ohne den LLM-Content oder Antrags-Details als Volltext zu
leaken (nur die Drucksache-ID als Kontext-Parameter).

Audit-Status nach diesem Commit: alle 7 adressierbaren Befunde aus #57
sind gefixt (1 Rate-Limit, 2/6 XSS/XXE, 3 Path-Traversal, 4 PII-Log,
5 CSRF via Auth, 7 Search-DoS). Befund 8 (Secrets als ENV) ist
akzeptiertes Risiko für Single-Server-Docker.

Tests: 201 passed, 5 skipped.
This commit is contained in:
Dotty Dotter 2026-04-10 17:05:12 +02:00
parent 0870e8a910
commit 1a82f8294c
3 changed files with 16 additions and 10 deletions

View File

@ -1,7 +1,10 @@
"""Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings.""" """Semantic search for Wahlprogramme and Parteiprogramme using Qwen embeddings."""
import json import json
import logging
import re import re
logger = logging.getLogger(__name__)
import sqlite3 import sqlite3
import urllib.parse import urllib.parse
from pathlib import Path from pathlib import Path
@ -349,7 +352,7 @@ def index_programm(programm_id: str, pdf_dir: Path) -> int:
pdf_path = pdf_dir / info["pdf"] pdf_path = pdf_dir / info["pdf"]
if not pdf_path.exists(): if not pdf_path.exists():
print(f"PDF not found: {pdf_path}") logger.warning("PDF not found: %s", pdf_path)
return 0 return 0
conn = sqlite3.connect(EMBEDDINGS_DB) conn = sqlite3.connect(EMBEDDINGS_DB)
@ -386,13 +389,13 @@ def index_programm(programm_id: str, pdf_dir: Path) -> int:
)) ))
total_chunks += 1 total_chunks += 1
except Exception as e: except Exception as e:
print(f"Error embedding chunk: {e}") logger.exception("Error embedding chunk")
continue continue
conn.commit() conn.commit()
conn.close() conn.close()
print(f"Indexed {total_chunks} chunks from {programm_id}") logger.info("Indexed %d chunks from %s", total_chunks, programm_id)
return total_chunks return total_chunks

View File

@ -109,7 +109,7 @@ class NRWAdapter(ParlamentAdapter):
# First, get the page to establish session # First, get the page to establish session
initial = await client.get(self.search_url) initial = await client.get(self.search_url)
if initial.status_code != 200: if initial.status_code != 200:
print(f"NRW search initial request failed: {initial.status_code}") logger.error("NRW search initial request failed: %s", initial.status_code)
return [] return []
# Parse for webflow token from pagination links # Parse for webflow token from pagination links
@ -161,7 +161,7 @@ class NRWAdapter(ParlamentAdapter):
) )
if search_resp.status_code != 200: if search_resp.status_code != 200:
print(f"NRW search request failed: {search_resp.status_code}") logger.error("NRW search request failed: %s", search_resp.status_code)
return [] return []
# Parse results # Parse results
@ -246,11 +246,11 @@ class NRWAdapter(ParlamentAdapter):
results.append(doc) results.append(doc)
except Exception as e: except Exception as e:
print(f"Error parsing item: {e}") logger.exception("NRW error parsing item")
continue continue
except Exception as e: except Exception as e:
print(f"NRW search error: {e}") logger.exception("NRW search error")
return results return results
@ -312,7 +312,7 @@ class NRWAdapter(ParlamentAdapter):
return text return text
except Exception as e: except Exception as e:
print(f"Error downloading {drucksache}: {e}") logger.exception("NRW download error for %s", drucksache)
return None return None

View File

@ -8,11 +8,14 @@ issue #57 (audit findings #2 and #6). The ``_e`` helper is the single
funnel through which all LLM strings must pass on their way into the HTML. funnel through which all LLM strings must pass on their way into the HTML.
""" """
import logging
import subprocess import subprocess
from html import escape as _e from html import escape as _e
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
logger = logging.getLogger(__name__)
from .models import Assessment, MATRIX_LABELS, EMPFEHLUNG_CONFIG from .models import Assessment, MATRIX_LABELS, EMPFEHLUNG_CONFIG
from .bundeslaender import BUNDESLAENDER from .bundeslaender import BUNDESLAENDER
@ -544,7 +547,7 @@ async def _append_original_antrag(
finally: finally:
src_doc.close() src_doc.close()
except Exception as e: except Exception as e:
print(f"_append_original_antrag: PDF-Parse-Fehler für {assessment.drucksache}: {e}") logger.exception("_append_original_antrag: PDF-Parse-Fehler für %s", assessment.drucksache)
# PyMuPDF refuses to overwrite the source file in non-incremental # PyMuPDF refuses to overwrite the source file in non-incremental
# mode — write to a sibling temp file and atomically replace. # mode — write to a sibling temp file and atomically replace.
@ -559,7 +562,7 @@ async def _append_original_antrag(
tmp_path.replace(report_path) tmp_path.replace(report_path)
except Exception as e: except Exception as e:
# Hard failure — leave the original report file untouched. # Hard failure — leave the original report file untouched.
print(f"_append_original_antrag: Konnte Report nicht erweitern für {assessment.drucksache}: {e}") logger.exception("_append_original_antrag: Konnte Report nicht erweitern für %s", assessment.drucksache)
def _insert_divider_page( def _insert_divider_page(