gwoe-antragspruefer/app/report.py
Dotty Dotter 64cbff5286 Security hotfixes #1, #2, #6 from audit (#57)
Drei akute Befunde aus dem Live-System-Audit (Issue #57):

- **#1 HIGH** — Resource Exhaustion via öffentlichem POST: slowapi
  Limiter (in-memory, IP-key) auf /analyze (10/min), /api/analyze-drucksache
  (10/min) und /api/programme/index (3/min). Verhindert, dass ein
  unauthentifizierter Client mit einer Schleife die DashScope-Quota oder
  die CPU des Containers leerziehen kann. Default-Storage reicht solange
  wir auf einem einzigen Worker laufen.

- **#2 MEDIUM** + **#6 MEDIUM** (selber Root-Cause) — XXE/Local-File-Read
  via WeasyPrint und Stored XSS via Browser-Rendering: alle LLM-getragenen
  Felder in app/report.py laufen jetzt durch html.escape(), bevor sie in
  das HTML-Template interpoliert werden. format_redline_html escape-first
  und ersetzt dann die Markdown-Marker durch von uns kontrollierte
  <span>-Tags. build_matrix_html escaped das aspect-Attribut, sodass ein
  nacktes " den title="..."-Wert nicht mehr beenden und einen Event-
  Handler injizieren kann. Toter jinja2-Import in report.py entfernt
  (wurde nie verwendet, blockierte nur den lokalen Test).

- **Tests** — neue tests/test_report.py mit 8 Cases, die direkt die
  Bug-Klasse verifizieren: <script>, file://-img, "-attribut-breakout
  in Title und ein End-to-End-Render mit XSS-Payloads in jedem LLM-Feld.
  Die Marker-Funktionalität (** und ~~) wird mit-getestet, damit der
  Escape-First-Ansatz das nicht versehentlich kaputt macht.

77 alte Unit-Tests + 8 neue → 85 grün.

Rate-Limit-Verifikation per TestClient ist Integration-Scope und folgt
in tests/integration/test_main_security.py als separates Folge-Item.

Refs: #57

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 10:45:43 +02:00

645 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Report generation for HTML and PDF output.
All LLM-generated fields are HTML-escaped before being interpolated into
the report template. WeasyPrint will happily resolve ``<img src="file://...">``
or ``<link rel=stylesheet href="file://...">`` against the container
filesystem, so unescaped LLM output is a Local-File-Read primitive — see
issue #57 (audit findings #2 and #6). The ``_e`` helper is the single
funnel through which all LLM strings must pass on their way into the HTML.
"""
import subprocess
from html import escape as _e
from pathlib import Path
from typing import Optional
from .models import Assessment, MATRIX_LABELS, EMPFEHLUNG_CONFIG
from .bundeslaender import BUNDESLAENDER
# ECOnGOOD Colors
# Hex colour palette keyed by name. The report template exposes each entry
# as a CSS custom property (--color-<name>), and get_score_color() picks
# score colours from it.
COLORS = {
    "darkgray": "#5a5a5a",
    "green": "#889e33",
    "blue": "#009da5",
    "lightgray": "#bfbfbf",
    "orange": "#F7941D",
    "red": "#d00000",
}
def get_score_color(score: float) -> str:
    """Return the hex colour used to display ``score``.

    Bands are checked from the highest threshold downwards:
    ``>= 7`` blue, ``>= 4`` green, ``>= 2`` yellow (``#FFC20E``),
    ``>= 1`` orange, everything else red.
    """
    if score >= 7:
        color = COLORS["blue"]
    elif score >= 4:
        color = COLORS["green"]
    elif score >= 2:
        # Yellow is the only band colour that is not part of COLORS.
        color = "#FFC20E"
    elif score >= 1:
        color = COLORS["orange"]
    else:
        color = COLORS["red"]
    return color
def get_rating_symbol(rating: int) -> str:
    """Convert a numeric matrix rating to its display symbol.

    Mapping:
        ``>= 2`` -> ``"++"``
        ``1``    -> ``"+"``
        ``0``    -> ``"○"``
        ``-1``   -> ``"-"``
        lower    -> ``"--"``

    Every rating class gets its own non-empty symbol so that neutral and
    negative cells stay distinguishable in the matrix table and in the
    "[symbol]" suffix of the topic list. The neutral symbol matches the
    "○ neutral" entry of the legend printed below the matrix.
    """
    if rating >= 2:
        return "++"
    if rating == 1:
        return "+"
    if rating == 0:
        return "○"
    if rating == -1:
        return "-"
    # Anything below -1 counts as strongly contradicting.
    return "--"
def format_redline_html(text: str) -> str:
    """Render redline markup (``**ins**`` / ``~~del~~``) as HTML spans.

    The input is HTML-escaped before any marker is processed, so markup
    contained in the text (e.g. ``<img src="file:///etc/passwd">``) ends up
    as inert text. ``*`` and ``~`` are left untouched by escaping, which is
    why the marker patterns still match afterwards. The ``<span>`` tags
    produced here are the only raw HTML in the returned string.

    A falsy ``text`` (``None`` or ``""``) yields an empty string.
    """
    import re

    rendered = _e(text or "")
    # Applied in this order: insertions first, then deletions.
    marker_rules = (
        # **text** → green bold (inserted)
        (r'\*\*([^*]+)\*\*', r'<span class="inserted">\1</span>'),
        # ~~text~~ → red strikethrough (deleted)
        (r'~~([^~]+)~~', r'<span class="deleted">\1</span>'),
    )
    for pattern, replacement in marker_rules:
        rendered = re.sub(pattern, replacement, rendered)
    return rendered
def build_matrix_html(assessment: Assessment) -> str:
    """Render the 5x5 matrix (rows A-E, columns 1-5) as an HTML table.

    Entries of ``assessment.gwoe_matrix`` are looked up by their ``field``
    code (row letter + column number, e.g. ``"B3"``). A cell with an entry
    shows the rating symbol, carries a ``positive``/``negative``/``neutral``
    CSS class and exposes the aspect text as tooltip; a cell without an
    entry stays empty. The fragments are joined with newlines.
    """
    stakeholder_rows = (
        ("A", "Lieferant:innen"),
        ("B", "Finanzen"),
        ("C", "Führung/Verwaltung"),
        ("D", "Bürger:innen"),
        ("E", "Gesellschaft/Natur"),
    )
    columns = range(1, 6)
    by_field = {entry.field: entry for entry in assessment.gwoe_matrix}

    parts = ['<table class="matrix-table">', '<thead><tr>', '<th></th>']
    parts.extend(f'<th>{col}</th>' for col in columns)
    parts.append('</tr></thead>')
    parts.append('<tbody>')

    for letter, label in stakeholder_rows:
        parts.append(f'<tr><th>{letter}: {label}</th>')
        for col in columns:
            entry = by_field.get(f"{letter}{col}")
            if not entry:
                parts.append('<td></td>')
                continue
            symbol = get_rating_symbol(entry.rating)
            if entry.rating > 0:
                css_class = "positive"
            elif entry.rating < 0:
                css_class = "negative"
            else:
                css_class = "neutral"
            # The aspect text originates from the LLM and lands inside a
            # title="..." attribute; escaping keeps a stray double-quote
            # from closing the attribute and injecting attributes/handlers.
            parts.append(f'<td class="{css_class}" title="{_e(entry.aspect)}">{symbol}</td>')
        parts.append('</tr>')

    parts.append('</tbody></table>')
    return '\n'.join(parts)
async def generate_html_report(
    assessment: Assessment,
    output_path: Path,
    bundesland: Optional[str] = None,
) -> None:
    """Render ``assessment`` as a standalone HTML document at ``output_path``.

    ``bundesland`` is the optional state code (e.g. ``"NRW"``, ``"LSA"``).
    When set and known in ``BUNDESLAENDER``, the resulting report carries
    the parlament name in its header so the source parliament is always
    visible — important since assessments from multiple bundesländer share
    the same Drucksachen-ID space.

    All text fields of the assessment pass through ``_e`` (``html.escape``)
    before they are interpolated; the redline suggestions go through
    ``format_redline_html``, which escapes as well. Scores are interpolated
    as-is.

    The file is written as UTF-8 explicitly so that the bytes on disk match
    the ``<meta charset="UTF-8">`` declaration regardless of the process
    locale.
    """
    empf_config = EMPFEHLUNG_CONFIG.get(assessment.empfehlung.value, {})
    parlament_name = ""
    if bundesland and bundesland in BUNDESLAENDER:
        parlament_name = BUNDESLAENDER[bundesland].parlament_name
    # NOTE(review): the "Legende" line in the template names no symbol for
    # the two negative ratings — verify it against get_rating_symbol().
    html = f"""<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GWÖ-Antragsprüfung: {_e(assessment.title or "")}</title>
<style>
:root {{
--color-darkgray: {COLORS['darkgray']};
--color-green: {COLORS['green']};
--color-blue: {COLORS['blue']};
--color-lightgray: {COLORS['lightgray']};
--color-orange: {COLORS['orange']};
--color-red: {COLORS['red']};
}}
body {{
font-family: 'Avenir', Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 1.5rem 2rem;
color: var(--color-darkgray);
line-height: 1.5;
font-size: 10pt;
}}
.header {{
text-align: center;
border-bottom: 2px solid var(--color-blue);
padding-bottom: 0.75rem;
margin-bottom: 1.25rem;
}}
.header img {{
max-width: 150px;
}}
.header-label {{
font-size: 8pt;
letter-spacing: 0.5px;
color: var(--color-blue);
margin-bottom: 0.5rem;
}}
.header-parlament {{
font-size: 9pt;
color: var(--color-blue);
font-weight: bold;
margin-top: 0.4rem;
letter-spacing: 0.3px;
}}
h1 {{
color: var(--color-darkgray);
font-size: 14pt;
margin: 0.75rem 0;
line-height: 1.3;
}}
h2 {{
color: var(--color-blue);
font-size: 11pt;
border-bottom: 1px solid var(--color-lightgray);
padding-bottom: 0.3rem;
margin-top: 1.25rem;
margin-bottom: 0.5rem;
}}
h3 {{
color: var(--color-green);
font-size: 10pt;
margin-top: 0.75rem;
margin-bottom: 0.3rem;
}}
.meta-box {{
background: #f5f5f5;
padding: 0.6rem 0.8rem;
border-radius: 3px;
margin-bottom: 0.75rem;
font-size: 9pt;
}}
.empfehlung-box {{
background: {empf_config.get('color', COLORS['blue'])}15;
border: 1px solid {empf_config.get('color', COLORS['blue'])};
padding: 0.5rem 0.75rem;
text-align: center;
border-radius: 3px;
margin: 0.75rem 0;
}}
.empfehlung-box .symbol {{
font-size: 12pt;
color: {empf_config.get('color', COLORS['blue'])};
font-weight: bold;
display: inline;
margin-right: 0.5rem;
}}
.empfehlung-box .text {{
font-size: 10pt;
display: inline;
}}
.score-bar {{
background: var(--color-lightgray);
height: 12px;
border-radius: 6px;
overflow: hidden;
margin: 0.3rem 0;
}}
.score-bar-fill {{
height: 100%;
}}
.matrix-table {{
width: 100%;
border-collapse: collapse;
margin: 0.5rem 0;
font-size: 8pt;
}}
.matrix-table th, .matrix-table td {{
border: 1px solid var(--color-lightgray);
padding: 0.25rem 0.4rem;
text-align: center;
}}
.matrix-table thead th {{
background: var(--color-blue);
color: white;
font-size: 8pt;
font-weight: normal;
}}
.matrix-table tbody th {{
background: #f5f5f5;
text-align: left;
font-weight: normal;
font-size: 8pt;
}}
.matrix-table .positive {{
background: var(--color-green);
color: white;
font-weight: bold;
}}
.matrix-table .negative {{
background: var(--color-red);
color: white;
font-weight: bold;
}}
.matrix-table .neutral {{
background: #f0f0f0;
}}
.verbesserung {{
margin: 0.5rem 0;
padding: 0.5rem;
border: 1px solid var(--color-lightgray);
border-radius: 3px;
font-size: 9pt;
}}
.verbesserung .original {{
background: #f9f9f9;
padding: 0.4rem;
margin-bottom: 0.3rem;
}}
.verbesserung .vorschlag {{
background: rgba(136, 158, 51, 0.1);
border-left: 2px solid var(--color-green);
padding: 0.4rem;
}}
.inserted {{
color: var(--color-green);
font-weight: bold;
}}
.deleted {{
color: var(--color-red);
text-decoration: line-through;
}}
.two-columns {{
display: grid;
grid-template-columns: 1fr 1fr;
gap: 0.75rem;
}}
.staerken {{
border-left: 2px solid var(--color-green);
padding-left: 0.5rem;
}}
.schwaechen {{
border-left: 2px solid var(--color-orange);
padding-left: 0.5rem;
}}
ul {{
margin: 0.3rem 0;
padding-left: 1.2rem;
}}
li {{
margin-bottom: 0.2rem;
}}
p {{
margin: 0.4rem 0;
}}
.footer {{
margin-top: 1.5rem;
padding-top: 0.5rem;
border-top: 1px solid var(--color-lightgray);
text-align: center;
color: var(--color-lightgray);
font-size: 7pt;
}}
@media print {{
body {{ max-width: none; }}
}}
</style>
</head>
<body>
<div class="header">
<div class="header-label">GEMEINWOHL-ÖKONOMIE | ANTRAGSBEWERTUNG</div>
<h1>{_e(assessment.title or "")}</h1>
{f'<div class="header-parlament">{_e(parlament_name)}</div>' if parlament_name else ''}
</div>
<div class="meta-box">
<strong>Drucksache:</strong> {_e(assessment.drucksache or "")} &nbsp;|&nbsp;
<strong>Datum:</strong> {_e(assessment.datum or "")} &nbsp;|&nbsp;
<strong>Fraktion(en):</strong> {_e(', '.join(assessment.fraktionen))} &nbsp;|&nbsp;
<strong>GWÖ-Score:</strong> <span style="color: {get_score_color(assessment.gwoe_score)}; font-weight: bold;">{assessment.gwoe_score}/10</span>
</div>
<div class="empfehlung-box">
<span class="symbol">{_e(empf_config.get('symbol', '[?]'))}</span>
<span class="text"><strong>Empfehlung:</strong> {_e(assessment.empfehlung.value)}</span>
</div>
<h2>Der Antrag im Überblick</h2>
<p>{_e(assessment.antrag_zusammenfassung or 'Keine Zusammenfassung verfügbar.')}</p>
{('<ul>' + ''.join(f'<li>{_e(k)}</li>' for k in assessment.antrag_kernpunkte) + '</ul>') if assessment.antrag_kernpunkte else ''}
<h2>GWÖ-Treue</h2>
<p style="font-size: 9pt;"><strong>Score:</strong> <span style="color: {get_score_color(assessment.gwoe_score)};">{assessment.gwoe_score}/10</span></p>
<div class="score-bar">
<div class="score-bar-fill" style="width: {assessment.gwoe_score * 10}%; background: {get_score_color(assessment.gwoe_score)};"></div>
</div>
<p><strong>Begründung:</strong> {_e(assessment.gwoe_begruendung or "")}</p>
<p><strong>Schwerpunkte:</strong> {_e(', '.join(assessment.gwoe_schwerpunkt))}</p>
<h2>Matrix-Zuordnung (Matrix 2.0 für Gemeinden)</h2>
{build_matrix_html(assessment)}
<p style="font-size: 7pt; color: #999;">
<strong>Legende:</strong> ++ stark fördernd, + fördernd, ○ neutral, widersprechend, stark widersprechend
</p>
<h3>Berührte Themenfelder</h3>
<ul>
{''.join(f'<li><strong>{_e(e.field)}:</strong> {_e(e.aspect)} [{get_rating_symbol(e.rating)}]</li>' for e in assessment.gwoe_matrix)}
</ul>
<h2>Programmtreue</h2>
{''.join(f'''
<h3>{_e(s.fraktion)} {' (Antragsteller)' if s.ist_antragsteller else ''}{' (Regierung)' if s.ist_regierung else ''}</h3>
<p><strong>Wahlprogramm:</strong> {s.wahlprogramm.score}/10 — {_e(s.wahlprogramm.begruendung or "")}</p>
<p><strong>Parteiprogramm:</strong> {s.parteiprogramm.score}/10 — {_e(s.parteiprogramm.begruendung or "")}</p>
''' for s in assessment.wahlprogramm_scores)}
<h2>Verbesserungsvorschläge</h2>
{''.join(f'''
<div class="verbesserung">
<div class="original"><strong>Original:</strong><br>{_e(v.original or "")}</div>
<div class="vorschlag"><strong>Vorschlag:</strong><br>{format_redline_html(v.vorschlag)}</div>
<div style="font-style: italic; margin-top: 0.5rem;">{_e(v.begruendung or "")}</div>
</div>
''' for v in assessment.verbesserungen) or '<p>Keine Verbesserungsvorschläge.</p>'}
<h2>Zusammenfassung</h2>
<div class="two-columns">
<div class="staerken">
<h3 style="color: var(--color-green);">Stärken</h3>
<ul>
{''.join(f'<li>{_e(s)}</li>' for s in assessment.staerken) or '<li>(keine)</li>'}
</ul>
</div>
<div class="schwaechen">
<h3 style="color: var(--color-orange);">Schwächen</h3>
<ul>
{''.join(f'<li>{_e(s)}</li>' for s in assessment.schwaechen) or '<li>(keine)</li>'}
</ul>
</div>
</div>
<div class="footer">
<p>Erstellt mit GWÖ-Antragsprüfer v4.1 | Matrix 2.0 für Gemeinden</p>
<p style="color: var(--color-blue);">germany.econgood.org</p>
</div>
</body>
</html>"""
    # Explicit encoding: the document declares UTF-8 and contains non-ASCII
    # text, so it must not depend on the locale's default encoding.
    output_path.write_text(html, encoding="utf-8")
async def generate_pdf_report(
    assessment: Assessment,
    output_path: Path,
    bundesland: Optional[str] = None,
) -> None:
    """Write the report as PDF to ``output_path`` and append the source Antrag.

    The work happens in two stages:

    1. The HTML report is rendered into a temporary sibling file
       (``<output>.tmp.html``) and converted to PDF with WeasyPrint. The
       temporary file is removed afterwards, also when the conversion
       raises.
    2. ``_append_original_antrag`` then tries to attach the original
       document referenced by ``assessment.link`` (issue #9). That step is
       best-effort and handles its own failures.

    ``bundesland`` is passed through to ``generate_html_report`` so the
    parlament name shows up in the report header.
    """
    # Stage 1 — HTML → PDF
    intermediate_html = output_path.with_suffix('.tmp.html')
    await generate_html_report(assessment, intermediate_html, bundesland=bundesland)
    try:
        # Imported inside the try block so the temporary HTML file is
        # cleaned up even if WeasyPrint cannot be imported.
        from weasyprint import HTML

        renderer = HTML(filename=str(intermediate_html))
        renderer.write_pdf(str(output_path))
    finally:
        intermediate_html.unlink(missing_ok=True)

    # Stage 2 — attach the original Antrag (best-effort)
    await _append_original_antrag(assessment, output_path)
async def _append_original_antrag(
assessment: Assessment,
report_path: Path,
) -> None:
"""Try to download the original Antrag PDF and append it to ``report_path``.
Failure modes (download error, non-PDF content, parse error) are
handled gracefully: a single placeholder page is appended noting the
issue, so the user always sees that an attempt was made.
"""
import fitz # PyMuPDF
import httpx
link = (assessment.link or "").strip()
if not link or not link.startswith(("http://", "https://")):
# Manual upload / pasted text — nothing to append.
return
download_error: Optional[str] = None
pdf_bytes: Optional[bytes] = None
try:
async with httpx.AsyncClient(
timeout=30,
follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
) as client:
resp = await client.get(link)
if resp.status_code != 200:
download_error = f"HTTP {resp.status_code}"
elif not resp.content[:5].startswith(b"%PDF-"):
download_error = f"kein PDF (Content-Type: {resp.headers.get('content-type', 'unknown')})"
else:
pdf_bytes = resp.content
except Exception as e:
download_error = f"Download-Fehler: {e}"
try:
report_doc = fitz.open(report_path)
try:
# Always insert a divider page so the user sees what comes next
_insert_divider_page(report_doc, assessment, download_error)
if pdf_bytes is not None:
try:
src_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
try:
report_doc.insert_pdf(src_doc)
finally:
src_doc.close()
except Exception as e:
print(f"_append_original_antrag: PDF-Parse-Fehler für {assessment.drucksache}: {e}")
# PyMuPDF refuses to overwrite the source file in non-incremental
# mode — write to a sibling temp file and atomically replace.
tmp_path = report_path.with_suffix(report_path.suffix + ".tmp")
report_doc.save(
str(tmp_path),
deflate=True,
garbage=3,
)
finally:
report_doc.close()
tmp_path.replace(report_path)
except Exception as e:
# Hard failure — leave the original report file untouched.
print(f"_append_original_antrag: Konnte Report nicht erweitern für {assessment.drucksache}: {e}")
def _insert_divider_page(
report_doc, # fitz.Document
assessment: Assessment,
download_error: Optional[str],
) -> None:
"""Append a single A4 separator page that introduces the original Antrag.
Uses PyMuPDF's text drawing API directly so we don't need a second
WeasyPrint round-trip just for one page.
"""
page = report_doc.new_page(width=595, height=842) # A4
margin_left = 60
y = 200
# Title
page.insert_text(
(margin_left, y),
"Original-Antrag",
fontsize=24,
fontname="helv",
color=(0 / 255, 157 / 255, 165 / 255), # var(--color-blue)
)
y += 38
# Drucksache
page.insert_text(
(margin_left, y),
f"Drucksache {assessment.drucksache}",
fontsize=14,
fontname="helv",
color=(0.35, 0.35, 0.35),
)
y += 22
# Title (truncated to ~75 chars to fit one line)
title = assessment.title or ""
if len(title) > 75:
title = title[:72] + ""
page.insert_text(
(margin_left, y),
title,
fontsize=11,
fontname="helv",
color=(0.35, 0.35, 0.35),
)
y += 40
if download_error:
page.insert_text(
(margin_left, y),
"⚠ Original-PDF konnte nicht angehängt werden.",
fontsize=11,
fontname="helv",
color=(0.82, 0.0, 0.0),
)
y += 18
page.insert_text(
(margin_left, y),
f"Grund: {download_error}",
fontsize=10,
fontname="helv",
color=(0.5, 0.5, 0.5),
)
y += 18
if assessment.link:
page.insert_text(
(margin_left, y),
f"Quelle: {assessment.link[:90]}",
fontsize=9,
fontname="helv",
color=(0.5, 0.5, 0.5),
)
else:
page.insert_text(
(margin_left, y),
"Die folgenden Seiten enthalten den unveränderten Originalantrag.",
fontsize=11,
fontname="helv",
color=(0.35, 0.35, 0.35),
)