Append original Antrag-PDF to GWÖ-Report (#9)

Extends generate_pdf_report() with a best-effort second stage that appends the original Antrag PDF to the freshly rendered GWÖ-Report so the analysis and its source document live in the same file. Pipeline 1. WeasyPrint renders the report PDF as before. 2. _append_original_antrag() then: - Skips silently if assessment.link is empty or non-HTTP (manual uploads / pasted text leave nothing to fetch). - Downloads the original PDF via httpx (30s timeout, follow redirects, custom user agent). - Validates the response is actually a PDF (the Content-Type header is not relied on; the magic bytes %PDF- are checked). - Adds a single A4 separator page that says "Original-Antrag", repeats the Drucksachen-ID and title, and either confirms the append or shows the failure reason (HTTP code, network error, parse error) plus the source URL. - Appends the downloaded PDF via PyMuPDF doc.insert_pdf(). - Saves to a sibling .tmp file and atomically replaces the original (PyMuPDF refuses non-incremental save into the same file). Edge cases handled - No link / pasted-text upload → no append, no divider, original report unchanged. - Download error / 404 / non-PDF response → divider page with explicit error message and source URL, report still ships. - PDF parse error → divider page without appended content, error logged. - Hard failure during save → fall back to the original WeasyPrint PDF. Verified live in production container against drucksache 8/6645 (Untrending Frauenhass, BÜNDNIS 90/DIE GRÜNEN LSA): - Report 4 pages + 1 divider + 3 pages original = 8 pages total - Divider correctly placed at index 4 - Page 5 starts with "(Ausgegeben am 24.02.2026) … Drucksache 8/6645 … Antrag — Fraktion BÜNDNIS 90/DIE GRÜNEN — Untrending Frauenhass …" - Negative test with a synthetic 404 link: 5 pages total, divider at index 4 with "Original-PDF konnte nicht angehängt werden. Grund: HTTP 404". Resolves #9. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 23:15:05 +02:00 · 2026-04-07 23:15:05 +02:00 · 80e16df288
commit 80e16df288
parent f1867d463c
1 changed files with 168 additions and 2 deletions
--- a/app/report.py
+++ b/app/report.py
@ -444,12 +444,25 @@ async def generate_pdf_report(
    output_path: Path,
    bundesland: Optional[str] = None,
 ) -> None:
-    """Generate PDF report using WeasyPrint.
+    """Generate PDF report using WeasyPrint, then append the original Antrag.
+
+    Two-step pipeline:
+
+    1. Render the GWÖ-Report HTML and convert to PDF via WeasyPrint
+       (existing behaviour).
+    2. If ``assessment.link`` is a fetchable PDF URL, download it via
+       ``httpx`` and append it after a separator page so the resulting
+       single file contains both the analysis and its source document
+       (issue #9).
+
+    The append step is best-effort: a missing/empty link is silently
+    skipped, network errors and parse errors fall back to a single
+    placeholder page so the report itself is always delivered.

    ``bundesland`` is forwarded to ``generate_html_report`` so the source
    parlament name appears in the report header.
    """
-    # First generate HTML
+    # Step 1 — render the report itself
    html_path = output_path.with_suffix('.tmp.html')
    await generate_html_report(assessment, html_path, bundesland=bundesland)

@ -458,3 +471,156 @@ async def generate_pdf_report(
        HTML(filename=str(html_path)).write_pdf(str(output_path))
    finally:
        html_path.unlink(missing_ok=True)
+
+    # Step 2 — append the original Antrag (best-effort)
+    await _append_original_antrag(assessment, output_path)
+
+
+async def _append_original_antrag(
+    assessment: Assessment,
+    report_path: Path,
+) -> None:
+    """Try to download the original Antrag PDF and append it to ``report_path``.
+
+    Steps:
+
+    1. Return immediately when ``assessment.link`` is empty or not an
+       ``http(s)://`` URL (manual upload / pasted text).
+    2. Download the link with ``httpx`` and accept the body only if the
+       status is 200 and it starts with the ``%PDF-`` magic bytes.
+    3. Parse the downloaded bytes with PyMuPDF *before* the divider page
+       is drawn, so a parse failure is reported on the divider instead of
+       the divider announcing pages that never follow.
+    4. Append the divider page (and the original pages, if any), save to
+       a sibling ``.tmp`` file and replace ``report_path`` with it.
+
+    Download and parse failures end up as the reason shown on the divider
+    page. Any failure while extending or saving the report is printed and
+    leaves the file at ``report_path`` as it was; the temporary file is
+    removed in that case.
+
+    Args:
+        assessment: Source of ``link`` and ``drucksache``; also forwarded
+            to ``_insert_divider_page``.
+        report_path: Existing report PDF that is extended in place.
+    """
+    import fitz  # PyMuPDF
+    import httpx
+
+    link = (assessment.link or "").strip()
+    if not link.startswith(("http://", "https://")):
+        # Manual upload / pasted text — nothing to append.
+        return
+
+    download_error: Optional[str] = None
+    pdf_bytes: Optional[bytes] = None
+    try:
+        async with httpx.AsyncClient(
+            timeout=30,
+            follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            resp = await client.get(link)
+        if resp.status_code != 200:
+            download_error = f"HTTP {resp.status_code}"
+        elif not resp.content.startswith(b"%PDF-"):
+            download_error = f"kein PDF (Content-Type: {resp.headers.get('content-type', 'unknown')})"
+        else:
+            pdf_bytes = resp.content
+    except Exception as e:
+        # Some exceptions (e.g. timeouts) carry an empty message; fall
+        # back to the class name so the reason is never blank.
+        download_error = f"Download-Fehler: {str(e) or type(e).__name__}"
+
+    # PyMuPDF refuses to overwrite the source file in non-incremental
+    # mode — write to a sibling temp file and replace the report with it.
+    tmp_path = report_path.with_suffix(report_path.suffix + ".tmp")
+    src_doc = None
+    try:
+        if pdf_bytes is not None:
+            # Parse first: the divider text depends on whether the
+            # original can actually be appended.
+            try:
+                src_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+                if src_doc.page_count == 0:
+                    raise ValueError("PDF enthält keine Seiten")
+            except Exception as e:
+                print(f"_append_original_antrag: PDF-Parse-Fehler für {assessment.drucksache}: {e}")
+                download_error = f"PDF-Parse-Fehler: {str(e) or type(e).__name__}"
+                if src_doc is not None:
+                    src_doc.close()
+                    src_doc = None
+
+        report_doc = fitz.open(report_path)
+        try:
+            # Always insert a divider page so the user sees what comes next
+            _insert_divider_page(report_doc, assessment, download_error)
+
+            if src_doc is not None:
+                report_doc.insert_pdf(src_doc)
+
+            report_doc.save(
+                str(tmp_path),
+                deflate=True,
+                garbage=3,
+            )
+        finally:
+            report_doc.close()
+        tmp_path.replace(report_path)
+    except Exception as e:
+        # Hard failure — leave the original report file untouched.
+        print(f"_append_original_antrag: Konnte Report nicht erweitern für {assessment.drucksache}: {e}")
+        # Do not leave a half-written temp file next to the report.
+        try:
+            tmp_path.unlink(missing_ok=True)
+        except OSError:
+            pass
+    finally:
+        if src_doc is not None:
+            src_doc.close()
+
+
+def _insert_divider_page(
+    report_doc,  # fitz.Document
+    assessment: Assessment,
+    download_error: Optional[str],
+) -> None:
+    """Append a single A4 separator page that introduces the original Antrag.
+
+    Uses PyMuPDF's text drawing API directly so we don't need a second
+    WeasyPrint round-trip just for one page.
+
+    The page shows the heading "Original-Antrag", the Drucksache number
+    and the title. If ``download_error`` is set, a red notice, the reason
+    and the source link follow; otherwise a sentence announces the
+    appended original.
+
+    All text is drawn with the built-in "helv" font, which PyMuPDF
+    encodes as one byte per character. Every string is therefore reduced
+    to a single line of Latin-1 characters and cut to a length that fits
+    the page width before it is drawn.
+
+    Args:
+        report_doc: Open PyMuPDF document that receives the new page.
+        assessment: Provides ``drucksache``, ``title`` and ``link``.
+        download_error: Human-readable failure reason, or ``None`` when
+            the original PDF will be appended after this page.
+    """
+    # Typographic characters that are common in titles but have no code
+    # point below 256; map them to plain equivalents instead of losing them.
+    plain_equivalents = {
+        "\u2026": "...",  # horizontal ellipsis
+        "\u2013": "-",  # en dash
+        "\u2014": "-",  # em dash
+        "\u201e": '"',
+        "\u201c": '"',
+        "\u201d": '"',
+        "\u201a": "'",
+        "\u2018": "'",
+        "\u2019": "'",
+        "\u20ac": "EUR",
+    }
+
+    def printable(text, limit: int) -> str:
+        """Return ``text`` as one Latin-1-only line of at most ``limit`` chars."""
+        line = " ".join(str(text).split())  # also removes newlines
+        line = "".join(plain_equivalents.get(ch, ch) for ch in line)
+        line = "".join(ch if ord(ch) < 256 else "?" for ch in line)
+        if len(line) > limit:
+            line = line[: limit - 3] + "..."
+        return line
+
+    page = report_doc.new_page(width=595, height=842)  # A4
+    margin_left = 60
+    dark_grey = (0.35, 0.35, 0.35)
+    light_grey = (0.5, 0.5, 0.5)
+
+    def draw(y: int, text: str, fontsize: int, color) -> None:
+        """Draw one line at the left margin; empty strings are skipped."""
+        if text:
+            page.insert_text(
+                (margin_left, y),
+                text,
+                fontsize=fontsize,
+                fontname="helv",
+                color=color,
+            )
+
+    y = 200
+
+    # Heading
+    draw(y, "Original-Antrag", 24, (0 / 255, 157 / 255, 165 / 255))  # var(--color-blue)
+    y += 38
+
+    # Drucksache
+    draw(y, printable(f"Drucksache {assessment.drucksache}", 65), 14, dark_grey)
+    y += 22
+
+    # Title (cut to 75 chars to fit one line)
+    draw(y, printable(assessment.title or "", 75), 11, dark_grey)
+    y += 40
+
+    if download_error:
+        # The red colour marks the warning; the former "⚠" prefix cannot
+        # be encoded in the one-byte "helv" font.
+        draw(y, "Original-PDF konnte nicht angehängt werden.", 11, (0.82, 0.0, 0.0))
+        y += 18
+        draw(y, printable(f"Grund: {download_error}", 95), 10, light_grey)
+        y += 18
+        if assessment.link:
+            draw(y, printable(f"Quelle: {assessment.link}", 98), 9, light_grey)
+    else:
+        draw(
+            y,
+            "Die folgenden Seiten enthalten den unveränderten Originalantrag.",
+            11,
+            dark_grey,
+        )