From 80e16df2880988124a23d274bd72e31100ca1845 Mon Sep 17 00:00:00 2001 From: Dotty Dotter Date: Tue, 7 Apr 2026 23:15:05 +0200 Subject: [PATCH] =?UTF-8?q?Append=20original=20Antrag-PDF=20to=20GW=C3=96-?= =?UTF-8?q?Report=20(#9)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends generate_pdf_report() with a best-effort second stage that appends the original Antrag PDF to the freshly rendered GWÖ-Report so the analysis and its source document live in the same file. Pipeline 1. WeasyPrint renders the report PDF as before. 2. _append_original_antrag() then: - Skips silently if assessment.link is empty or non-HTTP (manual uploads / pasted text leave nothing to fetch). - Downloads the original PDF via httpx (30s timeout, follow redirects, custom user agent). - Validates the response is actually a PDF (Content-Type not relied on; the magic bytes %PDF- are checked). - Adds a single A4 separator page that says "Original-Antrag", repeats the Drucksachen-ID and title, and either confirms the append or shows the failure reason (HTTP code, network error, parse error) plus the source URL. - Appends the downloaded PDF via PyMuPDF doc.insert_pdf(). - Saves to a sibling .tmp file and atomically replaces the original (PyMuPDF refuses non-incremental save into the same file). Edge cases handled - No link / pasted-text upload → no append, no divider, original report unchanged. - Download error / 404 / non-PDF response → divider page with explicit error message and source URL, report still ships. - PDF parse error → divider page without appended content, error logged. - Hard failure during save → fall back to the original WeasyPrint PDF. 
Verified live in production container against drucksache 8/6645 (Untrending Frauenhass, BÜNDNIS 90/DIE GRÜNEN LSA): - Report 4 pages + 1 divider + 3 pages original = 8 pages total - Divider correctly placed at index 4 - Page 5 starts with "(Ausgegeben am 24.02.2026) … Drucksache 8/6645 … Antrag — Fraktion BÜNDNIS 90/DIE GRÜNEN — Untrending Frauenhass …" - Negative test with a synthetic 404 link: 5 pages total, divider at index 4 with "Original-PDF konnte nicht angehängt werden. Grund: HTTP 404". Resolves #9. Co-Authored-By: Claude Sonnet 4.6 --- app/report.py | 170 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 168 insertions(+), 2 deletions(-) diff --git a/app/report.py b/app/report.py index b125cdb..87c3ee4 100644 --- a/app/report.py +++ b/app/report.py @@ -444,12 +444,25 @@ async def generate_pdf_report( output_path: Path, bundesland: Optional[str] = None, ) -> None: - """Generate PDF report using WeasyPrint. + """Generate PDF report using WeasyPrint, then append the original Antrag. + + Two-step pipeline: + + 1. Render the GWÖ-Report HTML and convert to PDF via WeasyPrint + (existing behaviour). + 2. If ``assessment.link`` is a fetchable PDF URL, download it via + ``httpx`` and append it after a separator page so the resulting + single file contains both the analysis and its source document + (issue #9). + + The append step is best-effort: a missing/empty link is silently + skipped, network errors and parse errors fall back to a single + placeholder page so the report itself is always delivered. ``bundesland`` is forwarded to ``generate_html_report`` so the source parlament name appears in the report header. 
""" - # First generate HTML + # Step 1 — render the report itself html_path = output_path.with_suffix('.tmp.html') await generate_html_report(assessment, html_path, bundesland=bundesland) @@ -458,3 +471,156 @@ async def generate_pdf_report( HTML(filename=str(html_path)).write_pdf(str(output_path)) finally: html_path.unlink(missing_ok=True) + + # Step 2 — append the original Antrag (best-effort) + await _append_original_antrag(assessment, output_path) + + +async def _append_original_antrag( + assessment: Assessment, + report_path: Path, +) -> None: + """Try to download the original Antrag PDF and append it to ``report_path``. + + Failure modes (download error, non-PDF content, parse error) are + handled gracefully: a single placeholder page is appended noting the + issue, so the user always sees that an attempt was made. + """ + import fitz # PyMuPDF + import httpx + + link = (assessment.link or "").strip() + if not link or not link.startswith(("http://", "https://")): + # Manual upload / pasted text — nothing to append. 
+ return + + download_error: Optional[str] = None + pdf_bytes: Optional[bytes] = None + try: + async with httpx.AsyncClient( + timeout=30, + follow_redirects=True, + headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}, + ) as client: + resp = await client.get(link) + if resp.status_code != 200: + download_error = f"HTTP {resp.status_code}" + elif not resp.content[:5].startswith(b"%PDF-"): + download_error = f"kein PDF (Content-Type: {resp.headers.get('content-type', 'unknown')})" + else: + pdf_bytes = resp.content + except Exception as e: + download_error = f"Download-Fehler: {e}" + + try: + report_doc = fitz.open(report_path) + try: + # Always insert a divider page so the user sees what comes next + _insert_divider_page(report_doc, assessment, download_error) + + if pdf_bytes is not None: + try: + src_doc = fitz.open(stream=pdf_bytes, filetype="pdf") + try: + report_doc.insert_pdf(src_doc) + finally: + src_doc.close() + except Exception as e: + print(f"_append_original_antrag: PDF-Parse-Fehler für {assessment.drucksache}: {e}") + + # PyMuPDF refuses to overwrite the source file in non-incremental + # mode — write to a sibling temp file and atomically replace. + tmp_path = report_path.with_suffix(report_path.suffix + ".tmp") + report_doc.save( + str(tmp_path), + deflate=True, + garbage=3, + ) + finally: + report_doc.close() + tmp_path.replace(report_path) + except Exception as e: + # Hard failure — leave the original report file untouched. + print(f"_append_original_antrag: Konnte Report nicht erweitern für {assessment.drucksache}: {e}") + + +def _insert_divider_page( + report_doc, # fitz.Document + assessment: Assessment, + download_error: Optional[str], +) -> None: + """Append a single A4 separator page that introduces the original Antrag. + + Uses PyMuPDF's text drawing API directly so we don't need a second + WeasyPrint round-trip just for one page. 
+ """ + page = report_doc.new_page(width=595, height=842) # A4 + margin_left = 60 + y = 200 + + # Title + page.insert_text( + (margin_left, y), + "Original-Antrag", + fontsize=24, + fontname="helv", + color=(0 / 255, 157 / 255, 165 / 255), # var(--color-blue) + ) + y += 38 + + # Drucksache + page.insert_text( + (margin_left, y), + f"Drucksache {assessment.drucksache}", + fontsize=14, + fontname="helv", + color=(0.35, 0.35, 0.35), + ) + y += 22 + + # Title (truncated to ~75 chars to fit one line) + title = assessment.title or "" + if len(title) > 75: + title = title[:72] + "…" + page.insert_text( + (margin_left, y), + title, + fontsize=11, + fontname="helv", + color=(0.35, 0.35, 0.35), + ) + y += 40 + + if download_error: + page.insert_text( + (margin_left, y), + "⚠ Original-PDF konnte nicht angehängt werden.", + fontsize=11, + fontname="helv", + color=(0.82, 0.0, 0.0), + ) + y += 18 + page.insert_text( + (margin_left, y), + f"Grund: {download_error}", + fontsize=10, + fontname="helv", + color=(0.5, 0.5, 0.5), + ) + y += 18 + if assessment.link: + page.insert_text( + (margin_left, y), + f"Quelle: {assessment.link[:90]}", + fontsize=9, + fontname="helv", + color=(0.5, 0.5, 0.5), + ) + else: + page.insert_text( + (margin_left, y), + "Die folgenden Seiten enthalten den unveränderten Originalantrag.", + fontsize=11, + fontname="helv", + color=(0.35, 0.35, 0.35), + )