diff --git a/backend/src/tracker/api/routes/bewertung.py b/backend/src/tracker/api/routes/bewertung.py
index e6ce0e8..63532bc 100644
--- a/backend/src/tracker/api/routes/bewertung.py
+++ b/backend/src/tracker/api/routes/bewertung.py
@@ -12,6 +12,9 @@
 from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel
 from tracker.db.session import get_connection
+
+import logging
+logger = logging.getLogger(__name__)
 # Note: Re-evaluation should re-scrape ALLRIS data before KI evaluation
 # to exclude transfer errors. See Gitea issue for full spec.
 # IMPORTANT: Destructive changes (deleting old bewertungen etc.) only after
@@ -182,6 +185,16 @@
 def _run_zusammenfassung(vorlage_id: int, anmerkung: str, job_id: str):
     try:
         conn = get_connection()
+        # --- Rescrape ALLRIS data before KI evaluation ---
+        _jobs[job_id]["phase"] = "rescrape"
+        try:
+            from tracker.core.rescrape import rescrape_vorlage
+            rescrape_vorlage(conn, vorlage_id)
+        except Exception as e:
+            logger.warning("Rescrape failed for vorlage %s: %s", vorlage_id, e)
+        _jobs[job_id]["phase"] = "ki_bewertung"
+
+        # Reload fresh data after rescrape
         row = conn.execute("SELECT volltext_clean, aktenzeichen FROM vorlagen WHERE id = ?", (vorlage_id,)).fetchone()
         if not row or not row["volltext_clean"]:
             _jobs[job_id] = {"status": "error", "error": "Kein Volltext vorhanden"}
@@ -256,6 +269,15 @@
 def _run_ketten_bewertung(kette_id: int, anmerkung: str, job_id: str):
     try:
         conn = get_connection()
+        # --- Rescrape all Glieder before KI evaluation ---
+        _jobs[job_id]["phase"] = "rescrape"
+        try:
+            from tracker.core.rescrape import rescrape_kette
+            rescrape_kette(conn, kette_id)
+        except Exception as e:
+            logger.warning("Rescrape failed for kette %s: %s", kette_id, e)
+        _jobs[job_id]["phase"] = "ki_bewertung"
+
         # Get kette + ursprung
         kette = conn.execute(
             "SELECT k.*, v.aktenzeichen, v.volltext_clean FROM ketten k JOIN vorlagen v ON k.ursprung_id = v.id WHERE k.id = ?",
@@ -426,4 +448,5 @@ def get_job_status(job_id: str):
     """Check status of a re-evaluation job."""
     if job_id not in _jobs:
         raise HTTPException(status_code=404, detail="Job nicht gefunden")
-    return _jobs[job_id]
+    job = _jobs[job_id]
+    return {**job, "phase": job.get("phase", "")}
diff --git a/backend/src/tracker/core/rescrape.py b/backend/src/tracker/core/rescrape.py
new file mode 100644
index 0000000..c37e459
--- /dev/null
+++ b/backend/src/tracker/core/rescrape.py
@@ -0,0 +1,274 @@
+"""
+ALLRIS rescrape module.
+
+Scrapes the Beratungsfolge, Beschlusstexte, and PDF full text for a single
+Vorlage or for all Glieder of a Kette. Importable standalone from the backend.
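+
+A minimal usage sketch (the first argument is accepted but ignored, see
+rescrape_vorlage below; the Vorlage id 123 is a placeholder):
+
+    from tracker.core.rescrape import rescrape_vorlage
+
+    result = rescrape_vorlage(None, 123)
+    print(result["updated_beratungen"], result["errors"])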
+""" +from __future__ import annotations + +import logging +import re +import subprocess +import tempfile +import time + +import httpx +from bs4 import BeautifulSoup + +from tracker.db.session import get_connection + +logger = logging.getLogger(__name__) + +ALLRIS_BASE = "https://allris.hagen.de" +DELAY_SECONDS = 1.0 +HTTP_TIMEOUT = 30 + + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + +def _get(url: str) -> httpx.Response: + """GET with timeout and redirect following.""" + return httpx.get(url, timeout=HTTP_TIMEOUT, follow_redirects=True) + + +# --------------------------------------------------------------------------- +# Scraping functions (adapted from scripts/scrape_beratungsfolge.py) +# --------------------------------------------------------------------------- + +def _scrape_vorlage_page(url: str) -> list[dict]: + """Scrape Beratungsfolge von einer ALLRIS Vorlagen-Seite.""" + resp = _get(url) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + + beratungen: list[dict] = [] + + for link in soup.find_all("a", href=True): + href = link["href"] + if "to020" not in href or "TOLFDNR=" not in href: + continue + + tolfdnr_match = re.search(r"TOLFDNR=(\d+)", href) + if not tolfdnr_match: + continue + + tolfdnr = tolfdnr_match.group(1) + beschlussart = link.get_text(strip=True) + + # Sitzungsinfo aus vorherigem Link + sitzung_name = None + prev = link.find_previous("a", href=re.compile(r"to010.*SILFDNR=")) + if prev: + sitzung_name = prev.get_text(strip=True) + + to_url = href if href.startswith("http") else ALLRIS_BASE + href + + beratungen.append({ + "tolfdnr": tolfdnr, + "beschlussart": beschlussart, + "sitzung_name": sitzung_name, + "to_url": to_url, + }) + + return beratungen + + +def _scrape_to_page(url: str) -> dict: + """Scrape Beschlusstext und Wortprotokoll von einer TO-Seite.""" + resp = _get(url) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + + result: dict = { + "beschlusstext": None, + "wortprotokoll": None, + "sitzung_datum": None, + } + + # Datum aus Titel + title = soup.find("h1", class_="title") + if title: + date_match = re.search(r"(\d{2}\.\d{2}\.\d{4})", title.get_text()) + if date_match: + result["sitzung_datum"] = date_match.group(1) + + # Texte in + text_spans = soup.find_all("span", style=re.compile(r"font-family.*Arial")) + texts = [s.get_text(strip=True) for s in text_spans if s.get_text(strip=True)] + + if texts: + result["beschlusstext"] = texts[-1] + if len(texts) > 1: + result["wortprotokoll"] = "\n\n".join(texts[:-1]) + + return result + + +def _extract_pdf_text(url: str) -> str | None: + """Download PDF and extract text via PyMuPDF.""" + try: + import pymupdf + except ImportError: + logger.warning("pymupdf not installed, skipping PDF extraction") + return None + + resp = httpx.get(url, timeout=60, follow_redirects=True) + resp.raise_for_status() + + if len(resp.content) < 100: + return None + + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmp: + tmp.write(resp.content) + tmp.flush() + doc = pymupdf.open(tmp.name) + parts = [page.get_text() for page in doc] + doc.close() + + text = "\n".join(parts).strip() + return text if len(text) >= 50 else None + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def rescrape_vorlage(conn_or_none, 
+
+def rescrape_vorlage(conn_or_none, vorlage_id: int) -> dict:
+    """
+    Rescrape ALLRIS data for a single Vorlage.
+
+    Opens its OWN DB connection (thread-safe). ``conn_or_none`` is accepted
+    for signature compatibility but ignored; a fresh connection is always
+    created, so calling this from worker threads is safe.
+
+    Returns: {"updated_beratungen": N, "updated_volltext": bool, "errors": [...]}
+    """
+    own_conn = get_connection()
+    try:
+        return _rescrape_vorlage_impl(own_conn, vorlage_id)
+    finally:
+        own_conn.close()
+
+
+def _rescrape_vorlage_impl(conn, vorlage_id: int) -> dict:
+    result = {"updated_beratungen": 0, "updated_volltext": False, "errors": []}
+
+    row = conn.execute(
+        "SELECT web_url, aktenzeichen, pdf_url, volltext_clean FROM vorlagen WHERE id = ?",
+        (vorlage_id,),
+    ).fetchone()
+
+    if not row:
+        result["errors"].append(f"Vorlage {vorlage_id} not found")
+        return result
+
+    web_url = row["web_url"]
+    pdf_url = row["pdf_url"]
+    volltext_clean = row["volltext_clean"]
+
+    # --- 1. Scrape the Beratungsfolge ---
+    if web_url:
+        try:
+            beratungen = _scrape_vorlage_page(web_url)
+            logger.info("Vorlage %s: found %d Beratungen", vorlage_id, len(beratungen))
+
+            for b in beratungen:
+                time.sleep(DELAY_SECONDS)
+                try:
+                    to_details = _scrape_to_page(b["to_url"])
+                except Exception as e:
+                    result["errors"].append(f"TO {b['tolfdnr']}: {e}")
+                    to_details = {}
+
+                # Upsert: try UPDATE first, then INSERT. COALESCE keeps any
+                # already stored text when this rescrape returned nothing, so
+                # a failed TO scrape cannot wipe existing Beschlusstexte.
+                cur = conn.execute(
+                    """UPDATE beratungen
+                       SET to_url = ?, beschlussart = ?,
+                           beschlusstext = COALESCE(?, beschlusstext),
+                           wortprotokoll = COALESCE(?, wortprotokoll),
+                           scraped_at = CURRENT_TIMESTAMP
+                       WHERE vorlage_id = ? AND tolfdnr = ?""",
+                    (
+                        b["to_url"],
+                        b["beschlussart"],
+                        to_details.get("beschlusstext"),
+                        to_details.get("wortprotokoll"),
+                        vorlage_id,
+                        b["tolfdnr"],
+                    ),
+                )
+                if cur.rowcount == 0:
+                    conn.execute(
+                        """INSERT INTO beratungen
+                           (vorlage_id, to_url, tolfdnr, beschlussart,
+                            beschlusstext, wortprotokoll, scraped_at)
+                           VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)""",
+                        (
+                            vorlage_id,
+                            b["to_url"],
+                            b["tolfdnr"],
+                            b["beschlussart"],
+                            to_details.get("beschlusstext"),
+                            to_details.get("wortprotokoll"),
+                        ),
+                    )
+                result["updated_beratungen"] += 1
+
+            conn.commit()
+        except Exception as e:
+            result["errors"].append(f"Beratungsfolge: {e}")
+            logger.exception("Error scraping Beratungsfolge for Vorlage %s", vorlage_id)
+
+    # --- 2. PDF full text (only if none is stored yet) ---
+    if pdf_url and not volltext_clean:
+        try:
+            time.sleep(DELAY_SECONDS)
+            text = _extract_pdf_text(pdf_url)
+            if text:
+                conn.execute(
+                    "UPDATE vorlagen SET volltext_clean = ? WHERE id = ?",
+                    (text, vorlage_id),
+                )
+                conn.commit()
+                result["updated_volltext"] = True
+                logger.info("Vorlage %s: extracted Volltext (%d characters)", vorlage_id, len(text))
+        except Exception as e:
+            result["errors"].append(f"PDF: {e}")
+            logger.exception("Error during PDF extraction for Vorlage %s", vorlage_id)
+
+    return result
+
+
+def rescrape_kette(conn_or_none, kette_id: int) -> dict:
+    """
+    Rescrape all Glieder of a Kette.
+
+    Opens its OWN DB connection (thread-safe).
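+
+    A minimal sketch (the first argument is ignored; the Kette id is a
+    placeholder):
+
+        summary = rescrape_kette(None, 42)
+        print(summary["total_beratungen"], summary["errors"])
+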
+ Returns: {"vorlage_results": [...], "total_beratungen": N, "total_volltext": N, "errors": [...]} + """ + own_conn = get_connection() + try: + glieder = own_conn.execute( + "SELECT vorlage_id FROM ketten_glieder WHERE kette_id = ?", + (kette_id,), + ).fetchall() + + summary = { + "vorlage_results": [], + "total_beratungen": 0, + "total_volltext": 0, + "errors": [], + } + + for g in glieder: + vid = g["vorlage_id"] + r = _rescrape_vorlage_impl(own_conn, vid) + summary["vorlage_results"].append({"vorlage_id": vid, **r}) + summary["total_beratungen"] += r["updated_beratungen"] + summary["total_volltext"] += int(r["updated_volltext"]) + summary["errors"].extend(r["errors"]) + + return summary + finally: + own_conn.close() diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index d8118e6..990f938 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -234,7 +234,7 @@ export const reevalKette = (id: number, anmerkung: string) => post<{ job_id: string; status: string }>(`/bewertung/ketten/${id}`, { anmerkung }); export const fetchJobStatus = (jobId: string) => - get<{ status: string; result?: object; error?: string }>(`/bewertung/status/${jobId}`); + get<{ status: string; result?: object; error?: string; phase?: string }>(`/bewertung/status/${jobId}`); export interface SuchVorschlag { id: number; diff --git a/frontend/src/routes/explorer/+page.svelte b/frontend/src/routes/explorer/+page.svelte index 1dd44bb..d6c4f13 100644 --- a/frontend/src/routes/explorer/+page.svelte +++ b/frontend/src/routes/explorer/+page.svelte @@ -89,6 +89,7 @@ let showReeval = $state(false); let reevalAnmerkung = $state(''); let reevalStatus = $state<'idle' | 'running' | 'done' | 'error'>('idle'); + let reevalPhase = $state(''); let reevalError = $state(''); async function triggerReeval() { @@ -100,6 +101,7 @@ for (let i = 0; i < 60; i++) { await new Promise(r => setTimeout(r, 3000)); const status = await fetchJobStatus(job_id); + reevalPhase = status.phase || ''; if (status.status === 'done') { reevalStatus = 'done'; selectedVorlage = await fetchVorlage(selectedVorlage!.id); @@ -706,7 +708,13 @@ {#if reevalStatus === 'running'} - KI bewertet… + {#if reevalPhase === 'rescrape'} + 📡 Daten aktualisieren… + {:else if reevalPhase === 'ki_bewertung'} + 🤖 KI bewertet… + {:else} + KI bewertet… + {/if} {:else} Bewertung starten