feat: ALLRIS-Rescrape vor KI-Neubewertung (#10)
- Neues Modul tracker/core/rescrape.py: Scrapt ALLRIS-Seiten live
- rescrape_vorlage(): Beratungsfolge + Beschlusstexte + PDF-Volltext
- rescrape_kette(): Alle Glieder + neue Suffix-Suche
- Eingebaut in Neubewertung: Phase 1 Rescrape → Phase 2 KI
- Status-Engine: Abstimmungen als Fallback für Beschluss-Erkennung
- Frontend: Phase-Anzeige (Daten aktualisieren / KI bewertet)
- Fehlertoleranz: Bei ALLRIS-Ausfall trotzdem KI mit alten Daten
- Rate-Limiting 1s zwischen Requests

Closes #10
This commit is contained in:
parent
0e7aa065e5
commit
abcb0ff8a2
@ -12,6 +12,9 @@ from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from tracker.db.session import get_connection
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
# Note: Re-evaluation should re-scrape ALLRIS data before KI evaluation
|
||||
# to exclude transfer errors. See Gitea issue for full spec.
|
||||
# IMPORTANT: Destructive changes (deleting old bewertungen etc.) only after
|
||||
@ -182,6 +185,16 @@ def _run_zusammenfassung(vorlage_id: int, anmerkung: str, job_id: str):
|
||||
try:
|
||||
conn = get_connection()
|
||||
|
||||
# --- Rescrape ALLRIS data before KI evaluation ---
|
||||
_jobs[job_id]["phase"] = "rescrape"
|
||||
try:
|
||||
from tracker.core.rescrape import rescrape_vorlage
|
||||
rescrape_vorlage(conn, vorlage_id)
|
||||
except Exception as e:
|
||||
logger.warning("Rescrape failed for vorlage %s: %s", vorlage_id, e)
|
||||
_jobs[job_id]["phase"] = "ki_bewertung"
|
||||
|
||||
# Reload fresh data after rescrape
|
||||
row = conn.execute("SELECT volltext_clean, aktenzeichen FROM vorlagen WHERE id = ?", (vorlage_id,)).fetchone()
|
||||
if not row or not row["volltext_clean"]:
|
||||
_jobs[job_id] = {"status": "error", "error": "Kein Volltext vorhanden"}
|
||||
@ -256,6 +269,15 @@ def _run_ketten_bewertung(kette_id: int, anmerkung: str, job_id: str):
|
||||
try:
|
||||
conn = get_connection()
|
||||
|
||||
# --- Rescrape all Glieder before KI evaluation ---
|
||||
_jobs[job_id]["phase"] = "rescrape"
|
||||
try:
|
||||
from tracker.core.rescrape import rescrape_kette
|
||||
rescrape_kette(conn, kette_id)
|
||||
except Exception as e:
|
||||
logger.warning("Rescrape failed for kette %s: %s", kette_id, e)
|
||||
_jobs[job_id]["phase"] = "ki_bewertung"
|
||||
|
||||
# Get kette + ursprung
|
||||
kette = conn.execute(
|
||||
"SELECT k.*, v.aktenzeichen, v.volltext_clean FROM ketten k JOIN vorlagen v ON k.ursprung_id = v.id WHERE k.id = ?",
|
||||
@ -426,4 +448,5 @@ def get_job_status(job_id: str):
|
||||
"""Check status of a re-evaluation job."""
|
||||
if job_id not in _jobs:
|
||||
raise HTTPException(status_code=404, detail="Job nicht gefunden")
|
||||
return _jobs[job_id]
|
||||
job = _jobs[job_id]
|
||||
return {**job, "phase": job.get("phase", "")}
|
||||
|
||||
274
backend/src/tracker/core/rescrape.py
Normal file
274
backend/src/tracker/core/rescrape.py
Normal file
@ -0,0 +1,274 @@
|
||||
"""
|
||||
ALLRIS Rescrape-Modul.
|
||||
|
||||
Scrapet Beratungsfolge, Beschlusstexte und PDF-Volltext für eine Vorlage
|
||||
oder alle Glieder einer Kette. Eigenständig importierbar aus dem Backend.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from tracker.db.session import get_connection
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALLRIS_BASE = "https://allris.hagen.de"
|
||||
DELAY_SECONDS = 1.0
|
||||
HTTP_TIMEOUT = 30
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get(url: str) -> httpx.Response:
    """Issue an HTTP GET for ``url``.

    The request uses the module-wide ``HTTP_TIMEOUT`` and follows redirects.
    The response is returned as-is; status checking is left to the caller.
    """
    response = httpx.get(url, follow_redirects=True, timeout=HTTP_TIMEOUT)
    return response
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scraping functions (adapted from scripts/scrape_beratungsfolge.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _scrape_vorlage_page(url: str) -> list[dict]:
    """Scrape the Beratungsfolge (consultation sequence) of an ALLRIS Vorlage page.

    Every link whose href contains ``to020`` and a ``TOLFDNR=<digits>`` query
    parameter yields one entry, in document order.

    Args:
        url: Address of the Vorlage page.

    Returns:
        A list of dicts with the keys ``tolfdnr`` (digits from the href),
        ``beschlussart`` (the link text), ``sitzung_name`` (text of the closest
        preceding ``to010``/``SILFDNR`` link, or ``None``) and ``to_url``
        (absolute URL of the agenda-item page).

    Raises:
        Whatever ``_get`` / ``raise_for_status`` raise on network errors or
        non-success HTTP status codes.
    """
    from urllib.parse import urljoin

    resp = _get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Relative hrefs must be resolved against the page we actually received
    # (after redirects), not against the URL we asked for.
    page_url = str(resp.url)

    # Compiled once instead of once per link.
    sitzung_href = re.compile(r"to010.*SILFDNR=")

    beratungen: list[dict] = []

    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "to020" not in href or "TOLFDNR=" not in href:
            continue

        tolfdnr_match = re.search(r"TOLFDNR=(\d+)", href)
        if not tolfdnr_match:
            continue

        # The session (Sitzung) is taken from the nearest preceding session link.
        prev = link.find_previous("a", href=sitzung_href)
        sitzung_name = prev.get_text(strip=True) if prev else None

        if href.startswith("http"):
            to_url = href
        elif href.startswith("/"):
            # Root-relative: unchanged behaviour, anchored at the ALLRIS host.
            to_url = ALLRIS_BASE + href
        else:
            # Document-relative (e.g. "to020.asp?TOLFDNR=1"): plain string
            # concatenation would glue the path onto the host name.
            to_url = urljoin(page_url, href)

        beratungen.append({
            "tolfdnr": tolfdnr_match.group(1),
            "beschlussart": link.get_text(strip=True),
            "sitzung_name": sitzung_name,
            "to_url": to_url,
        })

    return beratungen
|
||||
|
||||
|
||||
def _scrape_to_page(url: str) -> dict:
    """Scrape decision text and verbatim minutes from an agenda-item (TO) page.

    Returns a dict with three keys:

    * ``beschlusstext`` - text of the last non-empty Arial-styled ``<span>``,
      or ``None`` when there is none.
    * ``wortprotokoll`` - all earlier non-empty Arial spans joined by blank
      lines, or ``None`` when there is at most one span.
    * ``sitzung_datum`` - first ``DD.MM.YYYY`` date found in the
      ``<h1 class="title">`` heading, or ``None``.
    """
    response = _get(url)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Session date lives in the page heading.
    sitzung_datum = None
    heading = page.find("h1", class_="title")
    if heading:
        found = re.search(r"(\d{2}\.\d{2}\.\d{4})", heading.get_text())
        if found:
            sitzung_datum = found.group(1)

    # Body texts are rendered inside <span style="font-family:...Arial...">.
    arial_spans = page.find_all("span", style=re.compile(r"font-family.*Arial"))
    stripped = (span.get_text(strip=True) for span in arial_spans)
    texts = [chunk for chunk in stripped if chunk]

    beschlusstext = texts[-1] if texts else None
    wortprotokoll = "\n\n".join(texts[:-1]) if len(texts) > 1 else None

    return {
        "beschlusstext": beschlusstext,
        "wortprotokoll": wortprotokoll,
        "sitzung_datum": sitzung_datum,
    }
|
||||
|
||||
|
||||
def _extract_pdf_text(url: str) -> str | None:
    """Download a PDF and return its plain text, extracted with PyMuPDF.

    Args:
        url: Address of the PDF document.

    Returns:
        The text of all pages joined by newlines and stripped, or ``None``
        when PyMuPDF is not installed, the download is shorter than 100 bytes,
        or fewer than 50 characters of text could be extracted.

    Raises:
        Whatever ``httpx.get`` / ``raise_for_status`` raise on network errors
        or non-success HTTP status codes, and any error PyMuPDF raises while
        parsing the document.
    """
    try:
        import pymupdf
    except ImportError:
        logger.warning("pymupdf not installed, skipping PDF extraction")
        return None

    # PDFs get a longer timeout (60 s) than the HTML pages (HTTP_TIMEOUT).
    resp = httpx.get(url, timeout=60, follow_redirects=True)
    resp.raise_for_status()

    payload = resp.content
    if len(payload) < 100:
        # Too small to be a real PDF.
        return None

    # Open straight from memory: no temp file that must be re-opened by name
    # (not portable), and the context manager closes the document even if
    # text extraction raises.
    with pymupdf.open(stream=payload, filetype="pdf") as doc:
        parts = [page.get_text() for page in doc]

    text = "\n".join(parts).strip()
    return text if len(text) >= 50 else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def rescrape_vorlage(conn_or_none, vorlage_id: int) -> dict:
    """Refresh the ALLRIS data of one Vorlage.

    A dedicated DB connection is obtained via ``get_connection()`` and closed
    again before returning, so the caller's connection is never used here.
    ``conn_or_none`` exists only for signature compatibility and is ignored.

    Returns:
        ``{"updated_beratungen": N, "updated_volltext": bool, "errors": [...]}``
    """
    connection = get_connection()
    try:
        outcome = _rescrape_vorlage_impl(connection, vorlage_id)
    finally:
        # Always release the private connection, even on failure.
        connection.close()
    return outcome
|
||||
|
||||
|
||||
def _upsert_beratung(conn, vorlage_id: int, beratung: dict, to_details: dict | None) -> None:
    """Write one scraped Beratung into the ``beratungen`` table.

    Tries an UPDATE keyed on ``(vorlage_id, tolfdnr)`` first and falls back to
    an INSERT when no row matched. The caller is responsible for committing.

    Args:
        conn: Open DB connection.
        vorlage_id: ID of the Vorlage the Beratung belongs to.
        beratung: Dict with at least ``to_url``, ``tolfdnr`` and
            ``beschlussart``.
        to_details: Dict with ``beschlusstext`` / ``wortprotokoll`` from the
            agenda-item page, or ``None`` when that page could not be scraped.
            With ``None`` an existing row keeps its stored texts instead of
            having them overwritten with NULL.
    """
    details = to_details or {}
    beschlusstext = details.get("beschlusstext")
    wortprotokoll = details.get("wortprotokoll")

    if to_details is None:
        # Detail page unavailable: refresh only what the Vorlage page gave us.
        cur = conn.execute(
            """UPDATE beratungen
               SET to_url = ?, beschlussart = ?,
                   scraped_at = CURRENT_TIMESTAMP
               WHERE vorlage_id = ? AND tolfdnr = ?""",
            (
                beratung["to_url"],
                beratung["beschlussart"],
                vorlage_id,
                beratung["tolfdnr"],
            ),
        )
    else:
        cur = conn.execute(
            """UPDATE beratungen
               SET to_url = ?, beschlussart = ?,
                   beschlusstext = ?, wortprotokoll = ?,
                   scraped_at = CURRENT_TIMESTAMP
               WHERE vorlage_id = ? AND tolfdnr = ?""",
            (
                beratung["to_url"],
                beratung["beschlussart"],
                beschlusstext,
                wortprotokoll,
                vorlage_id,
                beratung["tolfdnr"],
            ),
        )

    if cur.rowcount == 0:
        conn.execute(
            """INSERT INTO beratungen
               (vorlage_id, to_url, tolfdnr, beschlussart,
                beschlusstext, wortprotokoll, scraped_at)
               VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)""",
            (
                vorlage_id,
                beratung["to_url"],
                beratung["tolfdnr"],
                beratung["beschlussart"],
                beschlusstext,
                wortprotokoll,
            ),
        )


def _rescrape_vorlage_impl(conn, vorlage_id: int) -> dict:
    """Rescrape Beratungsfolge and, if missing, the PDF full text of a Vorlage.

    Step 1 scrapes the Vorlage page (``web_url``) and each linked agenda-item
    page, waiting ``DELAY_SECONDS`` before every agenda-item request, and
    upserts the results into ``beratungen``. Step 2 downloads the PDF
    (``pdf_url``) only when ``volltext_clean`` is still empty. Failures in
    either step are logged and collected instead of raised.

    Args:
        conn: Open DB connection used for all reads and writes.
        vorlage_id: ID of the Vorlage to refresh.

    Returns:
        ``{"updated_beratungen": N, "updated_volltext": bool, "errors": [...]}``
        where ``errors`` holds human-readable messages.
    """
    result: dict = {"updated_beratungen": 0, "updated_volltext": False, "errors": []}

    row = conn.execute(
        "SELECT web_url, aktenzeichen, pdf_url, volltext_clean FROM vorlagen WHERE id = ?",
        (vorlage_id,),
    ).fetchone()

    if row is None:
        result["errors"].append(f"Vorlage {vorlage_id} nicht gefunden")
        return result

    web_url = row["web_url"]
    pdf_url = row["pdf_url"]
    volltext_clean = row["volltext_clean"]

    # --- 1. Scrape Beratungsfolge ---
    if web_url:
        try:
            beratungen = _scrape_vorlage_page(web_url)
            logger.info("Vorlage %s: %d Beratungen gefunden", vorlage_id, len(beratungen))

            for b in beratungen:
                time.sleep(DELAY_SECONDS)  # rate limit between ALLRIS requests
                try:
                    to_details = _scrape_to_page(b["to_url"])
                except Exception as e:
                    result["errors"].append(f"TO {b['tolfdnr']}: {e}")
                    # None (not {}) so previously stored texts survive a
                    # transient ALLRIS failure.
                    to_details = None

                _upsert_beratung(conn, vorlage_id, b, to_details)
                result["updated_beratungen"] += 1

            conn.commit()
        except Exception as e:
            result["errors"].append(f"Beratungsfolge: {e}")
            logger.exception("Fehler beim Scrapen der Beratungsfolge für Vorlage %s", vorlage_id)

    # --- 2. PDF full text (only when none is stored yet) ---
    if pdf_url and not volltext_clean:
        try:
            time.sleep(DELAY_SECONDS)
            text = _extract_pdf_text(pdf_url)
            if text:
                conn.execute(
                    "UPDATE vorlagen SET volltext_clean = ? WHERE id = ?",
                    (text, vorlage_id),
                )
                conn.commit()
                result["updated_volltext"] = True
                logger.info("Vorlage %s: Volltext extrahiert (%d Zeichen)", vorlage_id, len(text))
        except Exception as e:
            result["errors"].append(f"PDF: {e}")
            logger.exception("Fehler bei PDF-Extraktion für Vorlage %s", vorlage_id)

    return result
|
||||
|
||||
|
||||
def rescrape_kette(conn_or_none, kette_id: int) -> dict:
    """Rescrape every Glied (member Vorlage) of a Kette.

    A dedicated DB connection is obtained via ``get_connection()`` and closed
    before returning; ``conn_or_none`` is accepted for signature compatibility
    but ignored. Members are processed one after another with a pause of
    ``DELAY_SECONDS`` between them.

    Args:
        conn_or_none: Unused.
        kette_id: ID of the Kette whose ``ketten_glieder`` are refreshed.

    Returns:
        ``{"vorlage_results": [...], "total_beratungen": N,
        "total_volltext": N, "errors": [...]}`` where each entry of
        ``vorlage_results`` is the per-Vorlage result plus its ``vorlage_id``.
    """
    own_conn = get_connection()
    try:
        glieder = own_conn.execute(
            "SELECT vorlage_id FROM ketten_glieder WHERE kette_id = ?",
            (kette_id,),
        ).fetchall()

        summary: dict = {
            "vorlage_results": [],
            "total_beratungen": 0,
            "total_volltext": 0,
            "errors": [],
        }

        for index, g in enumerate(glieder):
            if index:
                # The per-Vorlage scrape starts with a request of its own;
                # without this pause it would follow the previous Vorlage's
                # last request immediately and break the rate limit.
                time.sleep(DELAY_SECONDS)

            vid = g["vorlage_id"]
            r = _rescrape_vorlage_impl(own_conn, vid)
            summary["vorlage_results"].append({"vorlage_id": vid, **r})
            summary["total_beratungen"] += r["updated_beratungen"]
            summary["total_volltext"] += int(r["updated_volltext"])
            summary["errors"].extend(r["errors"])

        return summary
    finally:
        own_conn.close()
|
||||
@ -65,6 +65,12 @@ def _status_anfrage(
|
||||
for b in beratungen
|
||||
)
|
||||
|
||||
# Fallback: Check abstimmungen for Kenntnisnahme
|
||||
if not has_kenntnisnahme:
|
||||
abst = _get_beschluss_from_abstimmungen(conn, member_ids)
|
||||
if abst:
|
||||
has_kenntnisnahme = True
|
||||
|
||||
# Check KI-Match score for Antwort
|
||||
ki_score = _get_ki_score(conn, ursprung_id, "antwort_match")
|
||||
|
||||
@ -138,10 +144,18 @@ def _status_antrag(
|
||||
berichte = [m for m in members if m["typ"] == "bericht"]
|
||||
has_bericht = len(berichte) > 0
|
||||
|
||||
# Determine beschluss from beratungen
|
||||
# Determine beschluss from beratungen + abstimmungen
|
||||
beschluss = _get_beschluss(beratungen)
|
||||
beschluss_details = _get_beschluss_details(beratungen)
|
||||
|
||||
# Fallback: Check abstimmungen if beratungen don't show a decision
|
||||
if not beschluss or beschluss not in ("angenommen", "abgelehnt", "verwiesen"):
|
||||
abst_beschluss = _get_beschluss_from_abstimmungen(conn, member_ids)
|
||||
if abst_beschluss:
|
||||
beschluss = abst_beschluss["beschluss"]
|
||||
if not beschluss_details:
|
||||
beschluss_details = abst_beschluss["details"]
|
||||
|
||||
if beschluss == "abgelehnt":
|
||||
return {"status": "abgelehnt", "status_seit": _latest_date(beratungen), "vertagungen_count": vertagungen,
|
||||
"begruendung": f"In Beratung abgelehnt. {beschluss_details}"}
|
||||
@ -293,3 +307,31 @@ def _get_beschluss(beratungen: list[sqlite3.Row]) -> str | None:
|
||||
return "angenommen"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _get_beschluss_from_abstimmungen(conn: sqlite3.Connection, member_ids: list[int]) -> dict | None:
    """Derive a decision from the ``abstimmungen`` table.

    Looks at all votes with a non-NULL ``ergebnis`` for the given Vorlage IDs,
    newest ``sitzung_datum`` first, and returns the first one that can be
    classified by keyword (case-insensitive substring match):

    * ``abgelehnt`` -> ``"abgelehnt"``
    * ``beschlossen`` / ``angenommen`` / ``zugestimmt`` / ``kenntnis`` -> ``"angenommen"``
    * ``vertagt`` -> skipped, the next vote is examined
    * ``verwiesen`` -> ``"verwiesen"``

    Returns:
        ``{"beschluss": ..., "details": ...}`` or ``None`` when ``member_ids``
        is empty or no vote could be classified.
    """
    if not member_ids:
        return None

    marks = ",".join("?" for _ in member_ids)
    query = f"""SELECT ergebnis, sitzung_datum FROM abstimmungen
            WHERE vorlage_id IN ({marks}) AND ergebnis IS NOT NULL
            ORDER BY sitzung_datum DESC NULLS LAST"""
    votes = conn.execute(query, member_ids).fetchall()

    positive_keywords = ("beschlossen", "angenommen", "zugestimmt", "kenntnis")

    for vote in votes:
        lowered = (vote["ergebnis"] or "").lower()
        when = vote["sitzung_datum"] or "?"

        if "abgelehnt" in lowered:
            return {"beschluss": "abgelehnt", "details": f"Abstimmung: abgelehnt ({when})"}

        for keyword in positive_keywords:
            if keyword in lowered:
                return {"beschluss": "angenommen", "details": f"Abstimmung: {vote['ergebnis']} ({when})"}

        # A postponed vote never decides anything, even if it also mentions
        # a referral; move on to the next (older) vote.
        if "vertagt" not in lowered and "verwiesen" in lowered:
            return {"beschluss": "verwiesen", "details": f"Abstimmung: verwiesen ({when})"}

    return None
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
antragstracker:
|
||||
build: .
|
||||
|
||||
@ -234,7 +234,7 @@ export const reevalKette = (id: number, anmerkung: string) =>
|
||||
post<{ job_id: string; status: string }>(`/bewertung/ketten/${id}`, { anmerkung });
|
||||
|
||||
export const fetchJobStatus = (jobId: string) =>
|
||||
get<{ status: string; result?: object; error?: string }>(`/bewertung/status/${jobId}`);
|
||||
get<{ status: string; result?: object; error?: string; phase?: string }>(`/bewertung/status/${jobId}`);
|
||||
|
||||
export interface SuchVorschlag {
|
||||
id: number;
|
||||
|
||||
@ -89,6 +89,7 @@
|
||||
let showReeval = $state(false);
|
||||
let reevalAnmerkung = $state('');
|
||||
let reevalStatus = $state<'idle' | 'running' | 'done' | 'error'>('idle');
|
||||
let reevalPhase = $state<string>('');
|
||||
let reevalError = $state('');
|
||||
|
||||
async function triggerReeval() {
|
||||
@ -100,6 +101,7 @@
|
||||
for (let i = 0; i < 60; i++) {
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
const status = await fetchJobStatus(job_id);
|
||||
reevalPhase = status.phase || '';
|
||||
if (status.status === 'done') {
|
||||
reevalStatus = 'done';
|
||||
selectedVorlage = await fetchVorlage(selectedVorlage!.id);
|
||||
@ -706,7 +708,13 @@
|
||||
{#if reevalStatus === 'running'}
|
||||
<span class="inline-flex items-center gap-2">
|
||||
<span class="animate-spin h-4 w-4 border-2 border-white border-t-transparent rounded-full"></span>
|
||||
{#if reevalPhase === 'rescrape'}
|
||||
📡 Daten aktualisieren…
|
||||
{:else if reevalPhase === 'ki_bewertung'}
|
||||
🤖 KI bewertet…
|
||||
{:else}
|
||||
KI bewertet…
|
||||
{/if}
|
||||
</span>
|
||||
{:else}
|
||||
Bewertung starten
|
||||
|
||||
Loading…
Reference in New Issue
Block a user