feat: ALLRIS-Rescrape vor KI-Neubewertung (#10)
- backend/src/tracker/core/rescrape.py: Eigenständiges Rescrape-Modul
  - rescrape_vorlage(): Beratungsfolge + Beschlusstexte + PDF-Volltext
  - rescrape_kette(): Alle Glieder einer Kette rescrapen
  - Eigene DB-Connection (thread-safe), Rate-Limiting 1s, Fehlertoleranz
- bewertung.py: Rescrape-Phase vor KI-Call in beiden Workflows
  - _run_zusammenfassung(): rescrape_vorlage() → frische Daten → KI
  - _run_ketten_bewertung(): rescrape_kette() → frische Daten → KI
  - Status-Endpoint liefert 'phase'-Feld mit
- Frontend: Phase-aware Status-Anzeige im Explorer
  - rescrape → '📡 Daten aktualisieren...'
  - ki_bewertung → '🤖 KI bewertet...'
This commit is contained in:
parent
1c35e494d0
commit
8d9706c211
@ -12,6 +12,9 @@ from fastapi import APIRouter, Depends, HTTPException
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from tracker.db.session import get_connection
|
from tracker.db.session import get_connection
|
||||||
|
|
||||||
|
import logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
# Note: Re-evaluation should re-scrape ALLRIS data before KI evaluation
|
# Note: Re-evaluation should re-scrape ALLRIS data before KI evaluation
|
||||||
# to exclude transfer errors. See Gitea issue for full spec.
|
# to exclude transfer errors. See Gitea issue for full spec.
|
||||||
# IMPORTANT: Destructive changes (deleting old bewertungen etc.) only after
|
# IMPORTANT: Destructive changes (deleting old bewertungen etc.) only after
|
||||||
@ -182,6 +185,16 @@ def _run_zusammenfassung(vorlage_id: int, anmerkung: str, job_id: str):
|
|||||||
try:
|
try:
|
||||||
conn = get_connection()
|
conn = get_connection()
|
||||||
|
|
||||||
|
# --- Rescrape ALLRIS data before KI evaluation ---
|
||||||
|
_jobs[job_id]["phase"] = "rescrape"
|
||||||
|
try:
|
||||||
|
from tracker.core.rescrape import rescrape_vorlage
|
||||||
|
rescrape_vorlage(conn, vorlage_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Rescrape failed for vorlage %s: %s", vorlage_id, e)
|
||||||
|
_jobs[job_id]["phase"] = "ki_bewertung"
|
||||||
|
|
||||||
|
# Reload fresh data after rescrape
|
||||||
row = conn.execute("SELECT volltext_clean, aktenzeichen FROM vorlagen WHERE id = ?", (vorlage_id,)).fetchone()
|
row = conn.execute("SELECT volltext_clean, aktenzeichen FROM vorlagen WHERE id = ?", (vorlage_id,)).fetchone()
|
||||||
if not row or not row["volltext_clean"]:
|
if not row or not row["volltext_clean"]:
|
||||||
_jobs[job_id] = {"status": "error", "error": "Kein Volltext vorhanden"}
|
_jobs[job_id] = {"status": "error", "error": "Kein Volltext vorhanden"}
|
||||||
@ -256,6 +269,15 @@ def _run_ketten_bewertung(kette_id: int, anmerkung: str, job_id: str):
|
|||||||
try:
|
try:
|
||||||
conn = get_connection()
|
conn = get_connection()
|
||||||
|
|
||||||
|
# --- Rescrape all Glieder before KI evaluation ---
|
||||||
|
_jobs[job_id]["phase"] = "rescrape"
|
||||||
|
try:
|
||||||
|
from tracker.core.rescrape import rescrape_kette
|
||||||
|
rescrape_kette(conn, kette_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Rescrape failed for kette %s: %s", kette_id, e)
|
||||||
|
_jobs[job_id]["phase"] = "ki_bewertung"
|
||||||
|
|
||||||
# Get kette + ursprung
|
# Get kette + ursprung
|
||||||
kette = conn.execute(
|
kette = conn.execute(
|
||||||
"SELECT k.*, v.aktenzeichen, v.volltext_clean FROM ketten k JOIN vorlagen v ON k.ursprung_id = v.id WHERE k.id = ?",
|
"SELECT k.*, v.aktenzeichen, v.volltext_clean FROM ketten k JOIN vorlagen v ON k.ursprung_id = v.id WHERE k.id = ?",
|
||||||
@ -426,4 +448,5 @@ def get_job_status(job_id: str):
|
|||||||
"""Check status of a re-evaluation job."""
|
"""Check status of a re-evaluation job."""
|
||||||
if job_id not in _jobs:
|
if job_id not in _jobs:
|
||||||
raise HTTPException(status_code=404, detail="Job nicht gefunden")
|
raise HTTPException(status_code=404, detail="Job nicht gefunden")
|
||||||
return _jobs[job_id]
|
job = _jobs[job_id]
|
||||||
|
return {**job, "phase": job.get("phase", "")}
|
||||||
|
|||||||
274
backend/src/tracker/core/rescrape.py
Normal file
274
backend/src/tracker/core/rescrape.py
Normal file
@ -0,0 +1,274 @@
|
|||||||
|
"""
|
||||||
|
ALLRIS Rescrape-Modul.
|
||||||
|
|
||||||
|
Scrapet Beratungsfolge, Beschlusstexte und PDF-Volltext für eine Vorlage
|
||||||
|
oder alle Glieder einer Kette. Eigenständig importierbar aus dem Backend.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from tracker.db.session import get_connection
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
ALLRIS_BASE = "https://allris.hagen.de"
|
||||||
|
DELAY_SECONDS = 1.0
|
||||||
|
HTTP_TIMEOUT = 30
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# HTTP helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _get(url: str) -> httpx.Response:
    """Issue a GET request for ``url`` and hand back the response.

    Redirects are followed and the request is bounded by ``HTTP_TIMEOUT``.
    The HTTP status is not checked here; callers do that themselves.
    """
    response = httpx.get(url, follow_redirects=True, timeout=HTTP_TIMEOUT)
    return response
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Scraping functions (adapted from scripts/scrape_beratungsfolge.py)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _scrape_vorlage_page(url: str) -> list[dict]:
    """Scrape the Beratungsfolge (consultation sequence) from an ALLRIS Vorlage page.

    Every ``<a>`` whose href contains ``to020`` and a numeric ``TOLFDNR=``
    parameter yields one entry.

    Args:
        url: Address of the Vorlage page.

    Returns:
        One dict per matching link, in document order, with the keys
        ``tolfdnr`` (the digits, as str), ``beschlussart`` (the link text),
        ``sitzung_name`` (text of the closest preceding link whose href
        matches ``to010.*SILFDNR=``, or None) and ``to_url`` (absolute URL
        of the TO page).

    Raises:
        httpx.HTTPStatusError: If the page answers with a 4xx/5xx status.
    """
    # Function-scope import so the block does not depend on the module's
    # top-level import section.
    from urllib.parse import urljoin

    resp = _get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Relative links must be resolved against the page that was actually
    # served (after redirects). Plain string concatenation with the site root
    # produced broken URLs for hrefs without a leading slash.
    page_url = str(resp.url)

    # Loop-invariant pattern, compiled once.
    sitzung_href = re.compile(r"to010.*SILFDNR=")

    beratungen: list[dict] = []

    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "to020" not in href:
            continue

        # The regex also covers the "TOLFDNR= missing" case.
        tolfdnr_match = re.search(r"TOLFDNR=(\d+)", href)
        if not tolfdnr_match:
            continue

        # Session info comes from the nearest preceding session link, if any.
        prev = link.find_previous("a", href=sitzung_href)
        sitzung_name = prev.get_text(strip=True) if prev else None

        # Absolute hrefs are kept verbatim; everything else is resolved.
        to_url = href if href.startswith("http") else urljoin(page_url, href)

        beratungen.append({
            "tolfdnr": tolfdnr_match.group(1),
            "beschlussart": link.get_text(strip=True),
            "sitzung_name": sitzung_name,
            "to_url": to_url,
        })

    return beratungen
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_to_page(url: str) -> dict:
    """Scrape resolution text and verbatim minutes from a TO (agenda item) page.

    Returns a dict with the keys ``beschlusstext``, ``wortprotokoll`` and
    ``sitzung_datum``; each value is None when nothing was found. The date is
    the first ``DD.MM.YYYY`` match in the ``<h1 class="title">`` element. The
    texts come from ``<span>`` elements whose style mentions an Arial font
    family: the last non-empty one is the resolution text, all earlier ones
    (joined by blank lines) form the minutes.
    """
    response = _get(url)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Date from the page title
    sitzung_datum = None
    heading = page.find("h1", class_="title")
    if heading:
        found = re.search(r"(\d{2}\.\d{2}\.\d{4})", heading.get_text())
        if found:
            sitzung_datum = found.group(1)

    # Texts live in <span style="font-family:...Arial...">
    arial_spans = page.find_all("span", style=re.compile(r"font-family.*Arial"))
    stripped = (span.get_text(strip=True) for span in arial_spans)
    passages = [passage for passage in stripped if passage]

    beschlusstext = None
    wortprotokoll = None
    if passages:
        *earlier, beschlusstext = passages
        if earlier:
            wortprotokoll = "\n\n".join(earlier)

    return {
        "beschlusstext": beschlusstext,
        "wortprotokoll": wortprotokoll,
        "sitzung_datum": sitzung_datum,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pdf_text(url: str) -> str | None:
    """Download a PDF and return its plain text extracted with PyMuPDF.

    Args:
        url: Address of the PDF document.

    Returns:
        The stripped text of all pages joined by newlines, or None when
        pymupdf is not installed, the download is shorter than 100 bytes,
        or fewer than 50 characters of text were extracted.

    Raises:
        httpx.HTTPStatusError: If the download answers with a 4xx/5xx status.
    """
    try:
        import pymupdf
    except ImportError:
        logger.warning("pymupdf not installed, skipping PDF extraction")
        return None

    resp = httpx.get(url, timeout=60, follow_redirects=True)
    resp.raise_for_status()

    # Responses this small are treated as "no document".
    if len(resp.content) < 100:
        return None

    # Parse directly from memory instead of round-tripping through a temp
    # file: reopening a still-open NamedTemporaryFile by name is not portable,
    # and the bytes are already fully loaded anyway.
    doc = pymupdf.open(stream=resp.content, filetype="pdf")
    try:
        parts = [page.get_text() for page in doc]
    finally:
        # Release the document even if text extraction fails on some page.
        doc.close()

    text = "\n".join(parts).strip()
    return text if len(text) >= 50 else None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def rescrape_vorlage(conn_or_none, vorlage_id: int) -> dict:
    """
    Refresh the ALLRIS data of one Vorlage.

    ``conn_or_none`` exists only for signature compatibility and is never
    used: the function obtains its own connection via ``get_connection()``
    and closes it again before returning, so callers on other threads do not
    share a connection with it.

    Returns: {"updated_beratungen": N, "updated_volltext": bool, "errors": [...]}
    """
    connection = get_connection()
    try:
        outcome = _rescrape_vorlage_impl(connection, vorlage_id)
    finally:
        connection.close()
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
def _rescrape_vorlage_impl(conn, vorlage_id: int) -> dict:
    """Rescrape Beratungsfolge and (if missing) PDF full text for one Vorlage.

    Args:
        conn: Open DB connection; presumably sqlite3-style (``?`` placeholders,
            rows addressable by column name) - confirm against
            ``tracker.db.session``. It is committed to but not closed here.
        vorlage_id: Primary key in ``vorlagen``.

    Returns:
        ``{"updated_beratungen": int, "updated_volltext": bool, "errors": [str]}``.
        Failures are collected in ``errors`` instead of being raised.
    """
    result = {"updated_beratungen": 0, "updated_volltext": False, "errors": []}

    row = conn.execute(
        "SELECT web_url, aktenzeichen, pdf_url, volltext_clean FROM vorlagen WHERE id = ?",
        (vorlage_id,),
    ).fetchone()

    if not row:
        result["errors"].append(f"Vorlage {vorlage_id} nicht gefunden")
        return result

    web_url = row["web_url"]
    pdf_url = row["pdf_url"]
    volltext_clean = row["volltext_clean"]

    # --- 1. Scrape the Beratungsfolge ---
    if web_url:
        try:
            beratungen = _scrape_vorlage_page(web_url)
            logger.info("Vorlage %s: %d Beratungen gefunden", vorlage_id, len(beratungen))

            for b in beratungen:
                time.sleep(DELAY_SECONDS)  # rate limiting between requests
                try:
                    to_details = _scrape_to_page(b["to_url"])
                except Exception as e:
                    result["errors"].append(f"TO {b['tolfdnr']}: {e}")
                    to_details = None

                # Upsert: try update first, then insert
                if to_details is None:
                    # The TO page could not be fetched. Refresh only the link
                    # metadata so a transient failure does not wipe texts that
                    # an earlier scrape already stored.
                    to_details = {}
                    cur = conn.execute(
                        """UPDATE beratungen
                           SET to_url = ?, beschlussart = ?
                           WHERE vorlage_id = ? AND tolfdnr = ?""",
                        (
                            b["to_url"],
                            b["beschlussart"],
                            vorlage_id,
                            b["tolfdnr"],
                        ),
                    )
                else:
                    cur = conn.execute(
                        """UPDATE beratungen
                           SET to_url = ?, beschlussart = ?,
                               beschlusstext = ?, wortprotokoll = ?,
                               scraped_at = CURRENT_TIMESTAMP
                           WHERE vorlage_id = ? AND tolfdnr = ?""",
                        (
                            b["to_url"],
                            b["beschlussart"],
                            to_details.get("beschlusstext"),
                            to_details.get("wortprotokoll"),
                            vorlage_id,
                            b["tolfdnr"],
                        ),
                    )

                if cur.rowcount == 0:
                    # No existing row for this (vorlage_id, tolfdnr): create it.
                    conn.execute(
                        """INSERT INTO beratungen
                           (vorlage_id, to_url, tolfdnr, beschlussart,
                            beschlusstext, wortprotokoll, scraped_at)
                           VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)""",
                        (
                            vorlage_id,
                            b["to_url"],
                            b["tolfdnr"],
                            b["beschlussart"],
                            to_details.get("beschlusstext"),
                            to_details.get("wortprotokoll"),
                        ),
                    )
                result["updated_beratungen"] += 1

            conn.commit()
        except Exception as e:
            result["errors"].append(f"Beratungsfolge: {e}")
            logger.exception("Fehler beim Scrapen der Beratungsfolge für Vorlage %s", vorlage_id)

    # --- 2. PDF full text ---
    # Only fetched when no full text is stored yet; existing text is kept.
    if pdf_url and not volltext_clean:
        try:
            time.sleep(DELAY_SECONDS)
            text = _extract_pdf_text(pdf_url)
            if text:
                conn.execute(
                    "UPDATE vorlagen SET volltext_clean = ? WHERE id = ?",
                    (text, vorlage_id),
                )
                conn.commit()
                result["updated_volltext"] = True
                logger.info("Vorlage %s: Volltext extrahiert (%d Zeichen)", vorlage_id, len(text))
        except Exception as e:
            result["errors"].append(f"PDF: {e}")
            logger.exception("Fehler bei PDF-Extraktion für Vorlage %s", vorlage_id)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def rescrape_kette(conn_or_none, kette_id: int) -> dict:
    """
    Refresh the ALLRIS data of every Glied (member) of one Kette.

    ``conn_or_none`` is not used; the function obtains its own connection via
    ``get_connection()``, uses it for all members, and closes it at the end.

    Returns: {"vorlage_results": [...], "total_beratungen": N, "total_volltext": N, "errors": [...]}
    """
    connection = get_connection()
    try:
        member_rows = connection.execute(
            "SELECT vorlage_id FROM ketten_glieder WHERE kette_id = ?",
            (kette_id,),
        ).fetchall()

        vorlage_results = []
        beratungen_count = 0
        volltext_count = 0
        collected_errors = []

        for member in member_rows:
            member_id = member["vorlage_id"]
            outcome = _rescrape_vorlage_impl(connection, member_id)
            vorlage_results.append({"vorlage_id": member_id, **outcome})
            beratungen_count += outcome["updated_beratungen"]
            volltext_count += int(outcome["updated_volltext"])
            collected_errors.extend(outcome["errors"])

        return {
            "vorlage_results": vorlage_results,
            "total_beratungen": beratungen_count,
            "total_volltext": volltext_count,
            "errors": collected_errors,
        }
    finally:
        connection.close()
|
||||||
@ -234,7 +234,7 @@ export const reevalKette = (id: number, anmerkung: string) =>
|
|||||||
post<{ job_id: string; status: string }>(`/bewertung/ketten/${id}`, { anmerkung });
|
post<{ job_id: string; status: string }>(`/bewertung/ketten/${id}`, { anmerkung });
|
||||||
|
|
||||||
export const fetchJobStatus = (jobId: string) =>
|
export const fetchJobStatus = (jobId: string) =>
|
||||||
get<{ status: string; result?: object; error?: string }>(`/bewertung/status/${jobId}`);
|
get<{ status: string; result?: object; error?: string; phase?: string }>(`/bewertung/status/${jobId}`);
|
||||||
|
|
||||||
export interface SuchVorschlag {
|
export interface SuchVorschlag {
|
||||||
id: number;
|
id: number;
|
||||||
|
|||||||
@ -89,6 +89,7 @@
|
|||||||
let showReeval = $state(false);
|
let showReeval = $state(false);
|
||||||
let reevalAnmerkung = $state('');
|
let reevalAnmerkung = $state('');
|
||||||
let reevalStatus = $state<'idle' | 'running' | 'done' | 'error'>('idle');
|
let reevalStatus = $state<'idle' | 'running' | 'done' | 'error'>('idle');
|
||||||
|
let reevalPhase = $state<string>('');
|
||||||
let reevalError = $state('');
|
let reevalError = $state('');
|
||||||
|
|
||||||
async function triggerReeval() {
|
async function triggerReeval() {
|
||||||
@ -100,6 +101,7 @@
|
|||||||
for (let i = 0; i < 60; i++) {
|
for (let i = 0; i < 60; i++) {
|
||||||
await new Promise(r => setTimeout(r, 3000));
|
await new Promise(r => setTimeout(r, 3000));
|
||||||
const status = await fetchJobStatus(job_id);
|
const status = await fetchJobStatus(job_id);
|
||||||
|
reevalPhase = status.phase || '';
|
||||||
if (status.status === 'done') {
|
if (status.status === 'done') {
|
||||||
reevalStatus = 'done';
|
reevalStatus = 'done';
|
||||||
selectedVorlage = await fetchVorlage(selectedVorlage!.id);
|
selectedVorlage = await fetchVorlage(selectedVorlage!.id);
|
||||||
@ -706,7 +708,13 @@
|
|||||||
{#if reevalStatus === 'running'}
|
{#if reevalStatus === 'running'}
|
||||||
<span class="inline-flex items-center gap-2">
|
<span class="inline-flex items-center gap-2">
|
||||||
<span class="animate-spin h-4 w-4 border-2 border-white border-t-transparent rounded-full"></span>
|
<span class="animate-spin h-4 w-4 border-2 border-white border-t-transparent rounded-full"></span>
|
||||||
KI bewertet…
|
{#if reevalPhase === 'rescrape'}
|
||||||
|
📡 Daten aktualisieren…
|
||||||
|
{:else if reevalPhase === 'ki_bewertung'}
|
||||||
|
🤖 KI bewertet…
|
||||||
|
{:else}
|
||||||
|
KI bewertet…
|
||||||
|
{/if}
|
||||||
</span>
|
</span>
|
||||||
{:else}
|
{:else}
|
||||||
Bewertung starten
|
Bewertung starten
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user