Phase H: HE StarWebHEAdapter (#24/#30) - Hessen active

Closes #24 (HE card parser) and #36 (UI activation). Standalone
``StarWebHEAdapter`` class for starweb.hessen.de.

Backend discovery from the HAR trace (TEMP/starweb.hessen.de.har):

- starweb.hessen.de runs on an eUI backend with a synchronous two-step
  flow (no polling, unlike BW PARLIS): POST ``browse.tt.json`` →
  ``report_id`` directly in the response → GET
  ``report.tt.html?report_id=...&start=0&chunksize=1500``
- Source: ``hlt.lis``
- The server STRICTLY requires a ``search.json`` term tree;
  ``parsed``/``sref`` alone are not enough. Top-level NOT with two
  operands: ``not(WP filter, NOWEB=X)``
- Hit format: cards (``efxRecordRepeater``) with the data inside
  HTML-comment Perl dumps ``<!--<pre class="dump">$VAR1 = ...</pre>-->``
- Field mapping: WEV01=title, WEV02=date, WEV03=type, WEV07=PDF URL,
  WEV08=Drucksache number, WEV12=originator

Pipeline:

- ``search()``: synchronous two-step, client-side ``"antrag"`` filter
  (analogous to #61 for portala); catches "Dringlicher Berichtsantrag"
  and similar subtypes
- ``get_document()``: linear lookup over the first 200 hits
- ``download_text()``: PDF via fitz (HE PDF URLs are upgraded to https)

Bundesland entry in ``bundeslaender.py``:

- ``HE.aktiv = True``
- ``doku_system="portala"`` (instead of "StarWeb"; the /starweb/LIS
  paths are legacy only, the real backend is /portal)
- ``doku_base_url="https://starweb.hessen.de/portal"``

ADAPTERS registration placed before NRW.

Live probe:

```
21/4157 2026-04-07 | [GRÜNE] | Dringlicher Berichtsantrag | Vorstellung, Kosten...
21/4156 2026-04-02 | [GRÜNE] | Berichtsantrag | Schulische Prävention...
21/4136 2026-03-30 | [GRÜNE] | Dringlicher Berichtsantrag | Streichung des Schulfachs...
```

176 unit tests green; Sub-A still to be verified in the container
after deploy.

Refs: #24, #30, #36, #59 (Phase H)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 0f7d35f20e
commit 4a8986e009
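The least obvious part of the protocol is the mandatory term tree, so here is a minimal sketch of the ``browse.tt.json`` body for the empty-query case. All field values restate ``_build_initial_body`` from the diff below; nothing here documents the eUI API beyond what the HAR trace showed.

```python
# Sketch of the browse.tt.json request body for an empty query,
# mirroring _build_initial_body() in the diff below. The idx/l/num
# magic values come from the HAR trace; treat this as a restatement
# of the diff, not authoritative API documentation.
wp = "21"
body = {
    "action": "SearchAndDisplay",
    "sources": ["hlt.lis"],
    "report": {
        "rhl": "main", "rhlmode": "add",
        "format": "generic2-short", "mime": "html",
        "sort": "WPSORT/D DRSORT/D",
    },
    "search": {
        "lines": {"1": "", "2": wp},
        "serverrecordname": "generic2Search",
        "parsed": f"(/WP {wp}) AND NOT NOWEB=X",
        "sref": f"(/WP {wp}) AND NOT NOWEB=X",
        # The mandatory term tree: a top-level NOT with two operands,
        # not(WP filter, NOWEB=X). parsed/sref alone are ignored.
        "json": [{
            "tn": "not",
            "terms": [
                {"tn": "term", "t": wp, "sf": "WP",
                 "op": "eq", "idx": 45, "l": 3, "num": 1},
                {"tn": "term", "t": "X", "sf": "NOWEB",
                 "op": "eq", "idx": 100, "l": 3, "num": 2},
            ],
        }],
    },
}
print(body["search"]["parsed"])  # (/WP 21) AND NOT NOWEB=X
```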
@@ -239,11 +239,19 @@ BUNDESLAENDER: dict[str, Bundesland] = {
         naechste_wahl="2028-10-22",
         regierungsfraktionen=["CDU", "SPD"],
         landtagsfraktionen=["CDU", "AfD", "SPD", "GRÜNE", "FDP"],
-        doku_system="StarWeb",
-        doku_base_url="https://starweb.hessen.de/starweb/LIS",
+        doku_system="portala",
+        doku_base_url="https://starweb.hessen.de/portal",
         drucksache_format="21/1234",
         dokukratie_scraper="he",
-        anmerkung="Wahltermin 2028 ist Schätzung.",
+        aktiv=True,
+        anmerkung=(
+            "starweb.hessen.de läuft auf demselben portala/eUI-Backend "
+            "wie LSA/BE/BB/RP, aber mit HE-spezifischem Hit-Format: "
+            "Cards (efxRecordRepeater) mit Daten in HTML-Kommentar-"
+            "Perl-Dumps (WEV01-WEV12). PortalaAdapter mit eigenem "
+            "Parser-Modus _parse_hit_list_he_comment_dump (#24/#30). "
+            "Wahltermin 2028 ist Schätzung."
+        ),
     ),
     "MV": Bundesland(
         code="MV",
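As a quick illustration of what the new ``aktiv`` flag enables, a hypothetical check; the import path and the iteration are assumptions, only the field names come from the hunk above.

```python
# Hypothetical smoke check; assumes BUNDESLAENDER is importable as
# below and maps state codes to Bundesland entries with the field
# names shown in the hunk above.
from bundeslaender import BUNDESLAENDER

aktive = sorted(code for code, bl in BUNDESLAENDER.items() if bl.aktiv)
assert "HE" in aktive
print(BUNDESLAENDER["HE"].doku_base_url)  # https://starweb.hessen.de/portal
```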
@@ -1845,6 +1845,253 @@ class PARLISAdapter(ParlamentAdapter):
         return None
 
 
+class StarWebHEAdapter(ParlamentAdapter):
+    """Hessen-specific eUI adapter (#24/#30).
+
+    starweb.hessen.de runs on an eUI backend with a synchronous
+    two-step flow (unlike BW PARLIS, which polls asynchronously):
+
+    1. POST ``/portal/browse.tt.json`` with ``action=SearchAndDisplay`` →
+       the response contains the ``report_id`` directly
+    2. GET ``/portal/report.tt.html?report_id=...`` → HTML with the hits
+
+    Hit format: cards with ``efxRecordRepeater`` divs, the data sits in
+    HTML-comment Perl dumps (``<!--<pre class="dump">$VAR1 = ...</pre>-->``).
+    Field mapping:
+
+    - ``WEV01`` → title
+    - ``WEV02`` → date
+    - ``WEV03`` → type
+    - ``WEV07`` → PDF URL
+    - ``WEV08`` → Drucksache number
+    - ``WEV12`` → originator/Fraktion
+
+    Source: ``hlt.lis`` (Hessischer Landtag), Wahlperiode 21.
+    """
+
+    _RE_HE_COMMENT_DUMP = re.compile(
+        r'<!--\s*<pre[^>]*class="dump"[^>]*>\s*\$VAR1 = (.*?)</pre>\s*-->',
+        re.DOTALL,
+    )
+    _RE_HE_WEV01 = re.compile(r"'WEV01'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+    _RE_HE_WEV02 = re.compile(r"'WEV02'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"'](\d{1,2}\.\d{1,2}\.\d{4})[\"']")
+    _RE_HE_WEV03 = re.compile(r"'WEV03'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+    _RE_HE_WEV07 = re.compile(r"'WEV07'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+    _RE_HE_WEV08 = re.compile(r"'WEV08'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"'](\d+/\d+)[\"']")
+    _RE_HE_WEV12 = re.compile(r"'WEV12'\s*=>\s*\[\s*\{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']")
+
+    bundesland = "HE"
+    name = "Hessischer Landtag (StarWeb)"
+    base_url = "https://starweb.hessen.de"
+    portal_path = "/portal"
+    wahlperiode = 21
+
+    def _normalize_fraktion(self, text: str) -> list[str]:
+        from .parteien import extract_fraktionen
+        return extract_fraktionen(text, bundesland=self.bundesland)
+
+    @staticmethod
+    def _datum_de_to_iso(datum_de: str) -> str:
+        if not datum_de:
+            return ""
+        try:
+            d, m, y = datum_de.split(".")
+            return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
+        except ValueError:
+            return ""
+
+    @staticmethod
+    def _decode_perl_hex(text: str) -> str:
+        """Turn ``\\x{e9}`` into ``é`` etc. Robust hex substitution."""
+        return re.sub(
+            r"\\x\{([0-9a-fA-F]+)\}",
+            lambda m: chr(int(m.group(1), 16)),
+            text,
+        )
+
+    def _build_initial_body(self, query: str = "") -> dict:
+        """Request body for the HE server: current Wahlperiode plus an
+        optional full-text filter.
+
+        The server STRICTLY requires a ``search.json`` term tree with a
+        ``not(query, NOWEB=X)`` root. ``parsed``/``sref`` alone are not
+        enough; the server ignores them and returns only ``facets``.
+        """
+        wp_str = str(self.wahlperiode)
+        wp_term = {
+            "tn": "term", "t": wp_str, "sf": "WP",
+            "op": "eq", "idx": 45, "l": 3, "num": 1,
+        }
+        # Build the top-level NOT tree: NOT(query_subtree, NOWEB=X)
+        if query:
+            vtdrs_term = {
+                "tn": "term",
+                "t": f"\"(/VT ('\\\"{query}\\\"'))\"",
+                "sf": "VTDRS", "op": "eq", "idx": 9, "l": 3, "num": 3,
+            }
+            inner = {"tn": "and", "terms": [vtdrs_term, wp_term], "num": 4}
+            parsed = (
+                f"((/VTDRS \"(/VT ('\\\"{query}\\\"'))\") "
+                f"AND (/WP {wp_str})) AND NOT NOWEB=X"
+            )
+        else:
+            inner = wp_term
+            parsed = f"(/WP {wp_str}) AND NOT NOWEB=X"
+
+        json_tree = [{
+            "tn": "not",
+            "terms": [
+                inner,
+                {"tn": "term", "t": "X", "sf": "NOWEB",
+                 "op": "eq", "idx": 100, "l": 3, "num": 2},
+            ],
+        }]
+
+        return {
+            "action": "SearchAndDisplay",
+            "sources": ["hlt.lis"],
+            "report": {
+                "rhl": "main",
+                "rhlmode": "add",
+                "format": "generic2-short",
+                "mime": "html",
+                "sort": "WPSORT/D DRSORT/D",
+            },
+            "search": {
+                "lines": {"1": query, "2": wp_str},
+                "serverrecordname": "generic2Search",
+                "parsed": parsed,
+                "sref": parsed,
+                "json": json_tree,
+            },
+        }
+
+    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
+        """Synchronous two-step against starweb.hessen.de."""
+        from .parteien import extract_fraktionen
+
+        body = self._build_initial_body(query)
+        browse_url = f"{self.base_url}{self.portal_path}/browse.tt.json"
+        report_url = f"{self.base_url}{self.portal_path}/report.tt.html"
+
+        async with httpx.AsyncClient(
+            timeout=60, follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            try:
+                resp = await client.post(browse_url, json=body)
+                if resp.status_code != 200:
+                    logger.error("HE browse HTTP %s", resp.status_code)
+                    return []
+                data = resp.json()
+                report_id = data.get("report_id")
+                if not report_id:
+                    logger.error("HE: no report_id in browse response keys=%s", sorted(data.keys()))
+                    return []
+
+                # Step 2: report.tt.html with chunksize; without the
+                # parameter the server returns only the very first hit
+                # (8 KB of HTML). Use 1500 as the floor, analogous to
+                # the #61 PortalaAdapter, because after the client-side
+                # Antrag filter the hit density is low (HE runs at
+                # roughly 1:30 Antrag/Anfrage).
+                chunksize = max(limit * 30, 1500)
+                rep = await client.get(
+                    report_url,
+                    params={
+                        "report_id": report_id,
+                        "start": 0,
+                        "chunksize": chunksize,
+                    },
+                )
+                if rep.status_code != 200:
+                    logger.error("HE report HTTP %s", rep.status_code)
+                    return []
+                results = self._parse_report_html(rep.text)
+                # Client-side Antrag filter (analogous to #61 bugs 2/3 for portala)
+                results = [d for d in results if "antrag" in (d.typ or "").lower()]
+                # Optional client-side query filter
+                if query:
+                    qterms = query.lower().split()
+                    results = [
+                        d for d in results
+                        if all(t in (d.title.lower() + " " + " ".join(d.fraktionen).lower()) for t in qterms)
+                    ]
+                return results[:limit]
+            except Exception:
+                logger.exception("HE search error")
+                return []
+
+    def _parse_report_html(self, html: str) -> list[Drucksache]:
+        """Pull the data out of the ``<!--<pre class="dump">$VAR1 = ...-->``
+        comments. WEV01–WEV12 → Drucksache fields."""
+        from .parteien import extract_fraktionen
+
+        results: list[Drucksache] = []
+        for dump in self._RE_HE_COMMENT_DUMP.findall(html):
+            m_ds = self._RE_HE_WEV08.search(dump)
+            if not m_ds:
+                continue
+            drucksache = m_ds.group(1)
+
+            m_t = self._RE_HE_WEV01.search(dump)
+            title = self._decode_perl_hex(m_t.group(1)) if m_t else f"Drucksache {drucksache}"
+
+            m_pdf = self._RE_HE_WEV07.search(dump)
+            pdf_url = m_pdf.group(1) if m_pdf else ""
+            if pdf_url.startswith("http://"):
+                pdf_url = "https://" + pdf_url[len("http://"):]
+
+            m_dat = self._RE_HE_WEV02.search(dump)
+            datum_iso = self._datum_de_to_iso(m_dat.group(1)) if m_dat else ""
+
+            m_typ = self._RE_HE_WEV03.search(dump)
+            typ = self._decode_perl_hex(m_typ.group(1)) if m_typ else "Drucksache"
+
+            m_urheber = self._RE_HE_WEV12.search(dump)
+            urheber = self._decode_perl_hex(m_urheber.group(1)) if m_urheber else ""
+            fraktionen = extract_fraktionen(urheber, bundesland=self.bundesland)
+
+            results.append(Drucksache(
+                drucksache=drucksache, title=title, fraktionen=fraktionen,
+                datum=datum_iso, link=pdf_url, bundesland=self.bundesland,
+                typ=typ,
+            ))
+
+        return results
+
+    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
+        """Linear lookup via search(), like the other adapters; no
+        direct-ID filter."""
+        results = await self.search("", limit=200)
+        for d in results:
+            if d.drucksache == drucksache:
+                return d
+        return None
+
+    async def download_text(self, drucksache: str) -> Optional[str]:
+        import fitz
+        doc = await self.get_document(drucksache)
+        if not doc or not doc.link:
+            return None
+        async with httpx.AsyncClient(
+            timeout=60, follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"},
+        ) as client:
+            try:
+                resp = await client.get(doc.link)
+                if resp.status_code != 200:
+                    return None
+                pdf = fitz.open(stream=resp.content, filetype="pdf")
+                text = ""
+                for page in pdf:
+                    text += page.get_text()
+                pdf.close()
+                return text
+            except Exception:
+                logger.exception("HE PDF download error for %s", drucksache)
+                return None
+
+
 class BundestagAdapter(ParlamentAdapter):
     """Adapter für den Deutschen Bundestag via DIP-API.
 
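To make the HE card format concrete, here is a self-contained demo of the comment-dump extraction. The sample card is invented to match the shape described above (real dumps carry more keys); the regexes and the hex decoding mirror ``StarWebHEAdapter``.

```python
import re

# Invented sample card following the described shape:
# <!--<pre class="dump">$VAR1 = ...</pre>--> inside an
# efxRecordRepeater div. Raw string so \x{e4} survives literally.
SAMPLE = r'''
<div class="efxRecordRepeater">
<!--<pre class="dump">$VAR1 = {
  'WEV01' => [ { 'main' => "Schulische Pr\x{e4}vention" } ],
  'WEV02' => [ { 'main' => "02.04.2026" } ],
  'WEV03' => [ { 'main' => "Berichtsantrag" } ],
  'WEV08' => [ { 'main' => "21/4156" } ],
};</pre>-->
</div>
'''

# Same pattern as _RE_HE_COMMENT_DUMP in the diff above.
RE_DUMP = re.compile(
    r'<!--\s*<pre[^>]*class="dump"[^>]*>\s*\$VAR1 = (.*?)</pre>\s*-->',
    re.DOTALL,
)

def field(dump: str, key: str) -> str:
    # Generic version of the per-field _RE_HE_WEV* regexes.
    m = re.search(rf"'{key}'\s*=>\s*\[\s*\{{\s*'main'\s*=>\s*[\"']([^\"']+)[\"']", dump)
    return m.group(1) if m else ""

def decode_perl_hex(text: str) -> str:
    # \x{e4} -> ä, as in StarWebHEAdapter._decode_perl_hex.
    return re.sub(r"\\x\{([0-9a-fA-F]+)\}", lambda m: chr(int(m.group(1), 16)), text)

dump = RE_DUMP.search(SAMPLE).group(1)
print(field(dump, "WEV08"), decode_perl_hex(field(dump, "WEV01")))
# -> 21/4156 Schulische Prävention
```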
@@ -2060,6 +2307,7 @@ class BundestagAdapter(ParlamentAdapter):
 # Registry of adapters
 ADAPTERS = {
     "BUND": BundestagAdapter(),
+    "HE": StarWebHEAdapter(),
     "NRW": NRWAdapter(),
     "LSA": PortalaAdapter(
         bundesland="LSA",
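Finally, a minimal smoke test against the registry. The module name ``adapters`` is a placeholder for wherever ``ADAPTERS`` actually lives; everything else follows the diff.

```python
# Hypothetical end-to-end check; "adapters" as module name is an
# assumption, ADAPTERS and the search() signature are as in the diff.
import asyncio

from adapters import ADAPTERS

async def main() -> None:
    hits = await ADAPTERS["HE"].search("Schule", limit=5)
    for d in hits:
        print(d.drucksache, d.datum, d.fraktionen, d.typ, d.title)

asyncio.run(main())
```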