#22 NI-Adapter: PortalaAdapter mit JSON-in-Comment-Parsing
Niedersachsen (NILAS) nutzt denselben portala/eUI-Stack wie LSA/BE/BB/RP,
aber mit einem dritten Hit-Format: JSON-Objekte in HTML-Kommentaren
(statt Perl-Dumps oder HTML-Card-Elements). Reverse-engineered aus
HAR-Capture www.nilas.niedersachsen.de.har.
Neuer dritter Parsing-Pfad in PortalaAdapter._parse_hit_list_html:
Auto-Detection via "<!-- {" + "WEV" im HTML → _parse_hit_list_json_comments.
Feld-Mapping (NI JSON-in-Comment):
- WEV01[0].main → Titel
- WEV03[0].main → Typ
- WEV05[0].main → Metadata (Urheber + DD.MM.YYYY + "Drucksache XX/YYYY")
- WEV05[0].1 oder WEV08[0].1 → PDF-URL
ADAPTERS-Eintrag:
- bundesland="NI", db_id="lns.lissh", wahlperiode=19,
portala_path="/portala", document_type="Antrag"
Tests: 201 passed.
Refs: #22, #34 (UI-Aktivierung folgt separat)
This commit is contained in:
parent
4565a5cf0c
commit
edcb4e9c76
@ -445,6 +445,91 @@ class PortalaAdapter(ParlamentAdapter):
|
||||
"""Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
|
||||
return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)
|
||||
|
||||
# --- NI (NILAS) JSON-in-comment parsing helpers (#22) ---

# Matches one JSON record embedded in an HTML comment:
# ``<!-- { ... } -->``. DOTALL lets a record span multiple lines;
# the non-greedy ``.*?`` keeps adjacent comments from being merged.
_RE_JSON_COMMENT = re.compile(r'<!-- (\{.*?\})\s*-->', re.DOTALL)

# Extracts the Drucksachennummer ("XX/YYYY") from the metadata line.
_RE_DRUCKSACHE_IN_META = re.compile(r'Drucksache\s+(\d+/\d+)')

# Extracts a German-format date (DD.MM.YYYY) from the metadata line.
_RE_DATUM_IN_META = re.compile(r'(\d{2}\.\d{2}\.\d{4})')
|
||||
def _parse_hit_list_json_comments(self, html: str, query_filter: str) -> list[Drucksache]:
    """Parse NI-style JSON-in-HTML-Comment records (#22).

    Niedersachsen's NILAS uses efxRecordRepeater cards like Berlin,
    but embeds structured data as JSON objects in HTML comments
    (``<!-- { "WEV01": [...], ... } -->``) instead of Perl dumps.

    Field mapping (from HAR-Analyse 2026-04-10):

    - WEV01[0].main -> Titel
    - WEV03[0].main -> Typ (z.B. "Kleine Anfrage zur schriftlichen Beantwortung")
    - WEV05[0].main -> metadata line (Typ + Urheber + Datum + "Drucksache XX/YYYY")
    - WEV05[0].1   -> PDF URL
    - WEV08[0].1   -> PDF URL (fallback)

    Args:
        html: Raw hit-list HTML from the portala endpoint.
        query_filter: Whitespace-separated search terms; all of them must
            occur (case-insensitive) in title+metadata for a hit to be kept.
            Empty string disables the filter.

    Returns:
        ``Drucksache`` records whose type contains "antrag" (client-side
        filter, consistent with the other adapters). Records without a
        recognizable Drucksachennummer are dropped.
    """
    results: list[Drucksache] = []

    for m in self._RE_JSON_COMMENT.finditer(html):
        try:
            data = json.loads(m.group(1))
        except (json.JSONDecodeError, ValueError):
            # Not every <!-- { ... } --> comment is a record; skip noise.
            continue

        # Titel: ``or [{}]`` also covers an explicit empty list.
        wev01 = data.get("WEV01") or [{}]
        title = wev01[0].get("main", "")

        # Typ
        wev03 = data.get("WEV03") or [{}]
        typ = wev03[0].get("main", "")

        # Metadata line (Urheber, Datum, Drucksache-Nr)
        wev05 = data.get("WEV05") or [{}]
        meta = wev05[0].get("main", "")

        # PDF URL: prefer WEV05[0]["1"], fall back to WEV08[0]["1"].
        # BUGFIX: the previous code tested ``data.get("WEV08", [{}])`` —
        # truthy even when the key is absent — and then indexed
        # ``data["WEV08"]``, raising KeyError for records without WEV08.
        pdf_url = wev05[0].get("1") or ""
        if not pdf_url:
            wev08 = data.get("WEV08") or [{}]
            pdf_url = wev08[0].get("1") or ""
        # NILAS sometimes emits plain-http links; normalize to https
        # uniformly, regardless of which field supplied the URL.
        if pdf_url.startswith("http://"):
            pdf_url = "https://" + pdf_url[len("http://"):]

        # Drucksachennummer is mandatory — skip records without one.
        m_ds = self._RE_DRUCKSACHE_IN_META.search(meta)
        if not m_ds:
            continue
        drucksache = m_ds.group(1)

        # Datum (DD.MM.YYYY) -> ISO; empty string when absent.
        m_dat = self._RE_DATUM_IN_META.search(meta)
        datum_iso = self._datum_de_to_iso(m_dat.group(1)) if m_dat else ""

        # Fraktionen extracted from the metadata line (centralized in #55).
        fraktionen = self._normalize_fraktion(meta)

        doc = Drucksache(
            drucksache=drucksache,
            title=title or f"Drucksache {drucksache}",
            fraktionen=fraktionen,
            datum=datum_iso,
            link=pdf_url,
            bundesland=self.bundesland,
            typ=typ or "Antrag",
        )

        # Client-side Antrag filter (same convention as all adapters).
        if "antrag" not in (doc.typ or "").lower():
            continue

        # Client-side query filter: every term must match title+metadata.
        if query_filter:
            hay = f"{title} {meta}".lower()
            if not all(t in hay for t in query_filter.lower().split()):
                continue

        results.append(doc)

    return results
|
||||
|
||||
def _normalize_fraktion(self, urheber: str) -> list[str]:
|
||||
"""Thin shim — die ganze Regex-Logik lebt jetzt zentral in
|
||||
``app.parteien.extract_fraktionen`` (siehe #55). ``self.bundesland``
|
||||
@ -597,6 +682,11 @@ class PortalaAdapter(ParlamentAdapter):
|
||||
"""
|
||||
if self._RE_PRE_BLOCK.search(html):
|
||||
return self._parse_hit_list_dump(html, query_filter)
|
||||
# NI-style: JSON-in-HTML-Comments statt Perl-Dumps (#22).
|
||||
# Auto-detect: NI's efxRecordRepeater-Cards enthalten JSON-
|
||||
# Objekte in ``<!-- { ... } -->`` Kommentaren statt Perl-Dumps.
|
||||
if "<!-- {" in html and '"WEV' in html:
|
||||
return self._parse_hit_list_json_comments(html, query_filter)
|
||||
return self._parse_hit_list_cards(html, query_filter)
|
||||
|
||||
def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
|
||||
@ -3250,6 +3340,17 @@ ADAPTERS = {
|
||||
prefix="/parlis",
|
||||
document_typ="Antrag",
|
||||
),
|
||||
"NI": PortalaAdapter(
|
||||
bundesland="NI",
|
||||
name="Niedersächsischer Landtag (NILAS)",
|
||||
base_url="https://www.nilas.niedersachsen.de",
|
||||
db_id="lns.lissh",
|
||||
wahlperiode=19,
|
||||
portala_path="/portala",
|
||||
document_type="Antrag",
|
||||
# NI nutzt JSON-in-HTML-Comments statt Perl-Dumps (auto-detected
|
||||
# in _parse_hit_list_html via "<!-- {" + "WEV"). HAR 2026-04-10.
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user