#22 NI-Adapter: PortalaAdapter mit JSON-in-Comment-Parsing
Niedersachsen (NILAS) nutzt denselben portala/eUI-Stack wie LSA/BE/BB/RP,
aber mit einem dritten Hit-Format: JSON-Objekte in HTML-Kommentaren
(statt Perl-Dumps oder HTML-Card-Elements). Reverse-engineered aus
HAR-Capture www.nilas.niedersachsen.de.har.
Neuer dritter Parsing-Pfad in PortalaAdapter._parse_hit_list_html:
Auto-Detection via "<!-- {" + "WEV" im HTML → _parse_hit_list_json_comments.
Feld-Mapping (NI JSON-in-Comment):
- WEV01[0].main → Titel
- WEV03[0].main → Typ
- WEV05[0].main → Metadata (Urheber + DD.MM.YYYY + "Drucksache XX/YYYY")
- WEV05[0].1 oder WEV08[0].1 → PDF-URL
ADAPTERS-Eintrag:
- bundesland="NI", db_id="lns.lissh", wahlperiode=19,
portala_path="/portala", document_type="Antrag"
Tests: 201 passed.
Refs: #22, #34 (UI-Aktivierung folgt separat)
This commit is contained in:
parent
4565a5cf0c
commit
edcb4e9c76
@ -445,6 +445,91 @@ class PortalaAdapter(ParlamentAdapter):
|
|||||||
"""Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
|
"""Decode \\x{abcd} escape sequences from Perl Data::Dumper output."""
|
||||||
return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)
|
return re.sub(r'\\x\{([0-9a-f]+)\}', lambda m: chr(int(m.group(1), 16)), s)
|
||||||
|
|
||||||
|
# Matches one HTML comment of the form ``<!-- {...} -->`` and captures the
# brace-delimited payload (group 1). The payload match is non-greedy and
# DOTALL lets it span multiple lines.
_RE_JSON_COMMENT = re.compile(r'<!-- (\{.*?\})\s*-->', re.DOTALL)
|
||||||
|
# Captures the "<digits>/<digits>" number that follows the word "Drucksache"
# in a metadata line (group 1).
_RE_DRUCKSACHE_IN_META = re.compile(r'Drucksache\s+(\d+/\d+)')
|
||||||
|
# Captures the first DD.MM.YYYY-shaped date in a metadata line (group 1).
_RE_DATUM_IN_META = re.compile(r'(\d{2}\.\d{2}\.\d{4})')
|
||||||
|
|
||||||
|
def _parse_hit_list_json_comments(self, html: str, query_filter: str) -> list[Drucksache]:
    """Parse NI-style JSON-in-HTML-comment hit records (#22).

    Niedersachsen's NILAS uses efxRecordRepeater cards like Berlin, but
    embeds the structured data as JSON objects in HTML comments
    (``<!-- { "WEV01": [...], ... } -->``) instead of Perl dumps.

    Field mapping (from the HAR analysis of 2026-04-10):
        - ``WEV01[0].main`` -> title
        - ``WEV03[0].main`` -> type (e.g. "Kleine Anfrage zur
          schriftlichen Beantwortung")
        - ``WEV05[0].main`` -> metadata line (type + originator + date +
          "Drucksache XX/YYYY")
        - ``WEV05[0].1``    -> PDF URL (preferred)
        - ``WEV08[0].1``    -> PDF URL (fallback)

    Args:
        html: Raw hit-list HTML containing the JSON comments.
        query_filter: Whitespace-separated search terms. A record is kept
            only if every term occurs (case-insensitively) in its title
            or metadata line. An empty string disables this filter.

    Returns:
        One ``Drucksache`` per comment that parses as a JSON object, has a
        "Drucksache <nr>" reference in its metadata line, whose type
        contains "antrag" (an empty type defaults to "Antrag"), and that
        passes ``query_filter``. Everything else is skipped silently.
    """

    def first_entry(record: dict, key: str) -> dict:
        """Return ``record[key][0]`` if it is a dict, otherwise ``{}``."""
        entries = record.get(key)
        if isinstance(entries, list) and entries and isinstance(entries[0], dict):
            return entries[0]
        return {}

    def text(entry: dict, key: str) -> str:
        """Return ``entry[key]`` if it is a string, otherwise ``""``."""
        value = entry.get(key)
        return value if isinstance(value, str) else ""

    # Loop-invariant: split the search terms once instead of per record.
    terms = query_filter.lower().split() if query_filter else []
    results: list[Drucksache] = []

    for m in self._RE_JSON_COMMENT.finditer(html):
        try:
            data = json.loads(m.group(1))
        except (json.JSONDecodeError, ValueError):
            # Not every brace-delimited comment is valid JSON; skip those.
            continue
        if not isinstance(data, dict):
            continue

        wev05 = first_entry(data, "WEV05")
        title = text(first_entry(data, "WEV01"), "main")
        typ = text(first_entry(data, "WEV03"), "main")
        # Metadata line (originator, date, Drucksache number).
        meta = text(wev05, "main")

        # PDF URL: prefer WEV05.1, fall back to WEV08.1. A missing WEV08
        # key simply leaves the link empty (the previous code raised
        # KeyError here because the ``[{}]`` default of ``.get`` is truthy).
        pdf_url = text(wev05, "1") or text(first_entry(data, "WEV08"), "1")
        if pdf_url.startswith("http://"):
            pdf_url = "https://" + pdf_url[len("http://"):]

        # The Drucksache number is mandatory; records without it are dropped.
        m_ds = self._RE_DRUCKSACHE_IN_META.search(meta)
        if not m_ds:
            continue
        drucksache = m_ds.group(1)

        # Date is optional; converted by the adapter's own helper.
        m_dat = self._RE_DATUM_IN_META.search(meta)
        datum_iso = self._datum_de_to_iso(m_dat.group(1)) if m_dat else ""

        # Parliamentary groups are extracted from the metadata line.
        fraktionen = self._normalize_fraktion(meta)

        doc = Drucksache(
            drucksache=drucksache,
            title=title or f"Drucksache {drucksache}",
            fraktionen=fraktionen,
            datum=datum_iso,
            link=pdf_url,
            bundesland=self.bundesland,
            typ=typ or "Antrag",
        )

        # Client-side "Antrag" filter (same as in the other adapters).
        if "antrag" not in (doc.typ or "").lower():
            continue

        if terms:
            hay = f"{title} {meta}".lower()
            if not all(t in hay for t in terms):
                continue

        results.append(doc)

    return results
|
||||||
|
|
||||||
def _normalize_fraktion(self, urheber: str) -> list[str]:
|
def _normalize_fraktion(self, urheber: str) -> list[str]:
|
||||||
"""Thin shim — die ganze Regex-Logik lebt jetzt zentral in
|
"""Thin shim — die ganze Regex-Logik lebt jetzt zentral in
|
||||||
``app.parteien.extract_fraktionen`` (siehe #55). ``self.bundesland``
|
``app.parteien.extract_fraktionen`` (siehe #55). ``self.bundesland``
|
||||||
@ -597,6 +682,11 @@ class PortalaAdapter(ParlamentAdapter):
|
|||||||
"""
|
"""
|
||||||
if self._RE_PRE_BLOCK.search(html):
|
if self._RE_PRE_BLOCK.search(html):
|
||||||
return self._parse_hit_list_dump(html, query_filter)
|
return self._parse_hit_list_dump(html, query_filter)
|
||||||
|
# NI-style: JSON-in-HTML-Comments statt Perl-Dumps (#22).
|
||||||
|
# Auto-detect: NI's efxRecordRepeater-Cards enthalten JSON-
|
||||||
|
# Objekte in ``<!-- { ... } -->`` Kommentaren statt Perl-Dumps.
|
||||||
|
if "<!-- {" in html and '"WEV' in html:
|
||||||
|
return self._parse_hit_list_json_comments(html, query_filter)
|
||||||
return self._parse_hit_list_cards(html, query_filter)
|
return self._parse_hit_list_cards(html, query_filter)
|
||||||
|
|
||||||
def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
|
def _parse_hit_list_dump(self, html: str, query_filter: str) -> list[Drucksache]:
|
||||||
@ -3250,6 +3340,17 @@ ADAPTERS = {
|
|||||||
prefix="/parlis",
|
prefix="/parlis",
|
||||||
document_typ="Antrag",
|
document_typ="Antrag",
|
||||||
),
|
),
|
||||||
|
"NI": PortalaAdapter(
|
||||||
|
bundesland="NI",
|
||||||
|
name="Niedersächsischer Landtag (NILAS)",
|
||||||
|
base_url="https://www.nilas.niedersachsen.de",
|
||||||
|
db_id="lns.lissh",
|
||||||
|
wahlperiode=19,
|
||||||
|
portala_path="/portala",
|
||||||
|
document_type="Antrag",
|
||||||
|
# NI nutzt JSON-in-HTML-Comments statt Perl-Dumps (auto-detected
|
||||||
|
# in _parse_hit_list_html via "<!-- {" + "WEV"). HAR 2026-04-10.
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user