Activate Schleswig-Holstein via StarFinderCGIAdapter (#20, Phase 2)
SH läuft auf der ältesten der vier Backend-Familien: Starfinder-CGI auf lissh.lvn.parlanet.de. URL-basiert (nicht stateful wie das moderne StarWeb-Servlet von BB/HE/NI/RP/HB), Latin-1-encoding, flat HTML-Tabelle als Hit-Format. Eigener Adapter, weil das Schema fundamental anders ist als alles andere. Endpoint: http://lissh.lvn.parlanet.de/cgi-bin/starfinder/0?path=lisshfl.txt&id=FASTLINK&pass=&search=WP=20+AND+dtyp=antrag&format=WEBKURZFL Hit-Format pro <tr class="tabcol*">: <b>{TITLE}</b><br> Antrag {URHEBER} {DD.MM.YYYY} Drucksache <a href="{PDF}">{N/M}</a> Quelle: dokukratie/sh.yml + Live-Probing. Encoding: Server liefert iso-8859-1 ohne korrekten Content-Type-Header. Adapter dekodiert resp.content explizit als latin-1. SSW-Detection im _normalize_fraktion: SH ist das einzige BL mit SSW-Fraktion (von der 5%-Hürde befreit), pattern ist \bSSW\b analog zu \bAfD\b. Free-Text-Suche client-seitig (siehe #18) — server-side query-syntax mit (term) im starfinder-search-Param wird vom Server nicht als Volltext interpretiert, einheitlich mit allen anderen aktiven Adaptern. Smoke-Test (lokal): SH q="": 8 hits in 14.4s SH q="Schule": 8 hits in 14.8s (Schulentwicklung Westküste, Hochschulen, queere Vielfalt an Schule etc.) SH q="Klima": 8 hits (klimafreundlich, Klimafolgen, Strategischer Aktionsplan) SH q="Bildung": 8 hits (berufliche Bildung, Holocaust-Wissen) bundeslaender.py::SH.aktiv = True. doku_base_url auf lissh.lvn.parlanet.de korrigiert (ehemaliger landtag.ltsh.de-Eintrag passte nicht zum echten Endpoint). Damit ist Phase 2 (1/6) angefangen — als Nebenpfad, weil das StarWeb-Servlet (#27 BB als Template für 5 weitere) ohne HAR-Trace nicht sauber reverse-engineerbar war. Phase 2 (1/6) aus Roadmap-Issue #49. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
dc0bb07c12
commit
f82c60e40d
@ -364,10 +364,19 @@ BUNDESLAENDER: dict[str, Bundesland] = {
|
|||||||
regierungsfraktionen=["CDU", "GRÜNE"],
|
regierungsfraktionen=["CDU", "GRÜNE"],
|
||||||
landtagsfraktionen=["CDU", "GRÜNE", "SPD", "FDP", "SSW"],
|
landtagsfraktionen=["CDU", "GRÜNE", "SPD", "FDP", "SSW"],
|
||||||
doku_system="StarWeb",
|
doku_system="StarWeb",
|
||||||
doku_base_url="https://www.landtag.ltsh.de",
|
doku_base_url="http://lissh.lvn.parlanet.de",
|
||||||
drucksache_format="20/1234",
|
drucksache_format="20/1234",
|
||||||
dokukratie_scraper="sh",
|
dokukratie_scraper="sh",
|
||||||
anmerkung="SSW ist von der 5%-Hürde befreit.",
|
aktiv=True,
|
||||||
|
anmerkung=(
|
||||||
|
"SSW ist von der 5%-Hürde befreit. Doku-System ist die "
|
||||||
|
"alte Starfinder-CGI auf lissh.lvn.parlanet.de — URL-"
|
||||||
|
"basiert via "
|
||||||
|
"/cgi-bin/starfinder/0?path=lisshfl.txt&search=WP=20+AND+dtyp=antrag, "
|
||||||
|
"Latin-1-encoding. NICHT die moderne StarWeb-Servlet-"
|
||||||
|
"Variante (BB/HE/NI/RP/HB) — eigene Klasse "
|
||||||
|
"StarFinderCGIAdapter."
|
||||||
|
),
|
||||||
),
|
),
|
||||||
"TH": Bundesland(
|
"TH": Bundesland(
|
||||||
code="TH",
|
code="TH",
|
||||||
|
|||||||
@ -1269,6 +1269,215 @@ class ParLDokAdapter(ParlamentAdapter):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class StarFinderCGIAdapter(ParlamentAdapter):
    """Adapter for legacy CGI-based Starfinder instances.

    Currently used by Schleswig-Holstein on
    ``lissh.lvn.parlanet.de/cgi-bin/starfinder/0``. Unlike the StarWeb
    servlet (BB/HE/NI/RP/HB), which needs a stateful AdvancedSearch form
    submit, Starfinder takes the entire query as URL parameters and
    answers with plain HTML containing a flat ``<tr>`` table of records.

    Format details (source: ``dokukratie/sh.yml`` plus probing of the live
    endpoint):

    - URL template: ``{base}/cgi-bin/starfinder/0?path={db_path}&id=FASTLINK
      &pass=&search={starfinder_query}&format=WEBKURZFL``
    - Query syntax: ``WP=20+AND+dtyp=antrag``. The ``dtyp`` codes are
      lowercase short labels (``antrag``, ``kleine``).
    - Encoding: ``iso-8859-1`` (Latin-1), not UTF-8. The response does not
      always declare it via Content-Type, so the body is decoded
      explicitly as Latin-1 to avoid mojibake on German umlauts.
    - Hit format: each record is one ``<tr class="tabcol|tabcol2|tabcol3">``
      with the title in ``<b>``, followed by ``Antrag <originator>
      <DD.MM.YYYY> Drucksache <a href="...pdf">XX/YYYY</a>``.
    """

    _RE_RECORD = re.compile(
        r'<tr class="tabcol[23]?">.*?</tr>',
        re.DOTALL,
    )
    _RE_TITLE = re.compile(r"<b>(.*?)</b>", re.DOTALL)
    _RE_DRUCKSACHE_LINK = re.compile(
        r'<a href="([^"]+\.pdf)"[^>]*>(\d+/\d+)</a>'
    )
    # The text between <b>title</b> and the <a> link looks like:
    #   "Antrag Christian Dirschauer (SSW) 07.04.2026 Drucksache "
    # Group 1 is the originator text, group 2 the German-format date.
    _RE_URHEBER_DATUM = re.compile(
        r"</b>\s*<br>\s*[A-Za-zÄÖÜäöüß]+\s+(.+?)\s+(\d{1,2}\.\d{1,2}\.\d{4})\s+Drucksache",
        re.DOTALL,
    )
    _RE_WHITESPACE = re.compile(r"\s+")

    # (label, pattern) pairs applied to the upper-cased originator text.
    # The tuple order defines the order of labels in the result list.
    _FRAKTION_PATTERNS = (
        ("GRÜNE", re.compile(r"\bBÜNDNIS\s*90\b|\bGR(?:Ü|UE)NE\b")),
        ("CDU", re.compile(r"\bCDU\b")),
        ("SPD", re.compile(r"\bSPD\b")),
        ("FDP", re.compile(r"\bF\.?\s*D\.?\s*P\.?\b")),
        ("AfD", re.compile(r"\bAFD\b")),
        ("LINKE", re.compile(r"\bLINKE\b")),
        ("SSW", re.compile(r"\bSSW\b")),
        (
            "Landesregierung",
            re.compile(r"LANDESREGIERUNG|\bMINISTER|STAATSKANZLEI|MINISTERPRÄSIDENT"),
        ),
    )

    # Shared HTTP client settings for the listing and the PDF download.
    _HTTP_TIMEOUT = 60
    _HTTP_HEADERS = {"User-Agent": "Mozilla/5.0 GWOE-Antragspruefer"}

    def __init__(
        self,
        *,
        bundesland: str,
        name: str,
        base_url: str,
        wahlperiode: int,
        db_path: str = "lisshfl.txt",
        document_typ_code: str = "antrag",
    ) -> None:
        """Store the connection parameters for one Starfinder instance.

        Args:
            bundesland: State code attached to every parsed record.
            name: Human-readable name of the parliament.
            base_url: Scheme and host of the server; a trailing slash is
                stripped.
            wahlperiode: Legislative period used in the ``WP=`` filter.
            db_path: Value of the ``path`` URL parameter.
            document_typ_code: Value of the ``dtyp=`` filter.
        """
        self.bundesland = bundesland
        self.name = name
        self.base_url = base_url.rstrip("/")
        self.wahlperiode = wahlperiode
        self.db_path = db_path
        self.document_typ_code = document_typ_code

    @staticmethod
    def _datum_de_to_iso(datum_de: str) -> str:
        """Convert ``D.M.YYYY`` to ``YYYY-MM-DD``.

        Returns an empty string for empty input or when the value does not
        consist of exactly three dot-separated parts.
        """
        if not datum_de:
            return ""
        try:
            day, month, year = datum_de.split(".")
        except ValueError:
            return ""
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    @staticmethod
    def _normalize_fraktion(text: str) -> list[str]:
        """Map an originator string to canonical parliamentary group labels.

        SH format: 'Christian Dirschauer (SSW), Jette Waldinger-Thiering (SSW)'.
        Matching is case-insensitive (the text is upper-cased first) and
        each label appears at most once.
        """
        if not text:
            return []
        upper = text.upper()
        return [
            label
            for label, pattern in StarFinderCGIAdapter._FRAKTION_PATTERNS
            if pattern.search(upper)
        ]

    def _build_url(self) -> str:
        """Build the Starfinder URL for the structural WP + dtyp browse.

        The free-text query is deliberately not part of the URL; filtering
        happens client-side on the parsed records (see #18).
        """
        search_param = f"WP={self.wahlperiode}+AND+dtyp={self.document_typ_code}"
        return (
            f"{self.base_url}/cgi-bin/starfinder/0"
            f"?path={self.db_path}&id=FASTLINK&pass=&search={search_param}"
            f"&format=WEBKURZFL"
        )

    def _parse_records(self, html: str) -> list[Drucksache]:
        """Extract one ``Drucksache`` per result row of a listing page.

        Rows without a ``<a href="...pdf">N/M</a>`` link are skipped. HTML
        entities in title, originator and href are decoded, and relative
        PDF links are resolved against the listing URL.
        """
        # Local imports: the module-level import block is not visible from
        # this class. ``from html import ...`` does not touch the ``html``
        # parameter of this method.
        from html import unescape
        from urllib.parse import urljoin

        page_url = self._build_url()
        collapse = self._RE_WHITESPACE.sub
        results: list[Drucksache] = []

        for record_html in self._RE_RECORD.findall(html):
            m_link = self._RE_DRUCKSACHE_LINK.search(record_html)
            if not m_link:
                continue
            # urljoin returns absolute hrefs unchanged.
            pdf_url = urljoin(page_url, unescape(m_link.group(1)))
            drucksache = m_link.group(2)

            title = ""
            m_title = self._RE_TITLE.search(record_html)
            if m_title:
                title = collapse(" ", unescape(m_title.group(1))).strip()
            if not title:
                title = f"Drucksache {drucksache}"

            urheber = ""
            datum_iso = ""
            m_meta = self._RE_URHEBER_DATUM.search(record_html)
            if m_meta:
                urheber = collapse(" ", unescape(m_meta.group(1))).strip()
                datum_iso = self._datum_de_to_iso(m_meta.group(2))

            results.append(Drucksache(
                drucksache=drucksache,
                title=title,
                fraktionen=self._normalize_fraktion(urheber),
                datum=datum_iso,
                link=pdf_url,
                bundesland=self.bundesland,
                typ="Antrag",
            ))
        return results

    async def _fetch_records(self) -> list[Drucksache]:
        """Fetch the listing page and return every parsed record.

        Returns an empty list on a non-200 response or on any error; both
        cases are logged.
        """
        url = self._build_url()
        try:
            async with httpx.AsyncClient(
                timeout=self._HTTP_TIMEOUT,
                follow_redirects=True,
                headers=self._HTTP_HEADERS,
            ) as client:
                resp = await client.get(url)
            if resp.status_code != 200:
                logger.error("%s search HTTP %s", self.bundesland, resp.status_code)
                return []
            # Force Latin-1 because the Starfinder server doesn't always
            # advertise the encoding correctly.
            page = resp.content.decode("latin-1", errors="replace")
            return self._parse_records(page)
        except Exception:
            # Adapter boundary: a failing backend must not break the caller.
            logger.exception("%s search error", self.bundesland)
            return []

    async def search(self, query: str, limit: int = 20) -> list[Drucksache]:
        """Return up to ``limit`` records matching every term of ``query``.

        The query is split on whitespace. A record matches when each term
        occurs, case-insensitively, in its title or in its normalized
        group labels. An empty query matches everything.
        """
        results = await self._fetch_records()

        # Client-side title + group-label filter (see #18).
        if query:
            terms = [term.lower() for term in query.split()]
            filtered: list[Drucksache] = []
            for doc in results:
                haystack = f"{doc.title} {' '.join(doc.fraktionen)}".lower()
                if all(term in haystack for term in terms):
                    filtered.append(doc)
            results = filtered
        return results[:limit]

    async def get_document(self, drucksache: str) -> Optional[Drucksache]:
        """Look up a single Drucksache by its ``N/M`` number.

        No number-only server filter is known for Starfinder, so the full
        listing is fetched and every parsed record is scanned. The scan is
        not capped, so older documents are found as well.
        """
        wanted = drucksache.strip()
        for doc in await self._fetch_records():
            if doc.drucksache == wanted:
                return doc
        return None

    async def download_text(self, drucksache: str) -> Optional[str]:
        """Download the PDF of ``drucksache`` and return its plain text.

        Returns ``None`` when the document is unknown, has no link, the
        download does not answer with HTTP 200, or fetching/parsing fails.
        """
        import fitz  # PyMuPDF

        doc = await self.get_document(drucksache)
        if not doc or not doc.link:
            return None
        try:
            async with httpx.AsyncClient(
                timeout=self._HTTP_TIMEOUT,
                follow_redirects=True,
                headers=self._HTTP_HEADERS,
            ) as client:
                resp = await client.get(doc.link)
            if resp.status_code != 200:
                logger.error(
                    "%s PDF download HTTP %s for %s",
                    self.bundesland, resp.status_code, drucksache,
                )
                return None
            pdf = fitz.open(stream=resp.content, filetype="pdf")
            try:
                return "".join(page.get_text() for page in pdf)
            finally:
                # Release the document even when text extraction fails.
                pdf.close()
        except Exception:
            logger.exception("%s PDF download error for %s", self.bundesland, drucksache)
            return None
|
||||||
|
|
||||||
|
|
||||||
class BayernAdapter(ParlamentAdapter):
|
class BayernAdapter(ParlamentAdapter):
|
||||||
"""Adapter for Bayerischer Landtag."""
|
"""Adapter for Bayerischer Landtag."""
|
||||||
|
|
||||||
@ -1754,6 +1963,14 @@ ADAPTERS = {
|
|||||||
document_typ_substring=True,
|
document_typ_substring=True,
|
||||||
kinds=["Drucksache", "Vorlage"],
|
kinds=["Drucksache", "Vorlage"],
|
||||||
),
|
),
|
||||||
|
"SH": StarFinderCGIAdapter(
|
||||||
|
bundesland="SH",
|
||||||
|
name="Schleswig-Holsteinischer Landtag (LIS-SH)",
|
||||||
|
base_url="http://lissh.lvn.parlanet.de",
|
||||||
|
wahlperiode=20,
|
||||||
|
db_path="lisshfl.txt",
|
||||||
|
document_typ_code="antrag",
|
||||||
|
),
|
||||||
"BY": BayernAdapter(),
|
"BY": BayernAdapter(),
|
||||||
"BW": PARLISAdapter(
|
"BW": PARLISAdapter(
|
||||||
bundesland="BW",
|
bundesland="BW",
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user