antragstracker/scripts/geocode_pending.py
Dotty Dotter 69edf8f64c feat: Geocoding-Script verbessert + Karten-Clustering (#5, #6)
- scripts/geocode_pending.py: Nominatim mit Hagen-Fokus, Rate-Limiting 1/s
- 2.293 Orte geocodiert (vorher 608), 31k+ noch offen (läuft weiter)
- Karte: Marker-Clustering für bessere Performance
- 27k+ Orte-Einträge → Clustering nötig

Teilweise Closes #5, Closes #6
2026-04-02 15:42:25 +02:00

198 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""Geocode pending places via Nominatim (1 req/s, focused on Hagen)."""
import argparse
import re
import sqlite3
import sys
import time
from pathlib import Path
from typing import Optional, Tuple
import httpx
# SQLite DB with the `orte` (places) table; two levels above this script.
DB = Path(__file__).resolve().parent.parent / "data" / "tracker.db"
NOMINATIM = "https://nominatim.openstreetmap.org/search"
# Nominatim usage policy requires an identifying User-Agent with contact info.
UA = "Antragstracker-Hagen/1.0 (tobias.roedel@econgood.org)"
# Bounding box around Hagen in Nominatim viewbox order: lon_min,lat_min,lon_max,lat_max.
HAGEN_BBOX = "7.35,51.30,7.65,51.45"
# Place names too generic for meaningful geocoding
# (matched via re.search against the lowercased name in should_skip()).
SKIP_PATTERNS = [
    r"^hagen$", r"^hagen,?\s*(nordrhein-westfalen|nrw)$",
    r"^stadtgebiet", r"^gesamtes?\s+stadtgebiet",
    r"^(bab|a)\s*\d", r"^bundesstraße\s+\d", r"^b\s*\d+$",
    r"^(alle|diverse|verschiedene)\s+", r"^(stadt|kreis)\s+hagen$",
]
def should_skip(name: str) -> bool:
    """Return True for place names that cannot be geocoded meaningfully."""
    candidate = name.strip().lower()
    # Very short names are too ambiguous to look up.
    if len(candidate) < 3:
        return True
    # Generic names (whole city, highways, "diverse ...") are not mappable points.
    return any(re.search(pattern, candidate) for pattern in SKIP_PATTERNS)
def normalize_query(name: str) -> str:
    """Prepare a place name as a Nominatim query string."""
    query = name.strip()
    # Drop a trailing ", Hagen" so the caller can append the city without duplication.
    query = re.sub(r',?\s*Hagen\s*$', '', query, flags=re.IGNORECASE)
    query = query.strip().rstrip(',').strip()
    # District names like "Hagen-Eilpe" become "Eilpe, Hagen".
    if query.lower().startswith('hagen-'):
        query = f"{query[6:].strip()}, Hagen"
    return query
def geocode(client: httpx.Client, name: str) -> Optional[Tuple[float, float]]:
    """Try to geocode a place in Hagen via Nominatim.

    Tries progressively looser queries (strict bounding box first, then
    city + state, then the raw name). Returns (lat, lon) on success, None
    if no query yields a plausible hit.

    Fix: the original returned on success *before* its time.sleep(1.1),
    so consecutive successful lookups from the caller's loop fired with no
    delay between requests, violating Nominatim's 1 req/s policy. The sleep
    now runs after every request, success or not.
    """
    clean = normalize_query(name)
    queries = [
        # Strictly inside the Hagen bounding box
        (f"{clean}, Hagen", {"viewbox": HAGEN_BBOX, "bounded": 1}),
        # Somewhat looser
        (f"{clean}, Hagen, NRW, Germany", {}),
        # Original name as fallback
        (f"{name}, Germany", {}),
    ]
    for q, extra_params in queries:
        params = {"q": q, "format": "json", "limit": 1, "addressdetails": 1}
        params.update(extra_params)
        hit: Optional[Tuple[float, float]] = None
        try:
            r = client.get(
                NOMINATIM, params=params,
                headers={"User-Agent": UA}, timeout=10
            )
            if r.status_code == 200:
                results = r.json()  # parse the body once (original parsed twice)
                if results:
                    lat, lon = float(results[0]["lat"]), float(results[0]["lon"])
                    # Sanity check: must lie roughly in the Hagen region
                    if 51.25 <= lat <= 51.50 and 7.30 <= lon <= 7.70:
                        hit = (lat, lon)
        except Exception:
            # Best-effort: network/JSON/parse errors fall through to the next query.
            pass
        # Nominatim policy: max 1 req/s — sleep after EVERY request.
        time.sleep(1.1)
        if hit is not None:
            return hit
    return None
def main():
    """Geocode pending places in tracker.db, most-referenced first."""
    parser = argparse.ArgumentParser(description="Geocode pending Orte in tracker.db")
    parser.add_argument("--limit", type=int, default=500,
                        help="Max Orte pro Durchlauf (Default: 500)")
    parser.add_argument("--retry-failed", action="store_true",
                        help="Auch fehlgeschlagene Orte erneut versuchen")
    parser.add_argument("--dry-run", action="store_true",
                        help="Nur anzeigen, nichts schreiben")
    args = parser.parse_args()
    conn = sqlite3.connect(str(DB))
    conn.row_factory = sqlite3.Row
    # Status filter: normally only 'pending'; --retry-failed also revisits 'failed'.
    status_filter = "geocode_status='pending'"
    if args.retry_failed:
        status_filter = "geocode_status IN ('pending','failed')"
    # First pass: mark generic, un-geocodable names as skipped.
    generics = conn.execute(
        f"SELECT id, name FROM orte WHERE {status_filter}"
    ).fetchall()
    skipped = 0
    for row in generics:
        if should_skip(row["name"]):
            if not args.dry_run:
                conn.execute(
                    "UPDATE orte SET geocode_status='skipped' WHERE id=?",
                    (row["id"],)
                )
            skipped += 1
    if skipped:
        conn.commit()
        print(f"⏭️ {skipped} generische Orte übersprungen")
    # Second pass: fetch the geocodable places, most-referenced first.
    pending = conn.execute(
        f"SELECT id, name FROM orte WHERE {status_filter} "
        "ORDER BY vorlage_count DESC, id LIMIT ?",
        (args.limit,)
    ).fetchall()
    total_pending = conn.execute(
        f"SELECT COUNT(*) FROM orte WHERE {status_filter}"
    ).fetchone()[0]
    print(f"📍 Geocoding: {len(pending)} von {total_pending} pending Orten (Limit: {args.limit})")
    if args.dry_run:
        for row in pending[:20]:
            print(f"{row['name']}")
        conn.close()  # fix: connection leaked on the dry-run early return
        return
    success = 0
    failed = 0
    client = httpx.Client()
    start = time.time()
    try:
        for i, row in enumerate(pending):
            coords = geocode(client, row["name"])
            if coords:
                conn.execute(
                    "UPDATE orte SET lat=?, lon=?, geocode_status='success' WHERE id=?",
                    (coords[0], coords[1], row["id"])
                )
                success += 1
                sym = "✓"  # fix: progress symbol was an empty string (lost character)
            else:
                conn.execute(
                    "UPDATE orte SET geocode_status='failed' WHERE id=?",
                    (row["id"],)
                )
                failed += 1
                sym = "✗"  # fix: progress symbol was an empty string (lost character)
            elapsed = time.time() - start
            rate = (i + 1) / elapsed if elapsed > 0 else 0
            print(
                f" [{i+1:4d}/{len(pending)}] {sym} {row['name'][:50]:<50s} "
                f"(✓{success} ✗{failed} | {rate:.1f}/s)",
                end="\r"
            )
            # Commit periodically so progress survives an interrupt.
            if (i + 1) % 25 == 0:
                conn.commit()
    except KeyboardInterrupt:
        print("\n⚠️ Abgebrochen!")
    finally:
        # Persist whatever was done — even on Ctrl-C — then release resources.
        conn.commit()
        conn.close()
        client.close()
    elapsed = time.time() - start
    print(f"\n\n{'='*60}")
    print(f"✅ Fertig in {elapsed:.0f}s")
    print(f"{success} geocodiert")
    print(f"{failed} fehlgeschlagen")
    print(f" ⏭️ {skipped} übersprungen")
    # Overall status across all places (fresh connection: conn was closed above).
    conn2 = sqlite3.connect(str(DB))
    stats = conn2.execute(
        "SELECT geocode_status, COUNT(*) FROM orte GROUP BY geocode_status"
    ).fetchall()
    conn2.close()
    print("\n📊 Gesamt:")
    for status, count in stats:
        print(f" {status}: {count}")
if __name__ == "__main__":
    main()