#!/usr/bin/env python3
"""Geocode pending places ("Orte") via Nominatim (1 req/s, focused on Hagen)."""

import argparse
import re
import sqlite3
import sys
import time
from contextlib import closing
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Tuple

if TYPE_CHECKING:
    # Needed for annotations only. main() imports httpx at runtime so the pure
    # helpers below stay importable (e.g. from tests) without the dependency.
    import httpx

DB = Path(__file__).resolve().parent.parent / "data" / "tracker.db"
NOMINATIM = "https://nominatim.openstreetmap.org/search"
UA = "Antragstracker-Hagen/1.0 (tobias.roedel@econgood.org)"
# Nominatim viewbox: lon_min,lat_min,lon_max,lat_max
HAGEN_BBOX = "7.35,51.30,7.65,51.45"

# Slightly wider than HAGEN_BBOX: plausibility window for accepted results.
HAGEN_LAT_RANGE = (51.25, 51.50)
HAGEN_LON_RANGE = (7.30, 7.70)

# Pause after every request (Nominatim policy: at most 1 request per second).
REQUEST_DELAY = 1.1
REQUEST_TIMEOUT = 10

# Place names that are too generic to geocode meaningfully.
# Patterns are applied to the stripped, lower-cased name.
SKIP_PATTERNS = [
    r"^hagen$",
    r"^hagen,?\s*(nordrhein-westfalen|nrw)$",
    r"^stadtgebiet",
    r"^gesamtes?\s+stadtgebiet",
    r"^(bab|a)\s*\d",
    r"^bundesstraße\s+\d",
    r"^b\s*\d+$",
    r"^(alle|diverse|verschiedene)\s+",
    r"^(stadt|kreis)\s+hagen$",
]


def should_skip(name: Optional[str]) -> bool:
    """Return True for names that cannot be geocoded meaningfully.

    A name is skipped when it is missing, shorter than three characters
    after stripping, or matches one of SKIP_PATTERNS.
    """
    if not name:
        # NULL / empty column values carry nothing to look up.
        return True
    clean = name.strip().lower()
    if len(clean) < 3:
        return True
    return any(re.search(pattern, clean) for pattern in SKIP_PATTERNS)


def normalize_query(name: str) -> str:
    """Prepare a place name for use in a Nominatim query.

    A trailing "Hagen" (optionally preceded by a comma) is removed because
    the caller adds the city itself. A leading "Hagen-" district prefix is
    rewritten as a ", Hagen" suffix ("Hagen-Haspe" -> "Haspe, Hagen").
    """
    clean = name.strip()
    # \b keeps compound names such as "Vorhagen" intact; only a stand-alone
    # trailing "Hagen" is removed.
    clean = re.sub(r",?\s*\bHagen\s*$", "", clean, flags=re.IGNORECASE)
    clean = clean.strip().rstrip(",").strip()
    if clean.lower().startswith("hagen-"):
        clean = clean[len("hagen-"):].strip() + ", Hagen"
    return clean


def _request_coords(client, params) -> Optional[Tuple[float, float]]:
    """Run one Nominatim search and return (lat, lon) of a plausible first hit."""
    response = client.get(
        NOMINATIM,
        params=params,
        headers={"User-Agent": UA},
        timeout=REQUEST_TIMEOUT,
    )
    if response.status_code != 200:
        return None
    results = response.json()
    if not results:
        return None
    first = results[0]
    lat, lon = float(first["lat"]), float(first["lon"])
    # Sanity check: the hit must lie roughly within the Hagen region.
    lat_ok = HAGEN_LAT_RANGE[0] <= lat <= HAGEN_LAT_RANGE[1]
    lon_ok = HAGEN_LON_RANGE[0] <= lon <= HAGEN_LON_RANGE[1]
    return (lat, lon) if lat_ok and lon_ok else None


def geocode(
    client: "httpx.Client",
    name: str,
    delay: float = REQUEST_DELAY,
) -> Optional[Tuple[float, float]]:
    """Try to geocode a place within Hagen.

    Two queries are attempted in order: one restricted to HAGEN_BBOX, then
    a looser one that only names city, state and country.

    Args:
        client: Object with an ``httpx.Client``-compatible ``get`` method.
        name: Raw place name as stored in the database.
        delay: Seconds to sleep after every request.

    Returns:
        ``(lat, lon)`` of the first plausible hit, or ``None`` when no query
        produced one or a request failed.
    """
    clean = normalize_query(name)
    # normalize_query already appends ", Hagen" for "Hagen-<district>" names;
    # do not add the city a second time.
    base = clean if clean.lower().endswith(", hagen") else f"{clean}, Hagen"
    queries = [
        # Strictly inside the Hagen bounding box.
        (base, {"viewbox": HAGEN_BBOX, "bounded": 1}),
        # Looser fallback, only reached when the first attempt yields nothing.
        (f"{base}, NRW, Germany", {}),
    ]
    for query, extra_params in queries:
        params = {"q": query, "format": "json", "limit": 1, **extra_params}
        try:
            coords = _request_coords(client, params)
        except Exception as exc:
            # Deliberately best-effort: one bad request must not abort the
            # whole batch, but the cause should be visible.
            print(
                f"\n⚠️ Nominatim-Anfrage für {query!r} fehlgeschlagen: {exc}",
                file=sys.stderr,
            )
            coords = None
        # Sleep after every request, including successful ones; otherwise
        # consecutive hits would exceed the rate limit.
        time.sleep(delay)
        if coords is not None:
            return coords
    return None


def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the script."""
    parser = argparse.ArgumentParser(description="Geocode pending Orte in tracker.db")
    parser.add_argument("--limit", type=int, default=500,
                        help="Max Orte pro Durchlauf (Default: 500)")
    parser.add_argument("--retry-failed", action="store_true",
                        help="Auch fehlgeschlagene Orte erneut versuchen")
    parser.add_argument("--dry-run", action="store_true",
                        help="Nur anzeigen, nichts schreiben")
    return parser.parse_args()


def main():
    """Mark generic places as skipped, geocode the rest and print statistics."""
    args = _parse_args()

    # Chosen from two fixed literals, never from user input, so it is safe to
    # interpolate into the SQL text below.
    if args.retry_failed:
        status_filter = "geocode_status IN ('pending','failed')"
    else:
        status_filter = "geocode_status='pending'"

    with closing(sqlite3.connect(str(DB))) as conn:
        conn.row_factory = sqlite3.Row

        rows = conn.execute(
            f"SELECT id, name FROM orte WHERE {status_filter} "
            "ORDER BY vorlage_count DESC, id"
        ).fetchall()

        # Partition in Python so that --dry-run, which writes nothing, shows
        # the same candidates a real run would process.
        generic, candidates = [], []
        for row in rows:
            (generic if should_skip(row["name"]) else candidates).append(row)

        skipped = len(generic)
        if skipped:
            if not args.dry_run:
                conn.executemany(
                    "UPDATE orte SET geocode_status='skipped' WHERE id=?",
                    [(row["id"],) for row in generic],
                )
                conn.commit()
            print(f"⏭️ {skipped} generische Orte übersprungen")

        # A negative limit means "no limit", as it does for SQLite's LIMIT.
        pending = candidates if args.limit < 0 else candidates[:args.limit]
        total_pending = len(candidates)
        print(f"📍 Geocoding: {len(pending)} von {total_pending} pending Orten (Limit: {args.limit})")

        if args.dry_run:
            for row in pending[:20]:
                print(f" → {row['name']}")
            return

        import httpx  # runtime dependency, only needed for real runs

        success = 0
        failed = 0
        start = time.time()
        try:
            with httpx.Client() as client:
                for i, row in enumerate(pending, start=1):
                    coords = geocode(client, row["name"])
                    if coords is not None:
                        conn.execute(
                            "UPDATE orte SET lat=?, lon=?, geocode_status='success' WHERE id=?",
                            (coords[0], coords[1], row["id"]),
                        )
                        success += 1
                        sym = "✓"
                    else:
                        # NOTE(review): a transport error looks the same as
                        # "not found" here; rerun with --retry-failed after
                        # network problems.
                        conn.execute(
                            "UPDATE orte SET geocode_status='failed' WHERE id=?",
                            (row["id"],),
                        )
                        failed += 1
                        sym = "✗"

                    elapsed = time.time() - start
                    rate = i / elapsed if elapsed > 0 else 0
                    print(
                        f" [{i:4d}/{len(pending)}] {sym} {row['name'][:50]:<50s} "
                        f"(✓{success} ✗{failed} | {rate:.1f}/s)",
                        end="\r",
                    )

                    # Commit periodically so an abort loses little work.
                    if i % 25 == 0:
                        conn.commit()
        except KeyboardInterrupt:
            print("\n⚠️ Abgebrochen!")
        finally:
            conn.commit()

        elapsed = time.time() - start
        print(f"\n\n{'='*60}")
        print(f"✅ Fertig in {elapsed:.0f}s")
        print(f" ✓ {success} geocodiert")
        print(f" ✗ {failed} fehlgeschlagen")
        print(f" ⏭️ {skipped} übersprungen")

        # Overall status across the whole table.
        stats = conn.execute(
            "SELECT geocode_status, COUNT(*) FROM orte GROUP BY geocode_status"
        ).fetchall()

    print("\n📊 Gesamt:")
    for status, count in stats:
        print(f" {status}: {count}")


if __name__ == "__main__":
    main()