- scripts/geocode_pending.py: Nominatim mit Hagen-Fokus, Rate-Limiting 1/s - 2.293 Orte geocodiert (vorher 608), 31k+ noch offen (läuft weiter) - Karte: Marker-Clustering für bessere Performance - 27k+ Orte-Einträge → Clustering nötig Teilweise Closes #5, Closes #6
198 lines
6.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Geocodiert pending Orte via Nominatim (1 req/s, Hagen-fokussiert)."""
|
|
import argparse
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Optional, Tuple
|
|
|
|
import httpx
|
|
|
|
# SQLite database path, resolved relative to this script's location
# (<repo>/data/tracker.db).
DB = Path(__file__).resolve().parent.parent / "data" / "tracker.db"

# Nominatim search endpoint (OpenStreetMap geocoder).
NOMINATIM = "https://nominatim.openstreetmap.org/search"

# Identifying User-Agent with contact address, required by the
# Nominatim usage policy.
UA = "Antragstracker-Hagen/1.0 (tobias.roedel@econgood.org)"

# Bounding box around Hagen as "min_lon,min_lat,max_lon,max_lat"
# (the lon,lat,lon,lat order Nominatim's viewbox parameter expects).
HAGEN_BBOX = "7.35,51.30,7.65,51.45"

# Place names too generic for meaningful geocoding (matched
# case-insensitively against the lowercased name in should_skip).
SKIP_PATTERNS = [
    r"^hagen$", r"^hagen,?\s*(nordrhein-westfalen|nrw)$",
    r"^stadtgebiet", r"^gesamtes?\s+stadtgebiet",
    r"^(bab|a)\s*\d", r"^bundesstraße\s+\d", r"^b\s*\d+$",
    r"^(alle|diverse|verschiedene)\s+", r"^(stadt|kreis)\s+hagen$",
]
|
|
|
|
|
|
def should_skip(name: str) -> bool:
    """Return True for place names too generic to geocode meaningfully."""
    candidate = name.strip().lower()
    # Known-generic names (whole city, highways, "various places", ...).
    if any(re.search(pattern, candidate) for pattern in SKIP_PATTERNS):
        return True
    # Anything shorter than three characters carries too little information.
    return len(candidate) < 3
|
|
|
|
|
|
def normalize_query(name: str) -> str:
    """Prepare a place name for use as a Nominatim query string."""
    query = name.strip()
    # Drop a trailing "Hagen" (with optional comma) so the city name is
    # not duplicated once the caller appends ", Hagen" to the query.
    query = re.sub(r',?\s*Hagen\s*$', '', query, flags=re.IGNORECASE)
    query = query.strip().rstrip(',').strip()
    # Keep district names: "Hagen-Hohenlimburg" -> "Hohenlimburg, Hagen".
    if query.lower().startswith('hagen-'):
        query = f"{query[6:].strip()}, Hagen"
    return query
|
|
|
|
|
|
def geocode(client: httpx.Client, name: str) -> Optional[Tuple[float, float]]:
    """Try to geocode a place within Hagen via Nominatim.

    Issues up to three queries from strict (bounded to the Hagen
    bounding box) to loose (original name plus ", Germany") and returns
    the first (lat, lon) that falls roughly inside the Hagen region,
    or None if no query produced a plausible hit.

    Always sleeps >= 1 s after each HTTP request to honour the
    Nominatim usage policy (max 1 request per second).
    """
    clean = normalize_query(name)

    queries = [
        # Strictly inside the Hagen bounding box
        (f"{clean}, Hagen", {"viewbox": HAGEN_BBOX, "bounded": 1}),
        # Slightly looser
        (f"{clean}, Hagen, NRW, Germany", {}),
        # Original name as a last-resort fallback
        (f"{name}, Germany", {}),
    ]

    for q, extra_params in queries:
        params = {"q": q, "format": "json", "limit": 1, "addressdetails": 1}
        params.update(extra_params)
        try:
            r = client.get(
                NOMINATIM, params=params,
                headers={"User-Agent": UA}, timeout=10
            )
            if r.status_code == 200:
                # Parse the body once instead of twice.
                results = r.json()
                if results:
                    d = results[0]
                    lat, lon = float(d["lat"]), float(d["lon"])
                    # Sanity check: result must lie roughly in the Hagen region.
                    if 51.25 <= lat <= 51.50 and 7.30 <= lon <= 7.70:
                        return lat, lon
        except Exception:
            # Deliberate best-effort: network/JSON errors just fall
            # through to the next, looser query.
            pass
        finally:
            # BUG FIX: previously the sleep was skipped on a successful
            # return, so back-to-back successful lookups exceeded the
            # 1 req/s limit. `finally` guarantees the pause on every path.
            time.sleep(1.1)  # Nominatim policy: 1 req/s

    return None
|
|
|
|
|
|
def main():
    """CLI entry point: geocode pending places in tracker.db.

    Flags:
      --limit N        max places per run (default 500)
      --retry-failed   also retry places previously marked 'failed'
      --dry-run        only report what would be done, write nothing

    Side effects: updates rows in the `orte` table (lat/lon plus
    geocode_status 'success'/'failed'/'skipped') and prints per-place
    progress and a final summary.
    """
    parser = argparse.ArgumentParser(description="Geocode pending Orte in tracker.db")
    parser.add_argument("--limit", type=int, default=500,
                        help="Max Orte pro Durchlauf (Default: 500)")
    parser.add_argument("--retry-failed", action="store_true",
                        help="Auch fehlgeschlagene Orte erneut versuchen")
    parser.add_argument("--dry-run", action="store_true",
                        help="Nur anzeigen, nichts schreiben")
    args = parser.parse_args()

    conn = sqlite3.connect(str(DB))
    conn.row_factory = sqlite3.Row

    # Which statuses to (re-)process.
    status_filter = "geocode_status='pending'"
    if args.retry_failed:
        status_filter = "geocode_status IN ('pending','failed')"

    # First pass: mark names that are too generic to geocode as 'skipped'.
    generics = conn.execute(
        f"SELECT id, name FROM orte WHERE {status_filter}"
    ).fetchall()

    skipped = 0
    for row in generics:
        if should_skip(row["name"]):
            if not args.dry_run:
                conn.execute(
                    "UPDATE orte SET geocode_status='skipped' WHERE id=?",
                    (row["id"],)
                )
            skipped += 1
    if skipped:
        conn.commit()
        print(f"⏭️ {skipped} generische Orte übersprungen")

    # Fetch the geocodable batch, most-referenced places first.
    pending = conn.execute(
        f"SELECT id, name FROM orte WHERE {status_filter} "
        "ORDER BY vorlage_count DESC, id LIMIT ?",
        (args.limit,)
    ).fetchall()

    total_pending = conn.execute(
        f"SELECT COUNT(*) FROM orte WHERE {status_filter}"
    ).fetchone()[0]

    print(f"📍 Geocoding: {len(pending)} von {total_pending} pending Orten (Limit: {args.limit})")
    if args.dry_run:
        for row in pending[:20]:
            print(f" → {row['name']}")
        # BUG FIX: the connection used to leak on this early return.
        conn.close()
        return

    success = 0
    failed = 0
    client = httpx.Client()
    start = time.time()

    try:
        for i, row in enumerate(pending):
            coords = geocode(client, row["name"])
            if coords:
                conn.execute(
                    "UPDATE orte SET lat=?, lon=?, geocode_status='success' WHERE id=?",
                    (coords[0], coords[1], row["id"])
                )
                success += 1
                sym = "✓"
            else:
                conn.execute(
                    "UPDATE orte SET geocode_status='failed' WHERE id=?",
                    (row["id"],)
                )
                failed += 1
                sym = "✗"

            # Single-line progress display (overwritten via end="\r").
            elapsed = time.time() - start
            rate = (i + 1) / elapsed if elapsed > 0 else 0
            print(
                f" [{i+1:4d}/{len(pending)}] {sym} {row['name'][:50]:<50s} "
                f"(✓{success} ✗{failed} | {rate:.1f}/s)",
                end="\r"
            )

            # Commit periodically so an interrupt loses at most 25 updates.
            if (i + 1) % 25 == 0:
                conn.commit()
    except KeyboardInterrupt:
        print("\n⚠️ Abgebrochen!")
    finally:
        conn.commit()
        conn.close()
        client.close()

    elapsed = time.time() - start
    print(f"\n\n{'='*60}")
    print(f"✅ Fertig in {elapsed:.0f}s")
    print(f" ✓ {success} geocodiert")
    print(f" ✗ {failed} fehlgeschlagen")
    print(f" ⏭️ {skipped} übersprungen")

    # Overall status distribution (fresh connection: the main one is closed).
    conn2 = sqlite3.connect(str(DB))
    stats = conn2.execute(
        "SELECT geocode_status, COUNT(*) FROM orte GROUP BY geocode_status"
    ).fetchall()
    conn2.close()
    print(f"\n📊 Gesamt:")
    for status, count in stats:
        print(f" {status}: {count}")
|
|
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|