223 lines
6.5 KiB
Python
223 lines
6.5 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Geocodierung von Orten aus KI-Zusammenfassungen.
|
||
|
|
Nutzt Nominatim (OpenStreetMap) für Hagen-spezifische Orte.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import sqlite3
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
|
||
|
|
# Parent of the directory that contains this (resolved) script file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# SQLite database file opened by get_db().
DB_PATH = PROJECT_ROOT / "data" / "tracker_remote.db"

# Nominatim API (OpenStreetMap)
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
# Sent as the User-Agent header with every Nominatim request.
USER_AGENT = "Antragstracker-Hagen/1.0 (tobias.roedel@econgood.org)"

# Bounding box of Hagen (approximate); passed to Nominatim as the `viewbox`
# parameter together with `bounded=1`.
HAGEN_BBOX = "7.35,51.30,7.65,51.45"  # minLon,minLat,maxLon,maxLat

# Generic terms that should not be geocoded. Entries are lower-case because
# process_ort() lower-cases the place name before the membership test.
BLACKLIST = {
    "polizeiwache", "polizei", "feuerwehr", "krankenhaus", "rathaus",
    "aldi", "aldi-markt", "lidl", "rewe", "edeka", "penny", "netto",
    "schule", "grundschule", "gymnasium", "kindergarten", "kita",
    "spielplatz", "kirche", "friedhof", "sportplatz", "schwimmbad",
    "bushaltestelle", "bahnhof", "parkplatz", "parkhaus",
}
|
||
|
|
|
||
|
|
|
||
|
|
def get_db():
    """Open the SQLite database at ``DB_PATH`` and return the connection.

    The connection's row factory is ``sqlite3.Row``, so fetched rows can be
    read by column name as well as by position.
    """
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
|
||
|
|
|
||
|
|
|
||
|
|
def geocode_ort(client: httpx.Client, name: str) -> tuple[float, float] | None:
    """Geocode a place in Hagen via the Nominatim search API.

    Several phrasings of the query are tried in order, each restricted to
    the Hagen bounding box; the first phrasing that yields a hit wins.

    Args:
        client: HTTP client used to issue the requests.
        name: Place name to look up.

    Returns:
        ``(lat, lon)`` of the first hit, or ``None`` when no query variant
        produced a result. Request and response-parsing errors are printed
        and treated as a miss for that variant.
    """
    # Different search variants, from most to least specific.
    queries = (
        f"{name}, Hagen, Germany",
        f"{name}, Hagen",
        f"{name} Hagen",
    )

    for q in queries:
        try:
            resp = client.get(
                NOMINATIM_URL,
                params={
                    "q": q,
                    "format": "json",
                    "limit": 1,
                    "viewbox": HAGEN_BBOX,
                    "bounded": 1,
                },
                headers={"User-Agent": USER_AGENT},
                timeout=10,
            )
            resp.raise_for_status()

            results = resp.json()
            if results:
                top_hit = results[0]
                return (float(top_hit["lat"]), float(top_hit["lon"]))
        except (httpx.HTTPError, ValueError, KeyError, IndexError, TypeError) as e:
            # Transport/status errors, invalid JSON, or an unexpected payload
            # shape: report it and fall through to the next variant.
            print(f" Geocoding-Fehler für '{q}': {e}")
        finally:
            # Nominatim rate limit: 1 req/s. The pause sits in `finally` so it
            # also runs on the success path; otherwise the caller's next
            # lookup could fire immediately after this request.
            time.sleep(1.1)

    return None
|
||
|
|
|
||
|
|
|
||
|
|
def extract_orte_from_ki(conn: sqlite3.Connection) -> list[tuple[int, str, str]]:
    """Extract place mentions from ``ki_bewertungen.anmerkungen``.

    Every row with ``typ = 'zusammenfassung'`` is expected to carry a JSON
    object in ``anmerkungen``. The strings listed under ``betroffene_orte``
    are returned together with the row's ``kernforderung`` as context.

    Rows whose payload is not valid JSON, is not a JSON object, or has no
    list under ``betroffene_orte`` are skipped. Within a row, entries that
    are not strings or have two characters or fewer are skipped
    individually, so one malformed entry does not discard the others.

    Args:
        conn: Open connection to the tracker database.

    Returns:
        List of ``(vorlage_id, ort_name, kontext)`` tuples.
    """
    rows = conn.execute(
        """
        SELECT vorlage_id, anmerkungen
        FROM ki_bewertungen
        WHERE typ = 'zusammenfassung'
          AND anmerkungen IS NOT NULL
        """
    ).fetchall()

    orte = []
    for vorlage_id, anmerkungen in rows:
        try:
            data = json.loads(anmerkungen)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a column
            # value that is not str/bytes. Either way the row is unusable.
            continue

        if not isinstance(data, dict):
            continue

        # `or []` also covers an explicit JSON null.
        candidates = data.get("betroffene_orte") or []
        if not isinstance(candidates, (list, tuple)):
            continue

        kontext = data.get("kernforderung", "")
        orte.extend(
            (vorlage_id, ort, kontext)
            for ort in candidates
            if isinstance(ort, str) and len(ort) > 2
        )

    return orte
|
||
|
|
|
||
|
|
|
||
|
|
def _link_vorlage_to_ort(conn: sqlite3.Connection, vorlage_id: int,
                         ort_id: int, kontext: str) -> bool:
    """Insert the vorlage/ort link unless it is ignored; return True if a row was added."""
    cursor = conn.execute(
        """
        INSERT OR IGNORE INTO vorlagen_orte (vorlage_id, ort_id, kontext)
        VALUES (?, ?, ?)
        """,
        (vorlage_id, ort_id, kontext[:500] if kontext else None),
    )
    # rowcount is 0 when OR IGNORE suppressed the insert.
    # NOTE(review): suppression only happens if vorlagen_orte has a uniqueness
    # constraint covering (vorlage_id, ort_id) - schema not visible here.
    return cursor.rowcount > 0


def process_ort(conn: sqlite3.Connection, client: httpx.Client,
                vorlage_id: int, ort_name: str, kontext: str) -> bool:
    """Register one place mention and geocode the place if necessary.

    The place is created in ``orte`` if it does not exist yet, linked to the
    vorlage in ``vorlagen_orte``, and geocoded unless coordinates are
    already stored. ``vorlage_count`` is only incremented when the link row
    was actually inserted, so re-running over the same mentions does not
    inflate the counter.

    Args:
        conn: Open database connection; rows must be addressable by column
            name (``sqlite3.Row`` row factory).
        client: HTTP client handed through to ``geocode_ort``.
        vorlage_id: ID of the vorlage that mentions the place.
        ort_name: Raw place name; surrounding whitespace is stripped.
        kontext: Context text stored with the link, truncated to 500
            characters; an empty value is stored as NULL.

    Returns:
        ``True`` if the place has coordinates after this call (already
        stored or freshly geocoded); ``False`` if it was skipped
        (blacklisted or shorter than four characters) or not found.
    """
    # Normalize
    ort_name_clean = ort_name.strip()

    # Generic terms are not worth a lookup.
    if ort_name_clean.lower() in BLACKLIST:
        print(f" ⊘ {ort_name_clean} (generisch, übersprungen)")
        return False

    # Ignore names that are too short.
    if len(ort_name_clean) < 4:
        return False

    # Check whether the place already exists.
    existing = conn.execute(
        "SELECT id, lat, lon FROM orte WHERE name = ?",
        (ort_name_clean,),
    ).fetchone()

    if existing:
        ort_id = existing["id"]
        # Create the link if missing; count the vorlage only for a new link.
        if _link_vorlage_to_ort(conn, vorlage_id, ort_id, kontext):
            conn.execute(
                "UPDATE orte SET vorlage_count = vorlage_count + 1 WHERE id = ?",
                (ort_id,),
            )
        conn.commit()

        # `is not None` rather than truthiness: 0.0 is a valid coordinate.
        if existing["lat"] is not None:
            print(f" ✓ {ort_name_clean} (cached)")
            return True
        # Known place without coordinates yet: fall through and try again.
    else:
        # Create the new place; it starts with this one vorlage.
        cursor = conn.execute(
            "INSERT INTO orte (name, vorlage_count) VALUES (?, 1)",
            (ort_name_clean,),
        )
        ort_id = cursor.lastrowid
        _link_vorlage_to_ort(conn, vorlage_id, ort_id, kontext)
        conn.commit()

    # Geocode
    coords = geocode_ort(client, ort_name_clean)
    if not coords:
        print(f" ✗ {ort_name_clean} (nicht gefunden)")
        return False

    lat, lon = coords
    conn.execute(
        "UPDATE orte SET lat = ?, lon = ?, typ = 'geocoded' WHERE id = ?",
        (lat, lon, ort_id),
    )
    conn.commit()
    print(f" ✓ {ort_name_clean} → ({lat:.5f}, {lon:.5f})")
    return True
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Command-line entry point: geocode places mentioned in AI summaries.

    Reads the place mentions from the database, drops repeated
    (vorlage, place) pairs, processes at most ``--limit`` of them and prints
    summary statistics. The HTTP client and the database connection are
    closed even if processing raises.
    """
    parser = argparse.ArgumentParser(description="Geocodierung von Orten")
    # NOTE(review): the help text speaks of new places, but the loop below
    # counts every processed mention (cached and skipped ones included).
    parser.add_argument("--limit", type=int, default=50, help="Max. Anzahl neuer Orte")
    args = parser.parse_args()

    print("=== Geocodierung von Orten ===\n")

    conn = get_db()
    try:
        with httpx.Client() as client:
            # Extract places from the AI summaries.
            orte = extract_orte_from_ki(conn)
            print(f"Gefunden: {len(orte)} Ort-Erwähnungen\n")

            # Deduplicate on (vorlage_id, ort); the first context seen for a
            # pair is kept and the original order is preserved.
            unique_orte = {}
            for vorlage_id, ort, kontext in orte:
                unique_orte.setdefault((vorlage_id, ort), kontext)

            print(f"Unique: {len(unique_orte)} Verknüpfungen\n")

            # Process
            success = 0
            processed = 0
            for (vorlage_id, ort), kontext in unique_orte.items():
                if processed >= args.limit:
                    break
                if process_ort(conn, client, vorlage_id, ort, kontext):
                    success += 1
                processed += 1

        # Stats: process_ort commits its own writes, so the same connection
        # can be reused instead of closing and reopening the database.
        total_orte = conn.execute("SELECT COUNT(*) FROM orte").fetchone()[0]
        geocoded = conn.execute(
            "SELECT COUNT(*) FROM orte WHERE lat IS NOT NULL"
        ).fetchone()[0]
    finally:
        conn.close()

    print("\n=== Fertig ===")
    print(f"Orte gesamt: {total_orte}")
    print(f"Geocodiert: {geocoded}")
    print(f"Diese Runde: {success}/{processed}")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the geocoding pass only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|