match_claims.py: kein_bezug als Verarbeitungs-Marker speichern

Aenderungen:
- HARD_BUDGET_USD ist per Umgebungsvariable ueberschreibbar (Default 1.50).
- kein_bezug-Klassifikationen werden ebenfalls in claim_matches gespeichert, damit der NOT-EXISTS-Filter in fetch_claims sie als verarbeitet erkennt und Re-Runs nicht erneut Kosten erzeugen.
- Backend: /api/.../claims filtert kein_bezug aus match_counts und best_match heraus, sodass das Frontend nur sinnvolle Verbindungen anzeigt.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 10:44:57 +02:00 · 2026-04-28 10:44:57 +02:00 · a5a0bcc260
commit a5a0bcc260
parent d7a0ed2715
2 changed files with 16 additions and 13 deletions
--- a/backend/app.py
+++ b/backend/app.py
@@ -137,26 +137,28 @@ def get_episode_claims(podcast_id: str, episode_id: str, claim_type: Optional[st
    rows = db.execute(sql, params).fetchall()
    claims_list = [dict(r) for r in rows]
-    # Match-Counts und besten Match je claim_id anhaengen, falls Tabelle existiert
+    # Match-Counts und besten Match je claim_id anhaengen, falls Tabelle existiert.
+    # kein_bezug wird gefiltert (dient nur als Verarbeitungs-Marker fuer das Skript).
    if claims_list and _table_exists(db, "claim_matches"):
        ids = [c["id"] for c in claims_list]
        placeholder = ",".join("?" * len(ids))
        match_rows = db.execute(
            f"SELECT claim_id, relation, COUNT(*) c FROM claim_matches "
-            f"WHERE claim_id IN ({placeholder}) GROUP BY claim_id, relation",
+            f"WHERE claim_id IN ({placeholder}) AND relation != 'kein_bezug' "
+            f"GROUP BY claim_id, relation",
            ids,
        ).fetchall()
        counts = {}
        for r in match_rows:
            counts.setdefault(r["claim_id"], {})[r["relation"]] = r["c"]
-        # bester Match je claim (fuer Quick-Link)
+        # bester Match je claim (fuer Quick-Link), kein_bezug ausblenden
        best_rows = db.execute(
            f"SELECT cm.claim_id, cm.relation, cm.target_podcast, cm.target_episode, "
            f"cm.target_idx, cm.reason, cm.score "
            f"FROM claim_matches cm "
-            f"WHERE cm.claim_id IN ({placeholder}) "
+            f"WHERE cm.claim_id IN ({placeholder}) AND cm.relation != 'kein_bezug' "
            f"AND cm.id IN (SELECT MIN(id) FROM claim_matches WHERE claim_id IN ({placeholder}) "
-            f"GROUP BY claim_id) ",
+            f"AND relation != 'kein_bezug' GROUP BY claim_id) ",
            ids + ids,
        ).fetchall()
        best = {r["claim_id"]: dict(r) for r in best_rows}
--- a/scripts/match_claims.py
+++ b/scripts/match_claims.py
@@ -43,7 +43,7 @@ EMBED_MODEL = "text-embedding-v3"
 LLM_MODEL = "qwen-plus"
 EMBED_BATCH = 6
 MIN_SCORE = 0.55
-HARD_BUDGET_USD = 1.50
+HARD_BUDGET_USD = float(os.environ.get("HARD_BUDGET_USD", "1.50"))
 COST_IN = 0.0008 / 1000
 COST_OUT = 0.002 / 1000
@@ -237,7 +237,8 @@ def main():
            rel = "kein_bezug"
        rel_counts[rel] += 1
-        if rel != "kein_bezug":
+        # Auch kein_bezug speichern, damit der Claim als verarbeitet gilt und im
+        # Re-Run nicht erneut Kosten erzeugt. Frontend filtert kein_bezug raus.
        db.execute(
            "INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
            "target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",