From a5a0bcc260111f32d6cfa5c182b9a0514f2f5a4f Mon Sep 17 00:00:00 2001
From: Dotty Dotter
Date: Tue, 28 Apr 2026 10:44:57 +0200
Subject: [PATCH] match_claims.py: kein_bezug als Verarbeitungs-Marker speichern

Aenderungen:
- HARD_BUDGET_USD via env-var ueberschreibbar (Default 1.50).
- kein_bezug-Klassifikationen werden ebenfalls in claim_matches gespeichert,
  damit der NOT-EXISTS-Filter in fetch_claims sie als verarbeitet erkennt
  und Re-Runs nicht erneut Kosten erzeugen.
- backend /api/.../claims filtert kein_bezug aus match_counts und best_match
  raus, sodass das Frontend nur sinnvolle Verbindungen anzeigt.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 backend/app.py          | 12 +++++++-----
 scripts/match_claims.py | 17 +++++++++--------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/backend/app.py b/backend/app.py
index da335d5..d828ff0 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -137,26 +137,28 @@ def get_episode_claims(podcast_id: str, episode_id: str, claim_type: Optional[st
     rows = db.execute(sql, params).fetchall()
     claims_list = [dict(r) for r in rows]
 
-    # Match-Counts und besten Match je claim_id anhaengen, falls Tabelle existiert
+    # Match-Counts und besten Match je claim_id anhaengen, falls Tabelle existiert.
+    # kein_bezug wird gefiltert (dient nur als Verarbeitungs-Marker fuer das Skript).
     if claims_list and _table_exists(db, "claim_matches"):
         ids = [c["id"] for c in claims_list]
         placeholder = ",".join("?" * len(ids))
         match_rows = db.execute(
             f"SELECT claim_id, relation, COUNT(*) c FROM claim_matches "
-            f"WHERE claim_id IN ({placeholder}) GROUP BY claim_id, relation",
+            f"WHERE claim_id IN ({placeholder}) AND relation != 'kein_bezug' "
+            f"GROUP BY claim_id, relation",
             ids,
         ).fetchall()
         counts = {}
         for r in match_rows:
             counts.setdefault(r["claim_id"], {})[r["relation"]] = r["c"]
-        # bester Match je claim (fuer Quick-Link)
+        # bester Match je claim (fuer Quick-Link), kein_bezug ausblenden
         best_rows = db.execute(
             f"SELECT cm.claim_id, cm.relation, cm.target_podcast, cm.target_episode, "
             f"cm.target_idx, cm.reason, cm.score "
             f"FROM claim_matches cm "
-            f"WHERE cm.claim_id IN ({placeholder}) "
+            f"WHERE cm.claim_id IN ({placeholder}) AND cm.relation != 'kein_bezug' "
             f"AND cm.id IN (SELECT MIN(id) FROM claim_matches WHERE claim_id IN ({placeholder}) "
-            f"GROUP BY claim_id) ",
+            f"AND relation != 'kein_bezug' GROUP BY claim_id) ",
             ids + ids,
         ).fetchall()
         best = {r["claim_id"]: dict(r) for r in best_rows}
diff --git a/scripts/match_claims.py b/scripts/match_claims.py
index 8bcdc43..bd6ae1e 100644
--- a/scripts/match_claims.py
+++ b/scripts/match_claims.py
@@ -43,7 +43,7 @@ EMBED_MODEL = "text-embedding-v3"
 LLM_MODEL = "qwen-plus"
 EMBED_BATCH = 6
 MIN_SCORE = 0.55
-HARD_BUDGET_USD = 1.50
+HARD_BUDGET_USD = float(os.environ.get("HARD_BUDGET_USD", "1.50"))
 COST_IN = 0.0008 / 1000
 COST_OUT = 0.002 / 1000
 
@@ -237,13 +237,14 @@ def main():
                 rel = "kein_bezug"
             rel_counts[rel] += 1
 
-            if rel != "kein_bezug":
-                db.execute(
-                    "INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
-                    "target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
-                    (c["id"], cand["podcast_id"], cand["episode_id"], cand["idx"],
-                     rel, (result.get("reason") or "")[:500], score),
-                )
+            # Auch kein_bezug speichern, damit der Claim als verarbeitet gilt und im
+            # Re-Run nicht erneut Kosten erzeugt. Frontend filtert kein_bezug raus.
+            db.execute(
+                "INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
+                "target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                (c["id"], cand["podcast_id"], cand["episode_id"], cand["idx"],
+                 rel, (result.get("reason") or "")[:500], score),
+            )
 
         if (i + 1) % 20 == 0:
             db.commit()