match_claims.py: kein_bezug als Verarbeitungs-Marker speichern

Aenderungen:
- HARD_BUDGET_USD via env-var ueberschreibbar (Default 1.50).
- kein_bezug-Klassifikationen werden ebenfalls in claim_matches gespeichert,
  damit der NOT-EXISTS-Filter in fetch_claims sie als verarbeitet erkennt und
  Re-Runs nicht erneut Kosten erzeugen.
- Der Backend-Endpunkt /api/.../claims filtert kein_bezug aus match_counts und
  best_match heraus, sodass das Frontend nur sinnvolle Verbindungen anzeigt.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dotty Dotter 2026-04-28 10:44:57 +02:00
parent d7a0ed2715
commit a5a0bcc260
2 changed files with 16 additions and 13 deletions

View File

@@ -137,26 +137,28 @@ def get_episode_claims(podcast_id: str, episode_id: str, claim_type: Optional[st
rows = db.execute(sql, params).fetchall() rows = db.execute(sql, params).fetchall()
claims_list = [dict(r) for r in rows] claims_list = [dict(r) for r in rows]
# Match-Counts und besten Match je claim_id anhaengen, falls Tabelle existiert # Match-Counts und besten Match je claim_id anhaengen, falls Tabelle existiert.
# kein_bezug wird gefiltert (dient nur als Verarbeitungs-Marker fuer das Skript).
if claims_list and _table_exists(db, "claim_matches"): if claims_list and _table_exists(db, "claim_matches"):
ids = [c["id"] for c in claims_list] ids = [c["id"] for c in claims_list]
placeholder = ",".join("?" * len(ids)) placeholder = ",".join("?" * len(ids))
match_rows = db.execute( match_rows = db.execute(
f"SELECT claim_id, relation, COUNT(*) c FROM claim_matches " f"SELECT claim_id, relation, COUNT(*) c FROM claim_matches "
f"WHERE claim_id IN ({placeholder}) GROUP BY claim_id, relation", f"WHERE claim_id IN ({placeholder}) AND relation != 'kein_bezug' "
f"GROUP BY claim_id, relation",
ids, ids,
).fetchall() ).fetchall()
counts = {} counts = {}
for r in match_rows: for r in match_rows:
counts.setdefault(r["claim_id"], {})[r["relation"]] = r["c"] counts.setdefault(r["claim_id"], {})[r["relation"]] = r["c"]
# bester Match je claim (fuer Quick-Link) # bester Match je claim (fuer Quick-Link), kein_bezug ausblenden
best_rows = db.execute( best_rows = db.execute(
f"SELECT cm.claim_id, cm.relation, cm.target_podcast, cm.target_episode, " f"SELECT cm.claim_id, cm.relation, cm.target_podcast, cm.target_episode, "
f"cm.target_idx, cm.reason, cm.score " f"cm.target_idx, cm.reason, cm.score "
f"FROM claim_matches cm " f"FROM claim_matches cm "
f"WHERE cm.claim_id IN ({placeholder}) " f"WHERE cm.claim_id IN ({placeholder}) AND cm.relation != 'kein_bezug' "
f"AND cm.id IN (SELECT MIN(id) FROM claim_matches WHERE claim_id IN ({placeholder}) " f"AND cm.id IN (SELECT MIN(id) FROM claim_matches WHERE claim_id IN ({placeholder}) "
f"GROUP BY claim_id) ", f"AND relation != 'kein_bezug' GROUP BY claim_id) ",
ids + ids, ids + ids,
).fetchall() ).fetchall()
best = {r["claim_id"]: dict(r) for r in best_rows} best = {r["claim_id"]: dict(r) for r in best_rows}

View File

@@ -43,7 +43,7 @@ EMBED_MODEL = "text-embedding-v3"
LLM_MODEL = "qwen-plus" LLM_MODEL = "qwen-plus"
EMBED_BATCH = 6 EMBED_BATCH = 6
MIN_SCORE = 0.55 MIN_SCORE = 0.55
HARD_BUDGET_USD = 1.50 HARD_BUDGET_USD = float(os.environ.get("HARD_BUDGET_USD", "1.50"))
COST_IN = 0.0008 / 1000 COST_IN = 0.0008 / 1000
COST_OUT = 0.002 / 1000 COST_OUT = 0.002 / 1000
@@ -237,13 +237,14 @@ def main():
rel = "kein_bezug" rel = "kein_bezug"
rel_counts[rel] += 1 rel_counts[rel] += 1
if rel != "kein_bezug": # Auch kein_bezug speichern, damit der Claim als verarbeitet gilt und im
db.execute( # Re-Run nicht erneut Kosten erzeugt. Frontend filtert kein_bezug raus.
"INSERT INTO claim_matches (claim_id, target_podcast, target_episode, " db.execute(
"target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)", "INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
(c["id"], cand["podcast_id"], cand["episode_id"], cand["idx"], "target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
rel, (result.get("reason") or "")[:500], score), (c["id"], cand["podcast_id"], cand["episode_id"], cand["idx"],
) rel, (result.get("reason") or "")[:500], score),
)
if (i + 1) % 20 == 0: if (i + 1) % 20 == 0:
db.commit() db.commit()