match_claims.py: kein_bezug als Verarbeitungs-Marker speichern
Aenderungen:
- HARD_BUDGET_USD via env-var ueberschreibbar (Default 1.50).
- kein_bezug-Klassifikationen werden ebenfalls in claim_matches gespeichert, damit der NOT-EXISTS-Filter in fetch_claims sie als verarbeitet erkennt und Re-Runs nicht erneut Kosten erzeugen.
- Backend /api/.../claims filtert kein_bezug aus match_counts und best_match heraus, sodass das Frontend nur sinnvolle Verbindungen anzeigt.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d7a0ed2715
commit
a5a0bcc260
@ -137,26 +137,28 @@ def get_episode_claims(podcast_id: str, episode_id: str, claim_type: Optional[st
|
|||||||
rows = db.execute(sql, params).fetchall()
|
rows = db.execute(sql, params).fetchall()
|
||||||
claims_list = [dict(r) for r in rows]
|
claims_list = [dict(r) for r in rows]
|
||||||
|
|
||||||
# Match-Counts und besten Match je claim_id anhaengen, falls Tabelle existiert
|
# Match-Counts und besten Match je claim_id anhaengen, falls Tabelle existiert.
|
||||||
|
# kein_bezug wird gefiltert (dient nur als Verarbeitungs-Marker fuer das Skript).
|
||||||
if claims_list and _table_exists(db, "claim_matches"):
|
if claims_list and _table_exists(db, "claim_matches"):
|
||||||
ids = [c["id"] for c in claims_list]
|
ids = [c["id"] for c in claims_list]
|
||||||
placeholder = ",".join("?" * len(ids))
|
placeholder = ",".join("?" * len(ids))
|
||||||
match_rows = db.execute(
|
match_rows = db.execute(
|
||||||
f"SELECT claim_id, relation, COUNT(*) c FROM claim_matches "
|
f"SELECT claim_id, relation, COUNT(*) c FROM claim_matches "
|
||||||
f"WHERE claim_id IN ({placeholder}) GROUP BY claim_id, relation",
|
f"WHERE claim_id IN ({placeholder}) AND relation != 'kein_bezug' "
|
||||||
|
f"GROUP BY claim_id, relation",
|
||||||
ids,
|
ids,
|
||||||
).fetchall()
|
).fetchall()
|
||||||
counts = {}
|
counts = {}
|
||||||
for r in match_rows:
|
for r in match_rows:
|
||||||
counts.setdefault(r["claim_id"], {})[r["relation"]] = r["c"]
|
counts.setdefault(r["claim_id"], {})[r["relation"]] = r["c"]
|
||||||
# bester Match je claim (fuer Quick-Link)
|
# bester Match je claim (fuer Quick-Link), kein_bezug ausblenden
|
||||||
best_rows = db.execute(
|
best_rows = db.execute(
|
||||||
f"SELECT cm.claim_id, cm.relation, cm.target_podcast, cm.target_episode, "
|
f"SELECT cm.claim_id, cm.relation, cm.target_podcast, cm.target_episode, "
|
||||||
f"cm.target_idx, cm.reason, cm.score "
|
f"cm.target_idx, cm.reason, cm.score "
|
||||||
f"FROM claim_matches cm "
|
f"FROM claim_matches cm "
|
||||||
f"WHERE cm.claim_id IN ({placeholder}) "
|
f"WHERE cm.claim_id IN ({placeholder}) AND cm.relation != 'kein_bezug' "
|
||||||
f"AND cm.id IN (SELECT MIN(id) FROM claim_matches WHERE claim_id IN ({placeholder}) "
|
f"AND cm.id IN (SELECT MIN(id) FROM claim_matches WHERE claim_id IN ({placeholder}) "
|
||||||
f"GROUP BY claim_id) ",
|
f"AND relation != 'kein_bezug' GROUP BY claim_id) ",
|
||||||
ids + ids,
|
ids + ids,
|
||||||
).fetchall()
|
).fetchall()
|
||||||
best = {r["claim_id"]: dict(r) for r in best_rows}
|
best = {r["claim_id"]: dict(r) for r in best_rows}
|
||||||
|
|||||||
@ -43,7 +43,7 @@ EMBED_MODEL = "text-embedding-v3"
|
|||||||
LLM_MODEL = "qwen-plus"
|
LLM_MODEL = "qwen-plus"
|
||||||
EMBED_BATCH = 6
|
EMBED_BATCH = 6
|
||||||
MIN_SCORE = 0.55
|
MIN_SCORE = 0.55
|
||||||
HARD_BUDGET_USD = 1.50
|
HARD_BUDGET_USD = float(os.environ.get("HARD_BUDGET_USD", "1.50"))
|
||||||
|
|
||||||
COST_IN = 0.0008 / 1000
|
COST_IN = 0.0008 / 1000
|
||||||
COST_OUT = 0.002 / 1000
|
COST_OUT = 0.002 / 1000
|
||||||
@ -237,7 +237,8 @@ def main():
|
|||||||
rel = "kein_bezug"
|
rel = "kein_bezug"
|
||||||
rel_counts[rel] += 1
|
rel_counts[rel] += 1
|
||||||
|
|
||||||
if rel != "kein_bezug":
|
# Auch kein_bezug speichern, damit der Claim als verarbeitet gilt und im
|
||||||
|
# Re-Run nicht erneut Kosten erzeugt. Frontend filtert kein_bezug raus.
|
||||||
db.execute(
|
db.execute(
|
||||||
"INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
|
"INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
|
||||||
"target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
|
"target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user