#!/usr/bin/env python3
"""#16 Claim verification, stage 2: match claims against confirmation/contradiction in other episodes.

Procedure:
1. Load claims (default verifiable=1, optionally all).
2. Embed each claim text and find the best cosine-similarity candidate in other episodes.
3. If score >= MIN_SCORE, ask Qwen to classify the relation:
   'belegt' (supports) / 'widerspricht' (contradicts) / 'erweitert' (extends) / 'kein_bezug' (unrelated).
4. Write hits into the claim_matches table.

Usage:
  DASHSCOPE_API_KEY=... python3 match_claims.py [db-path] [limit]
Options:
  --cross-podcast           allow matches in other podcasts
  --include-non-verifiable  also match claims not marked as verifiable
"""
import json
import os
import sqlite3
import sys
import time
from pathlib import Path

import numpy as np
from openai import OpenAI

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from json_utils import parse_llm_json

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT / "backend"))
from database import get_all_embeddings  # noqa: E402

# CLI parsing: positional args are [db-path] [limit]; flags may appear in any
# position.  (Previously a flag passed as the first argument was mistaken for
# the database path.)
_positional = [a for a in sys.argv[1:] if not a.startswith("--")]
DB_PATH = _positional[0] if _positional else "data/db.sqlite"
LIMIT = int(_positional[1]) if len(_positional) > 1 else 500
CROSS_PODCAST = "--cross-podcast" in sys.argv
INCLUDE_NON_VERIFIABLE = "--include-non-verifiable" in sys.argv

API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
EMBED_MODEL = "text-embedding-v3"
LLM_MODEL = "qwen-plus"
EMBED_BATCH = 6
MIN_SCORE = 0.55          # minimum cosine similarity before the LLM is consulted
HARD_BUDGET_USD = 1.50    # hard stop once the estimated spend exceeds this
COST_IN = 0.0008 / 1000   # USD per prompt token
COST_OUT = 0.002 / 1000   # USD per completion token

# German prompt kept verbatim — it is part of the runtime behavior.
SYSTEM_PROMPT = """Du erhaeltst eine prueffbare Behauptung aus einem Podcast und einen Kandidat-Absatz aus einer anderen Episode. Pruefe die logische Beziehung des Kandidaten zur Behauptung. Antworte NUR mit JSON:
{"relation": "belegt" | "widerspricht" | "erweitert" | "kein_bezug", "reason": "1 Satz Begruendung"}
- "belegt": der Kandidat liefert Evidenz/Daten, die die Behauptung stuetzen.
- "widerspricht": der Kandidat widerspricht der Behauptung oder nennt Gegenargumente.
- "erweitert": der Kandidat ergaenzt die Behauptung um Aspekte, ohne ihr zuzustimmen oder zu widersprechen.
- "kein_bezug": kein logischer Bezug, auch wenn das Thema verwandt ist."""


def setup_db(db):
    """Create the claim_matches table and its indexes if they do not exist yet."""
    db.executescript("""
        CREATE TABLE IF NOT EXISTS claim_matches (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            claim_id INTEGER NOT NULL,
            target_podcast TEXT NOT NULL,
            target_episode TEXT NOT NULL,
            target_idx INTEGER NOT NULL,
            relation TEXT NOT NULL,
            reason TEXT,
            score REAL,
            FOREIGN KEY (claim_id) REFERENCES claims(id)
        );
        CREATE INDEX IF NOT EXISTS idx_claim_matches ON claim_matches(claim_id);
        CREATE INDEX IF NOT EXISTS idx_claim_matches_relation ON claim_matches(relation);
    """)


def fetch_claims(db):
    """Return up to LIMIT claims that have no match rows yet.

    Unless --include-non-verifiable was passed, only claims flagged
    verifiable=1 are selected.  The WHERE clause is assembled from
    fixed fragments only — no user input is interpolated.
    """
    where = "WHERE 1=1"
    if not INCLUDE_NON_VERIFIABLE:
        where += " AND c.verifiable = 1"
    # Skip claims already processed in a previous run (idempotent restarts).
    where += " AND NOT EXISTS (SELECT 1 FROM claim_matches m WHERE m.claim_id = c.id)"
    rows = db.execute(f"""
        SELECT c.id, c.podcast_id, c.episode_id, c.paragraph_idx,
               c.claim_text, c.claim_type
        FROM claims c
        {where}
        ORDER BY c.id
        LIMIT ?
    """, (LIMIT,)).fetchall()
    return rows


def embed_batch(client, texts):
    """Embed a batch of texts; returns one 1024-dim vector per input text."""
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts, dimensions=1024)
    return [item.embedding for item in resp.data]


def best_candidate(c_vec, vectors, meta, exclude_podcast, exclude_episode, allow_cross):
    """Find the highest-scoring paragraph outside the claim's own episode.

    Scans the top 30 cosine matches and returns the first one that is not in
    the excluded episode (and, unless allow_cross, stays within the same
    podcast).  Returns (meta_dict, score) or (None, 0.0) if nothing qualifies.
    Assumes c_vec and the rows of `vectors` are L2-normalized, so the dot
    product is the cosine similarity.
    """
    scores = vectors @ c_vec
    order = np.argsort(scores)[::-1]
    for idx in order[:30]:
        m = meta[idx]
        if m["podcast_id"] == exclude_podcast and m["episode_id"] == exclude_episode:
            continue  # never match a claim against its own episode
        if not allow_cross and m["podcast_id"] != exclude_podcast:
            continue
        return m, float(scores[idx])
    return None, 0.0


class Budget:
    """Track token usage and abort processing once a hard USD limit is hit."""

    def __init__(self, hard_limit_usd):
        self.hard_limit = hard_limit_usd
        self.tokens_in = 0
        self.tokens_out = 0

    def add(self, usage):
        """Accumulate token counts from an API usage object (may be None)."""
        if usage:
            # `or 0` guards against usage fields being present but None.
            self.tokens_in += getattr(usage, "prompt_tokens", 0) or 0
            self.tokens_out += getattr(usage, "completion_tokens", 0) or 0

    def cost(self):
        """Estimated spend in USD so far."""
        return self.tokens_in * COST_IN + self.tokens_out * COST_OUT

    def over(self):
        """True once the estimated spend exceeds the hard limit."""
        return self.cost() > self.hard_limit


def verify(client, claim_text, candidate_text, budget):
    """Ask the LLM how the candidate paragraph relates to the claim.

    Returns (parsed_json_dict, None) on success or (None, error_string) on
    failure.  API errors are retried once; JSON parse failures are not
    retried (the model already answered — retrying the same prompt is
    unlikely to help).
    """
    user_msg = (
        f"BEHAUPTUNG:\n\"{claim_text}\"\n\n"
        f"KANDIDAT-ABSATZ:\n\"{candidate_text[:1000]}\""
    )
    last_err = None
    for attempt in range(2):
        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.0,
                max_tokens=200,
            )
            budget.add(getattr(resp, "usage", None))
            content = resp.choices[0].message.content
            try:
                return parse_llm_json(content, expect="object"), None
            except ValueError as pe:
                last_err = f"parse: {pe}"
                break
        except Exception as e:
            last_err = str(e)
            if attempt < 1:
                time.sleep(2)
                continue
    return None, last_err


def main():
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=60.0, max_retries=1)
    db = sqlite3.connect(DB_PATH, timeout=30.0)
    db.execute("PRAGMA busy_timeout=30000")
    db.row_factory = sqlite3.Row
    setup_db(db)

    claims = fetch_claims(db)
    print(f"Claims zu matchen: {len(claims)}")
    if not claims:
        return

    print("Lade paragraph-embeddings…")
    vectors, meta = get_all_embeddings(None)
    if vectors is None:
        print("Keine Embeddings vorhanden — abbruch.")
        return
    # Zero out NaN vectors so they can never win the cosine ranking.
    nan_mask = np.isnan(vectors).any(axis=1)
    if nan_mask.any():
        print(f" {nan_mask.sum()} NaN-Vektoren maskiert")
        vectors[nan_mask] = 0
    print(f" {len(meta)} Vektoren geladen")

    # Embed all claim texts up front; failed batches yield None placeholders
    # so claim/vector lists stay aligned.
    print(f"Embedde {len(claims)} Claims…")
    c_vecs = []
    for i in range(0, len(claims), EMBED_BATCH):
        batch = claims[i:i + EMBED_BATCH]
        texts = [c["claim_text"][:500] for c in batch]
        try:
            embs = embed_batch(client, texts)
        except Exception as e:
            print(f" Embedding-Batch {i // EMBED_BATCH + 1} fehler: {e}")
            time.sleep(2)
            embs = [None] * len(batch)
        for emb in embs:
            if emb is None:
                c_vecs.append(None)
            else:
                v = np.array(emb, dtype=np.float32)
                v /= np.linalg.norm(v) or 1  # avoid division by zero
                c_vecs.append(v)

    budget = Budget(hard_limit_usd=HARD_BUDGET_USD)
    rel_counts = {"belegt": 0, "widerspricht": 0, "erweitert": 0, "kein_bezug": 0}
    skipped_low_score = 0
    parse_failed = 0

    for i, (c, c_vec) in enumerate(zip(claims, c_vecs)):
        if budget.over():
            print(f"!! Budget ({budget.cost():.4f} USD) erreicht — Abbruch nach {i}")
            break
        if c_vec is None:
            continue  # embedding failed for this claim
        cand, score = best_candidate(c_vec, vectors, meta,
                                     c["podcast_id"], c["episode_id"], CROSS_PODCAST)
        if cand is None or score < MIN_SCORE:
            skipped_low_score += 1
            continue
        # Guard against the embedding index referencing a paragraph that no
        # longer exists — fetchone() returns None in that case and the
        # previous code crashed with a TypeError.
        row = db.execute(
            "SELECT text FROM paragraphs WHERE id=?", (cand["id"],)
        ).fetchone()
        if row is None:
            continue
        cand_text = row["text"]

        result, err = verify(client, c["claim_text"], cand_text, budget)
        if result is None:
            parse_failed += 1
            continue
        rel = (result.get("relation") or "").lower()
        if rel not in rel_counts:
            rel = "kein_bezug"  # treat unknown labels as "no relation"
        rel_counts[rel] += 1
        if rel != "kein_bezug":
            db.execute(
                "INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
                "target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (c["id"], cand["podcast_id"], cand["episode_id"], cand["idx"],
                 rel, (result.get("reason") or "")[:500], score),
            )
        if (i + 1) % 20 == 0:
            db.commit()  # periodic commit so progress survives interruption
            print(f"  [{i+1}/{len(claims)}] belegt={rel_counts['belegt']} "
                  f"widerspricht={rel_counts['widerspricht']} erweitert={rel_counts['erweitert']} "
                  f"kein_bezug={rel_counts['kein_bezug']} skipped={skipped_low_score} "
                  f"cost=${budget.cost():.4f}")
        time.sleep(0.25)  # gentle rate limiting

    db.commit()
    print()
    print("=== Zusammenfassung ===")
    for k, v in rel_counts.items():
        print(f"  {k}: {v}")
    print(f"  skipped (score<{MIN_SCORE}): {skipped_low_score}")
    print(f"  parse-failures: {parse_failed}")
    print(f"  Tokens in={budget.tokens_in} out={budget.tokens_out}")
    print(f"  Kosten ~${budget.cost():.4f}")
    db.close()


if __name__ == "__main__":
    main()