podcast-mindmap/scripts/match_claims.py

#!/usr/bin/env python3
"""#16 Claim-Verification Stufe 2: matche Claims gegen Bestaetigung/Widerspruch in anderen Episoden.
Vorgehen:
1. Lade Claims (default verifiable=1, optional alle).
2. Embedde jeden Claim-Text und suche per Cosinus den besten Kandidaten in anderen Episoden.
3. Bei score >= MIN_SCORE: Qwen-Verifikation: 'belegt' / 'widerspricht' / 'erweitert' / 'kein_bezug'.
4. Schreibe Treffer in neue Tabelle claim_matches.
Nutzung:
DASHSCOPE_API_KEY=... python3 match_claims.py [db-pfad] [limit]
Optionen:
--cross-podcast erlaubt Treffer in anderen Podcasts
--include-non-verifiable matche auch Claims, die nicht als verifizierbar markiert sind
"""
import json
import os
import sqlite3
import sys
import time
from pathlib import Path
import numpy as np
from openai import OpenAI
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from json_utils import parse_llm_json
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT / "backend"))
from database import get_all_embeddings # noqa: E402
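
# Minimal hand-rolled CLI: two optional positional arguments (db path, limit)
# plus boolean "--" flags; a positional slot is only used when the token is
# not a flag.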
DB_PATH = sys.argv[1] if len(sys.argv) > 1 and not sys.argv[1].startswith("--") else "data/db.sqlite"
LIMIT = int(sys.argv[2]) if len(sys.argv) > 2 and not sys.argv[2].startswith("--") else 500
CROSS_PODCAST = "--cross-podcast" in sys.argv
INCLUDE_NON_VERIFIABLE = "--include-non-verifiable" in sys.argv
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
EMBED_MODEL = "text-embedding-v3"
LLM_MODEL = "qwen-plus"
EMBED_BATCH = 6
MIN_SCORE = 0.55
HARD_BUDGET_USD = float(os.environ.get("HARD_BUDGET_USD", "1.50"))
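# Assumed per-token prices in USD (i.e. $0.0008 / $0.002 per 1K tokens);
# check the current DashScope qwen-plus rates before trusting the budget math.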
COST_IN = 0.0008 / 1000
COST_OUT = 0.002 / 1000
SYSTEM_PROMPT = """Du erhaeltst eine prueffbare Behauptung aus einem Podcast und einen Kandidat-Absatz aus einer anderen Episode.
Pruefe die logische Beziehung des Kandidaten zur Behauptung. Antworte NUR mit JSON:
{"relation": "belegt" | "widerspricht" | "erweitert" | "kein_bezug", "reason": "1 Satz Begruendung"}
- "belegt": der Kandidat liefert Evidenz/Daten, die die Behauptung stuetzen.
- "widerspricht": der Kandidat widerspricht der Behauptung oder nennt Gegenargumente.
- "erweitert": der Kandidat ergaenzt die Behauptung um Aspekte, ohne ihr zuzustimmen oder zu widersprechen.
- "kein_bezug": kein logischer Bezug, auch wenn das Thema verwandt ist."""


def setup_db(db):
    db.executescript("""
        CREATE TABLE IF NOT EXISTS claim_matches (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            claim_id INTEGER NOT NULL,
            target_podcast TEXT NOT NULL,
            target_episode TEXT NOT NULL,
            target_idx INTEGER NOT NULL,
            relation TEXT NOT NULL,
            reason TEXT,
            score REAL,
            FOREIGN KEY (claim_id) REFERENCES claims(id)
        );
        CREATE INDEX IF NOT EXISTS idx_claim_matches ON claim_matches(claim_id);
        CREATE INDEX IF NOT EXISTS idx_claim_matches_relation ON claim_matches(relation);
    """)


def fetch_claims(db):
    where = "WHERE 1=1"
    if not INCLUDE_NON_VERIFIABLE:
        where += " AND c.verifiable = 1"
    where += " AND NOT EXISTS (SELECT 1 FROM claim_matches m WHERE m.claim_id = c.id)"
    rows = db.execute(f"""
        SELECT c.id, c.podcast_id, c.episode_id, c.paragraph_idx, c.claim_text, c.claim_type
        FROM claims c
        {where}
        ORDER BY c.id
        LIMIT ?
    """, (LIMIT,)).fetchall()
    return rows


def embed_batch(client, texts):
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts, dimensions=1024)
    return [item.embedding for item in resp.data]
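

# best_candidate assumes both the corpus matrix and c_vec are L2-normalized
# (claim vectors are normalized after embedding in main()), so the plain dot
# product equals cosine similarity; only the top 30 hits are scanned for an
# admissible episode.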
def best_candidate(c_vec, vectors, meta, exclude_podcast, exclude_episode, allow_cross):
    scores = vectors @ c_vec
    order = np.argsort(scores)[::-1]
    for idx in order[:30]:
        m = meta[idx]
        if m["podcast_id"] == exclude_podcast and m["episode_id"] == exclude_episode:
            continue
        if not allow_cross and m["podcast_id"] != exclude_podcast:
            continue
        return m, float(scores[idx])
    return None, 0.0
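

# Budget tracks cumulative token usage against a hard USD ceiling. add()
# tolerates responses without a usage object, so a missing field never aborts
# the run.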
class Budget:
    def __init__(self, hard_limit_usd):
        self.hard_limit = hard_limit_usd
        self.tokens_in = 0
        self.tokens_out = 0

    def add(self, usage):
        if usage:
            self.tokens_in += getattr(usage, "prompt_tokens", 0) or 0
            self.tokens_out += getattr(usage, "completion_tokens", 0) or 0

    def cost(self):
        return self.tokens_in * COST_IN + self.tokens_out * COST_OUT

    def over(self):
        return self.cost() > self.hard_limit
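

# verify() retries once on transport errors, but not on JSON parse failures
# (the break below): a malformed answer is returned as an error and counted by
# the caller instead of being re-billed.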
def verify(client, claim_text, candidate_text, budget):
    user_msg = (
        f"CLAIM:\n\"{claim_text}\"\n\n"
        f"CANDIDATE PARAGRAPH:\n\"{candidate_text[:1000]}\""
    )
    last_err = None
    for attempt in range(2):
        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.0,
                max_tokens=200,
            )
            budget.add(getattr(resp, "usage", None))
            content = resp.choices[0].message.content
            try:
                return parse_llm_json(content, expect="object"), None
            except ValueError as pe:
                last_err = f"parse: {pe}"
                break
        except Exception as e:
            last_err = str(e)
            if attempt < 1:
                time.sleep(2)
                continue
    return None, last_err


def main():
    if not API_KEY:
        print("DASHSCOPE_API_KEY is not set.")
        sys.exit(1)
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=60.0, max_retries=1)
    db = sqlite3.connect(DB_PATH, timeout=30.0)
    db.execute("PRAGMA busy_timeout=30000")
    db.row_factory = sqlite3.Row
    setup_db(db)
    claims = fetch_claims(db)
    print(f"Claims to match: {len(claims)}")
    if not claims:
        return
    print("Loading paragraph embeddings…")
    vectors, meta = get_all_embeddings(None)
    if vectors is None:
        print("No embeddings available, aborting.")
        return
    nan_mask = np.isnan(vectors).any(axis=1)
    if nan_mask.any():
        print(f" masked {nan_mask.sum()} NaN vectors")
        vectors[nan_mask] = 0
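    # Zeroed rows have dot product 0 with every claim vector, well below
    # MIN_SCORE, so NaN embeddings can never be selected as matches.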
print(f" {len(meta)} Vektoren geladen")
print(f"Embedde {len(claims)} Claims…")
c_vecs = []
for i in range(0, len(claims), EMBED_BATCH):
batch = claims[i:i + EMBED_BATCH]
texts = [c["claim_text"][:500] for c in batch]
try:
embs = embed_batch(client, texts)
except Exception as e:
print(f" Embedding-Batch {i // EMBED_BATCH + 1} fehler: {e}")
time.sleep(2)
embs = [None] * len(batch)
for emb in embs:
if emb is None:
c_vecs.append(None)
else:
v = np.array(emb, dtype=np.float32)
v /= np.linalg.norm(v) or 1
c_vecs.append(v)
budget = Budget(hard_limit_usd=HARD_BUDGET_USD)
rel_counts = {"belegt": 0, "widerspricht": 0, "erweitert": 0, "kein_bezug": 0}
skipped_low_score = 0
parse_failed = 0
    for i, (c, c_vec) in enumerate(zip(claims, c_vecs)):
        if budget.over():
            print(f"!! Budget ({budget.cost():.4f} USD) reached, stopping after {i} claims")
            break
        if c_vec is None:
            continue
        cand, score = best_candidate(c_vec, vectors, meta, c["podcast_id"], c["episode_id"], CROSS_PODCAST)
        if cand is None or score < MIN_SCORE:
            skipped_low_score += 1
            continue
        cand_row = db.execute(
            "SELECT text FROM paragraphs WHERE id=?", (cand["id"],)
        ).fetchone()
        if cand_row is None:  # paragraph no longer in the DB; skip defensively
            continue
        result, err = verify(client, c["claim_text"], cand_row["text"], budget)
        if result is None:
            parse_failed += 1
            continue
        rel = (result.get("relation") or "").lower()
        if rel not in rel_counts:
            rel = "kein_bezug"
        rel_counts[rel] += 1
        # Store kein_bezug rows too, so the claim counts as processed and a
        # re-run does not spend money on it again. The frontend filters
        # kein_bezug out.
        db.execute(
            "INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
            "target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
            (c["id"], cand["podcast_id"], cand["episode_id"], cand["idx"],
             rel, (result.get("reason") or "")[:500], score),
        )
        if (i + 1) % 20 == 0:
            db.commit()
            print(f" [{i+1}/{len(claims)}] belegt={rel_counts['belegt']} "
                  f"widerspricht={rel_counts['widerspricht']} erweitert={rel_counts['erweitert']} "
                  f"kein_bezug={rel_counts['kein_bezug']} skipped={skipped_low_score} "
                  f"cost=${budget.cost():.4f}")
        time.sleep(0.25)
    db.commit()
    print()
    print("=== Summary ===")
    for k, v in rel_counts.items():
        print(f" {k}: {v}")
    print(f" skipped (score<{MIN_SCORE}): {skipped_low_score}")
    print(f" parse failures: {parse_failed}")
    print(f" tokens in={budget.tokens_in} out={budget.tokens_out}")
    print(f" cost ~${budget.cost():.4f}")
    db.close()


if __name__ == "__main__":
    main()