#16/#17 match_answers.py und match_claims.py: Cross-Episode-Matching via Embeddings + Qwen
scripts/match_answers.py (#17): - Laedt offene Fragen (genuine, follow_up; answered='no'). - Embedded jede Frage und sucht den besten Kandidat-Absatz aus einer anderen Episode (optional cross-podcast) per Cosinus-Aehnlichkeit ueber die paragraph- embeddings. - Bei score >= 0.55: Qwen-Verifikation 'Beantwortet B die Frage in A?' (yes/partial/no), bei yes/partial wird answered + answered_by_* in der questions-Tabelle gesetzt. - Hard-Budget 1,50 USD, --rerun setzt bestehende Matches neu. scripts/match_claims.py (#16 Stufe 2): - Analoge Mechanik fuer claims: Embedding, Cosinus-Suche, Qwen-Verifikation in der vier-stufigen Skala 'belegt' / 'widerspricht' / 'erweitert' / 'kein_bezug'. - Schreibt Treffer (ohne 'kein_bezug') in neue Tabelle claim_matches. - Default nur verifizierbare Claims (--include-non-verifiable kippt das), --cross-podcast erlaubt Cross-Podcast-Treffer. Beide Skripte nutzen json_utils.parse_llm_json fuer robustes Parsing und sind gegen NaN-Vektoren in den Embeddings abgesichert. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6f53f35c09
commit
c5489eabaa
271
scripts/match_answers.py
Normal file
271
scripts/match_answers.py
Normal file
@ -0,0 +1,271 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""#17 Frage-Antwort-Asymmetrie: matche unbeantwortete Fragen gegen Antworten in anderen Episoden.
|
||||||
|
|
||||||
|
Vorgehen:
|
||||||
|
1. Lade Fragen mit answered='no' und question_type in ('genuine', 'follow_up').
|
||||||
|
2. Embedde jede Frage (text-embedding-v3) und suche per Cosinus-Aehnlichkeit den
|
||||||
|
besten Kandidaten in den vorhandenen paragraph-embeddings (cross-episode, optional
|
||||||
|
cross-podcast).
|
||||||
|
3. Bei score >= MIN_SCORE: Qwen-Verifikation "Beantwortet Absatz B die Frage in A?"
|
||||||
|
4. Wenn yes/partial: questions.answered + answered_by_podcast/episode/idx setzen.
|
||||||
|
|
||||||
|
Nutzung:
|
||||||
|
DASHSCOPE_API_KEY=... python3 match_answers.py [db-pfad] [limit]
|
||||||
|
|
||||||
|
Optionen:
|
||||||
|
--cross-podcast erlaubt Antworten in anderen Podcasts
|
||||||
|
--rerun Fragen erneut bearbeiten, auch wenn schon ein answered_by-Wert steht
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from json_utils import parse_llm_json
|
||||||
|
|
||||||
|
# Backend-Helfer wiederverwenden (Pfad hinzufuegen)
|
||||||
|
ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
sys.path.insert(0, str(ROOT / "backend"))
|
||||||
|
from database import get_all_embeddings # noqa: E402
|
||||||
|
|
||||||
|
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
|
||||||
|
LIMIT = int(sys.argv[2]) if len(sys.argv) > 2 and not sys.argv[2].startswith("--") else 500
|
||||||
|
CROSS_PODCAST = "--cross-podcast" in sys.argv
|
||||||
|
RERUN = "--rerun" in sys.argv
|
||||||
|
|
||||||
|
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
|
||||||
|
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
|
||||||
|
EMBED_MODEL = "text-embedding-v3"
|
||||||
|
LLM_MODEL = "qwen-plus"
|
||||||
|
EMBED_BATCH = 6
|
||||||
|
MIN_SCORE = 0.55
|
||||||
|
HARD_BUDGET_USD = 1.50
|
||||||
|
|
||||||
|
COST_IN = 0.0008 / 1000
|
||||||
|
COST_OUT = 0.002 / 1000
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """Du erhaeltst eine Frage aus einem Podcast und einen Kandidat-Absatz aus einer anderen Episode.
|
||||||
|
|
||||||
|
Pruefe: Beantwortet der Kandidat-Absatz die Frage konkret?
|
||||||
|
|
||||||
|
Antworte NUR mit JSON:
|
||||||
|
{"answers": "yes" | "partial" | "no", "reason": "1 Satz Begruendung"}
|
||||||
|
|
||||||
|
- "yes": der Absatz beantwortet die Frage direkt und vollstaendig.
|
||||||
|
- "partial": der Absatz nennt einen Teilaspekt oder eine verwandte Position, aber nicht die volle Antwort.
|
||||||
|
- "no": der Absatz beantwortet die Frage nicht (auch wenn er thematisch verwandt ist)."""
|
||||||
|
|
||||||
|
|
||||||
|
def setup_db(db):
|
||||||
|
db.executescript("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_questions_answered ON questions(podcast_id, answered);
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_open_questions(db):
|
||||||
|
where = "WHERE q.answered='no' AND q.question_type IN ('genuine','follow_up')"
|
||||||
|
if not RERUN:
|
||||||
|
where += " AND (q.answered_by_episode IS NULL OR q.answered_by_episode = '')"
|
||||||
|
rows = db.execute(f"""
|
||||||
|
SELECT q.id, q.podcast_id, q.episode_id, q.paragraph_idx, q.question_text, q.question_type,
|
||||||
|
p.text AS para_text, e.title AS ep_title, e.guest AS ep_guest
|
||||||
|
FROM questions q
|
||||||
|
JOIN paragraphs p ON q.podcast_id=p.podcast_id AND q.episode_id=p.episode_id AND q.paragraph_idx=p.idx
|
||||||
|
JOIN episodes e ON q.podcast_id=e.podcast_id AND q.episode_id=e.id
|
||||||
|
{where}
|
||||||
|
ORDER BY q.id
|
||||||
|
LIMIT ?
|
||||||
|
""", (LIMIT,)).fetchall()
|
||||||
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
def embed_batch(client, texts):
|
||||||
|
resp = client.embeddings.create(model=EMBED_MODEL, input=texts, dimensions=1024)
|
||||||
|
return [item.embedding for item in resp.data]
|
||||||
|
|
||||||
|
|
||||||
|
def best_candidate(q_vec, vectors, meta, exclude_podcast, exclude_episode, allow_cross_podcast):
|
||||||
|
scores = vectors @ q_vec
|
||||||
|
order = np.argsort(scores)[::-1]
|
||||||
|
for idx in order[:30]:
|
||||||
|
m = meta[idx]
|
||||||
|
if m["podcast_id"] == exclude_podcast and m["episode_id"] == exclude_episode:
|
||||||
|
continue
|
||||||
|
if not allow_cross_podcast and m["podcast_id"] != exclude_podcast:
|
||||||
|
continue
|
||||||
|
return m, float(scores[idx])
|
||||||
|
return None, 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class Budget:
|
||||||
|
def __init__(self, hard_limit_usd):
|
||||||
|
self.hard_limit = hard_limit_usd
|
||||||
|
self.tokens_in = 0
|
||||||
|
self.tokens_out = 0
|
||||||
|
|
||||||
|
def add(self, usage):
|
||||||
|
if usage:
|
||||||
|
self.tokens_in += getattr(usage, "prompt_tokens", 0) or 0
|
||||||
|
self.tokens_out += getattr(usage, "completion_tokens", 0) or 0
|
||||||
|
|
||||||
|
def cost(self):
|
||||||
|
return self.tokens_in * COST_IN + self.tokens_out * COST_OUT
|
||||||
|
|
||||||
|
def over(self):
|
||||||
|
return self.cost() > self.hard_limit
|
||||||
|
|
||||||
|
|
||||||
|
def verify(client, question_text, candidate_text, budget):
|
||||||
|
user_msg = (
|
||||||
|
f"FRAGE:\n\"{question_text}\"\n\n"
|
||||||
|
f"KANDIDAT-ABSATZ:\n\"{candidate_text[:1000]}\""
|
||||||
|
)
|
||||||
|
last_err = None
|
||||||
|
for attempt in range(2):
|
||||||
|
try:
|
||||||
|
resp = client.chat.completions.create(
|
||||||
|
model=LLM_MODEL,
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": SYSTEM_PROMPT},
|
||||||
|
{"role": "user", "content": user_msg},
|
||||||
|
],
|
||||||
|
temperature=0.0,
|
||||||
|
max_tokens=200,
|
||||||
|
)
|
||||||
|
budget.add(getattr(resp, "usage", None))
|
||||||
|
content = resp.choices[0].message.content
|
||||||
|
try:
|
||||||
|
return parse_llm_json(content, expect="object"), None
|
||||||
|
except ValueError as pe:
|
||||||
|
last_err = f"parse: {pe}"
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
last_err = str(e)
|
||||||
|
if attempt < 1:
|
||||||
|
time.sleep(2)
|
||||||
|
continue
|
||||||
|
return None, last_err
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
if not API_KEY:
|
||||||
|
print("DASHSCOPE_API_KEY nicht gesetzt.")
|
||||||
|
sys.exit(1)
|
||||||
|
client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=60.0, max_retries=1)
|
||||||
|
|
||||||
|
db = sqlite3.connect(DB_PATH, timeout=30.0)
|
||||||
|
db.execute("PRAGMA busy_timeout=30000")
|
||||||
|
db.row_factory = sqlite3.Row
|
||||||
|
setup_db(db)
|
||||||
|
|
||||||
|
questions = fetch_open_questions(db)
|
||||||
|
print(f"Fragen zu matchen: {len(questions)}")
|
||||||
|
if not questions:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Lade alle paragraph-embeddings
|
||||||
|
print("Lade paragraph-embeddings…")
|
||||||
|
vectors, meta = get_all_embeddings(None)
|
||||||
|
if vectors is None:
|
||||||
|
print("Keine Embeddings vorhanden — abbruch.")
|
||||||
|
return
|
||||||
|
nan_mask = np.isnan(vectors).any(axis=1)
|
||||||
|
if nan_mask.any():
|
||||||
|
print(f" {nan_mask.sum()} NaN-Vektoren maskiert")
|
||||||
|
vectors[nan_mask] = 0
|
||||||
|
print(f" {len(meta)} Vektoren geladen")
|
||||||
|
|
||||||
|
# Embedde Fragen in Batches
|
||||||
|
print(f"Embedde {len(questions)} Fragen…")
|
||||||
|
q_vecs = []
|
||||||
|
for i in range(0, len(questions), EMBED_BATCH):
|
||||||
|
batch = questions[i:i + EMBED_BATCH]
|
||||||
|
texts = [q["question_text"][:500] for q in batch]
|
||||||
|
try:
|
||||||
|
embs = embed_batch(client, texts)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Embedding-Batch {i // EMBED_BATCH + 1} fehler: {e}")
|
||||||
|
time.sleep(2)
|
||||||
|
embs = [None] * len(batch)
|
||||||
|
for q, emb in zip(batch, embs):
|
||||||
|
if emb is None:
|
||||||
|
q_vecs.append(None)
|
||||||
|
else:
|
||||||
|
v = np.array(emb, dtype=np.float32)
|
||||||
|
v /= np.linalg.norm(v) or 1
|
||||||
|
q_vecs.append(v)
|
||||||
|
print(f" {sum(1 for v in q_vecs if v is not None)} Frage-Embeddings ok")
|
||||||
|
|
||||||
|
budget = Budget(hard_limit_usd=HARD_BUDGET_USD)
|
||||||
|
|
||||||
|
matched = 0
|
||||||
|
no_answer = 0
|
||||||
|
skipped_low_score = 0
|
||||||
|
parse_failed = 0
|
||||||
|
|
||||||
|
for i, (q, q_vec) in enumerate(zip(questions, q_vecs)):
|
||||||
|
if budget.over():
|
||||||
|
print(f"!! Budget ({budget.cost():.4f} USD) erreicht — Abbruch nach {i} Fragen")
|
||||||
|
break
|
||||||
|
if q_vec is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
cand, score = best_candidate(q_vec, vectors, meta, q["podcast_id"], q["episode_id"], CROSS_PODCAST)
|
||||||
|
if cand is None or score < MIN_SCORE:
|
||||||
|
skipped_low_score += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
cand_text = db.execute(
|
||||||
|
"SELECT text FROM paragraphs WHERE id=?", (cand["id"],)
|
||||||
|
).fetchone()["text"]
|
||||||
|
|
||||||
|
result, err = verify(client, q["question_text"], cand_text, budget)
|
||||||
|
if result is None:
|
||||||
|
parse_failed += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
ans = (result.get("answers") or "").lower()
|
||||||
|
if ans in ("yes", "partial"):
|
||||||
|
db.execute(
|
||||||
|
"UPDATE questions SET answered=?, answered_by_podcast=?, "
|
||||||
|
"answered_by_episode=?, answered_by_idx=? WHERE id=?",
|
||||||
|
(ans, cand["podcast_id"], cand["episode_id"], cand["idx"], q["id"]),
|
||||||
|
)
|
||||||
|
matched += 1
|
||||||
|
else:
|
||||||
|
no_answer += 1
|
||||||
|
|
||||||
|
if (i + 1) % 20 == 0:
|
||||||
|
db.commit()
|
||||||
|
print(f" [{i+1}/{len(questions)}] matched={matched} no_answer={no_answer} "
|
||||||
|
f"skipped={skipped_low_score} parse_err={parse_failed} cost=${budget.cost():.4f}")
|
||||||
|
time.sleep(0.25)
|
||||||
|
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("=== Zusammenfassung ===")
|
||||||
|
print(f" matched (yes/partial): {matched}")
|
||||||
|
print(f" no_answer: {no_answer}")
|
||||||
|
print(f" skipped (score<{MIN_SCORE}): {skipped_low_score}")
|
||||||
|
print(f" parse-failures: {parse_failed}")
|
||||||
|
print(f" Tokens in={budget.tokens_in} out={budget.tokens_out}")
|
||||||
|
print(f" Kosten ~${budget.cost():.4f}")
|
||||||
|
|
||||||
|
# Verteilung nach run
|
||||||
|
stats = db.execute("SELECT answered, COUNT(*) FROM questions GROUP BY answered").fetchall()
|
||||||
|
print(" questions.answered nach Lauf:")
|
||||||
|
for s in stats:
|
||||||
|
print(f" {s[0]}: {s[1]}")
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
270
scripts/match_claims.py
Normal file
270
scripts/match_claims.py
Normal file
@ -0,0 +1,270 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""#16 Claim-Verification Stufe 2: matche Claims gegen Bestaetigung/Widerspruch in anderen Episoden.
|
||||||
|
|
||||||
|
Vorgehen:
|
||||||
|
1. Lade Claims (default verifiable=1, optional alle).
|
||||||
|
2. Embedde jeden Claim-Text und suche per Cosinus den besten Kandidaten in anderen Episoden.
|
||||||
|
3. Bei score >= MIN_SCORE: Qwen-Verifikation: 'belegt' / 'widerspricht' / 'erweitert' / 'kein_bezug'.
|
||||||
|
4. Schreibe Treffer in neue Tabelle claim_matches.
|
||||||
|
|
||||||
|
Nutzung:
|
||||||
|
DASHSCOPE_API_KEY=... python3 match_claims.py [db-pfad] [limit]
|
||||||
|
|
||||||
|
Optionen:
|
||||||
|
--cross-podcast erlaubt Treffer in anderen Podcasts
|
||||||
|
--include-non-verifiable matche auch Claims, die nicht als verifizierbar markiert sind
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from json_utils import parse_llm_json
|
||||||
|
|
||||||
|
ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
sys.path.insert(0, str(ROOT / "backend"))
|
||||||
|
from database import get_all_embeddings # noqa: E402
|
||||||
|
|
||||||
|
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
|
||||||
|
LIMIT = int(sys.argv[2]) if len(sys.argv) > 2 and not sys.argv[2].startswith("--") else 500
|
||||||
|
CROSS_PODCAST = "--cross-podcast" in sys.argv
|
||||||
|
INCLUDE_NON_VERIFIABLE = "--include-non-verifiable" in sys.argv
|
||||||
|
|
||||||
|
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
|
||||||
|
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
|
||||||
|
EMBED_MODEL = "text-embedding-v3"
|
||||||
|
LLM_MODEL = "qwen-plus"
|
||||||
|
EMBED_BATCH = 6
|
||||||
|
MIN_SCORE = 0.55
|
||||||
|
HARD_BUDGET_USD = 1.50
|
||||||
|
|
||||||
|
COST_IN = 0.0008 / 1000
|
||||||
|
COST_OUT = 0.002 / 1000
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """Du erhaeltst eine prueffbare Behauptung aus einem Podcast und einen Kandidat-Absatz aus einer anderen Episode.
|
||||||
|
|
||||||
|
Pruefe die logische Beziehung des Kandidaten zur Behauptung. Antworte NUR mit JSON:
|
||||||
|
{"relation": "belegt" | "widerspricht" | "erweitert" | "kein_bezug", "reason": "1 Satz Begruendung"}
|
||||||
|
|
||||||
|
- "belegt": der Kandidat liefert Evidenz/Daten, die die Behauptung stuetzen.
|
||||||
|
- "widerspricht": der Kandidat widerspricht der Behauptung oder nennt Gegenargumente.
|
||||||
|
- "erweitert": der Kandidat ergaenzt die Behauptung um Aspekte, ohne ihr zuzustimmen oder zu widersprechen.
|
||||||
|
- "kein_bezug": kein logischer Bezug, auch wenn das Thema verwandt ist."""
|
||||||
|
|
||||||
|
|
||||||
|
def setup_db(db):
|
||||||
|
db.executescript("""
|
||||||
|
CREATE TABLE IF NOT EXISTS claim_matches (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
claim_id INTEGER NOT NULL,
|
||||||
|
target_podcast TEXT NOT NULL,
|
||||||
|
target_episode TEXT NOT NULL,
|
||||||
|
target_idx INTEGER NOT NULL,
|
||||||
|
relation TEXT NOT NULL,
|
||||||
|
reason TEXT,
|
||||||
|
score REAL,
|
||||||
|
FOREIGN KEY (claim_id) REFERENCES claims(id)
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_claim_matches ON claim_matches(claim_id);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_claim_matches_relation ON claim_matches(relation);
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_claims(db):
|
||||||
|
where = "WHERE 1=1"
|
||||||
|
if not INCLUDE_NON_VERIFIABLE:
|
||||||
|
where += " AND c.verifiable = 1"
|
||||||
|
where += " AND NOT EXISTS (SELECT 1 FROM claim_matches m WHERE m.claim_id = c.id)"
|
||||||
|
rows = db.execute(f"""
|
||||||
|
SELECT c.id, c.podcast_id, c.episode_id, c.paragraph_idx, c.claim_text, c.claim_type
|
||||||
|
FROM claims c
|
||||||
|
{where}
|
||||||
|
ORDER BY c.id
|
||||||
|
LIMIT ?
|
||||||
|
""", (LIMIT,)).fetchall()
|
||||||
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
def embed_batch(client, texts):
|
||||||
|
resp = client.embeddings.create(model=EMBED_MODEL, input=texts, dimensions=1024)
|
||||||
|
return [item.embedding for item in resp.data]
|
||||||
|
|
||||||
|
|
||||||
|
def best_candidate(c_vec, vectors, meta, exclude_podcast, exclude_episode, allow_cross):
|
||||||
|
scores = vectors @ c_vec
|
||||||
|
order = np.argsort(scores)[::-1]
|
||||||
|
for idx in order[:30]:
|
||||||
|
m = meta[idx]
|
||||||
|
if m["podcast_id"] == exclude_podcast and m["episode_id"] == exclude_episode:
|
||||||
|
continue
|
||||||
|
if not allow_cross and m["podcast_id"] != exclude_podcast:
|
||||||
|
continue
|
||||||
|
return m, float(scores[idx])
|
||||||
|
return None, 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class Budget:
|
||||||
|
def __init__(self, hard_limit_usd):
|
||||||
|
self.hard_limit = hard_limit_usd
|
||||||
|
self.tokens_in = 0
|
||||||
|
self.tokens_out = 0
|
||||||
|
|
||||||
|
def add(self, usage):
|
||||||
|
if usage:
|
||||||
|
self.tokens_in += getattr(usage, "prompt_tokens", 0) or 0
|
||||||
|
self.tokens_out += getattr(usage, "completion_tokens", 0) or 0
|
||||||
|
|
||||||
|
def cost(self):
|
||||||
|
return self.tokens_in * COST_IN + self.tokens_out * COST_OUT
|
||||||
|
|
||||||
|
def over(self):
|
||||||
|
return self.cost() > self.hard_limit
|
||||||
|
|
||||||
|
|
||||||
|
def verify(client, claim_text, candidate_text, budget):
|
||||||
|
user_msg = (
|
||||||
|
f"BEHAUPTUNG:\n\"{claim_text}\"\n\n"
|
||||||
|
f"KANDIDAT-ABSATZ:\n\"{candidate_text[:1000]}\""
|
||||||
|
)
|
||||||
|
last_err = None
|
||||||
|
for attempt in range(2):
|
||||||
|
try:
|
||||||
|
resp = client.chat.completions.create(
|
||||||
|
model=LLM_MODEL,
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": SYSTEM_PROMPT},
|
||||||
|
{"role": "user", "content": user_msg},
|
||||||
|
],
|
||||||
|
temperature=0.0,
|
||||||
|
max_tokens=200,
|
||||||
|
)
|
||||||
|
budget.add(getattr(resp, "usage", None))
|
||||||
|
content = resp.choices[0].message.content
|
||||||
|
try:
|
||||||
|
return parse_llm_json(content, expect="object"), None
|
||||||
|
except ValueError as pe:
|
||||||
|
last_err = f"parse: {pe}"
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
last_err = str(e)
|
||||||
|
if attempt < 1:
|
||||||
|
time.sleep(2)
|
||||||
|
continue
|
||||||
|
return None, last_err
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
if not API_KEY:
|
||||||
|
print("DASHSCOPE_API_KEY nicht gesetzt.")
|
||||||
|
sys.exit(1)
|
||||||
|
client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=60.0, max_retries=1)
|
||||||
|
|
||||||
|
db = sqlite3.connect(DB_PATH, timeout=30.0)
|
||||||
|
db.execute("PRAGMA busy_timeout=30000")
|
||||||
|
db.row_factory = sqlite3.Row
|
||||||
|
setup_db(db)
|
||||||
|
|
||||||
|
claims = fetch_claims(db)
|
||||||
|
print(f"Claims zu matchen: {len(claims)}")
|
||||||
|
if not claims:
|
||||||
|
return
|
||||||
|
|
||||||
|
print("Lade paragraph-embeddings…")
|
||||||
|
vectors, meta = get_all_embeddings(None)
|
||||||
|
if vectors is None:
|
||||||
|
print("Keine Embeddings vorhanden — abbruch.")
|
||||||
|
return
|
||||||
|
nan_mask = np.isnan(vectors).any(axis=1)
|
||||||
|
if nan_mask.any():
|
||||||
|
print(f" {nan_mask.sum()} NaN-Vektoren maskiert")
|
||||||
|
vectors[nan_mask] = 0
|
||||||
|
print(f" {len(meta)} Vektoren geladen")
|
||||||
|
|
||||||
|
print(f"Embedde {len(claims)} Claims…")
|
||||||
|
c_vecs = []
|
||||||
|
for i in range(0, len(claims), EMBED_BATCH):
|
||||||
|
batch = claims[i:i + EMBED_BATCH]
|
||||||
|
texts = [c["claim_text"][:500] for c in batch]
|
||||||
|
try:
|
||||||
|
embs = embed_batch(client, texts)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Embedding-Batch {i // EMBED_BATCH + 1} fehler: {e}")
|
||||||
|
time.sleep(2)
|
||||||
|
embs = [None] * len(batch)
|
||||||
|
for emb in embs:
|
||||||
|
if emb is None:
|
||||||
|
c_vecs.append(None)
|
||||||
|
else:
|
||||||
|
v = np.array(emb, dtype=np.float32)
|
||||||
|
v /= np.linalg.norm(v) or 1
|
||||||
|
c_vecs.append(v)
|
||||||
|
|
||||||
|
budget = Budget(hard_limit_usd=HARD_BUDGET_USD)
|
||||||
|
rel_counts = {"belegt": 0, "widerspricht": 0, "erweitert": 0, "kein_bezug": 0}
|
||||||
|
skipped_low_score = 0
|
||||||
|
parse_failed = 0
|
||||||
|
|
||||||
|
for i, (c, c_vec) in enumerate(zip(claims, c_vecs)):
|
||||||
|
if budget.over():
|
||||||
|
print(f"!! Budget ({budget.cost():.4f} USD) erreicht — Abbruch nach {i}")
|
||||||
|
break
|
||||||
|
if c_vec is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
cand, score = best_candidate(c_vec, vectors, meta, c["podcast_id"], c["episode_id"], CROSS_PODCAST)
|
||||||
|
if cand is None or score < MIN_SCORE:
|
||||||
|
skipped_low_score += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
cand_text = db.execute(
|
||||||
|
"SELECT text FROM paragraphs WHERE id=?", (cand["id"],)
|
||||||
|
).fetchone()["text"]
|
||||||
|
|
||||||
|
result, err = verify(client, c["claim_text"], cand_text, budget)
|
||||||
|
if result is None:
|
||||||
|
parse_failed += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
rel = (result.get("relation") or "").lower()
|
||||||
|
if rel not in rel_counts:
|
||||||
|
rel = "kein_bezug"
|
||||||
|
rel_counts[rel] += 1
|
||||||
|
|
||||||
|
if rel != "kein_bezug":
|
||||||
|
db.execute(
|
||||||
|
"INSERT INTO claim_matches (claim_id, target_podcast, target_episode, "
|
||||||
|
"target_idx, relation, reason, score) VALUES (?, ?, ?, ?, ?, ?, ?)",
|
||||||
|
(c["id"], cand["podcast_id"], cand["episode_id"], cand["idx"],
|
||||||
|
rel, (result.get("reason") or "")[:500], score),
|
||||||
|
)
|
||||||
|
|
||||||
|
if (i + 1) % 20 == 0:
|
||||||
|
db.commit()
|
||||||
|
print(f" [{i+1}/{len(claims)}] belegt={rel_counts['belegt']} "
|
||||||
|
f"widerspricht={rel_counts['widerspricht']} erweitert={rel_counts['erweitert']} "
|
||||||
|
f"kein_bezug={rel_counts['kein_bezug']} skipped={skipped_low_score} "
|
||||||
|
f"cost=${budget.cost():.4f}")
|
||||||
|
time.sleep(0.25)
|
||||||
|
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("=== Zusammenfassung ===")
|
||||||
|
for k, v in rel_counts.items():
|
||||||
|
print(f" {k}: {v}")
|
||||||
|
print(f" skipped (score<{MIN_SCORE}): {skipped_low_score}")
|
||||||
|
print(f" parse-failures: {parse_failed}")
|
||||||
|
print(f" Tokens in={budget.tokens_in} out={budget.tokens_out}")
|
||||||
|
print(f" Kosten ~${budget.cost():.4f}")
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Reference in New Issue
Block a user