#!/usr/bin/env python3
"""#13 Argument-chain tracker: classify logical relations between semantically similar paragraphs.

Takes the top-N semantic_links and has Qwen classify each pair's relation:
erweitert, widerspricht, belegt, relativiert, gleicher_punkt, kein_bezug.

Usage:
    DASHSCOPE_API_KEY=... python3 analyse_arguments.py [db-path] [limit]
"""

import json
import os
import sqlite3
import sys
import time

from openai import OpenAI

# CLI arguments: SQLite path and how many top-scoring pairs to classify.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
LIMIT = int(sys.argv[2]) if len(sys.argv) > 2 else 500

# Qwen is reached through DashScope's OpenAI-compatible endpoint.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-plus"

# The prompt stays in German because the transcripts are German; the relation
# labels double as stored identifiers in argument_links.
SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst zwei Textabschnitte aus Podcast-Transkripten.
Klassifiziere die logische Relation zwischen ihnen. Antworte NUR mit einem JSON-Objekt:

{"relation": "...", "confidence": 0.0-1.0, "explanation": "Ein Satz Begründung"}

Mögliche Relationen:
- "erweitert": B baut auf A auf, ergänzt, vertieft
- "widerspricht": B widerspricht A, nennt Gegenargument
- "belegt": B liefert Evidenz/Daten für A's These
- "relativiert": B schränkt A ein, nennt Ausnahmen/Bedingungen
- "gleicher_punkt": A und B sagen im Kern dasselbe
- "kein_bezug": Trotz thematischer Nähe kein logischer Bezug"""


def classify_pair(client, text_a, meta_a, text_b, meta_b):
    """Ask the model for the logical relation from paragraph A to paragraph B."""
    user_msg = f"""Absatz A ({meta_a}):
"{text_a}"

Absatz B ({meta_b}):
"{text_b}"

Welche logische Relation besteht von A zu B?"""

    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            temperature=0.1,
            max_tokens=150,
        )
        content = resp.choices[0].message.content.strip()
        # The model sometimes wraps its JSON in a markdown fence; strip it.
        if content.startswith("```"):
            content = content.split("```")[1].strip()
            if content.startswith("json"):
                content = content[4:].strip()
        return json.loads(content)
    except Exception as e:
        # API errors and unparseable replies are recorded rather than fatal.
        return {"relation": "error", "confidence": 0, "explanation": str(e)}
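
# Sketch of a standalone call (hypothetical texts, assuming a client as in main()):
#   client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
#   classify_pair(client,
#                 "Hohe Zinsen bremsen Investitionen.", "ep1: Geldpolitik",
#                 "2023 sind Investitionen trotz hoher Zinsen gestiegen.", "ep2: Konjunktur")
#   -> {"relation": "widerspricht", "confidence": ..., "explanation": "..."}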


def main():
    if not API_KEY:
        print("DASHSCOPE_API_KEY is not set.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row

    # Create output table
    db.executescript("""
        CREATE TABLE IF NOT EXISTS argument_links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_podcast TEXT, source_episode TEXT, source_idx INTEGER,
            target_podcast TEXT, target_episode TEXT, target_idx INTEGER,
            relation TEXT, confidence REAL, explanation TEXT, score REAL
        );
        CREATE INDEX IF NOT EXISTS idx_arglinks ON argument_links(relation);
    """)

    # Top-scoring semantic links between different episodes (cross-podcast pairs included)
    rows = db.execute("""
        SELECT sl.podcast_id, sl.source_episode, sl.source_idx,
               sl.target_podcast, sl.target_episode, sl.target_idx, sl.score,
               p1.text AS source_text, p2.text AS target_text,
               e1.title AS source_title, e1.guest AS source_guest,
               e2.title AS target_title, e2.guest AS target_guest
        FROM semantic_links sl
        JOIN paragraphs p1 ON sl.podcast_id = p1.podcast_id AND sl.source_episode = p1.episode_id AND sl.source_idx = p1.idx
        JOIN paragraphs p2 ON sl.target_podcast = p2.podcast_id AND sl.target_episode = p2.episode_id AND sl.target_idx = p2.idx
        JOIN episodes e1 ON sl.podcast_id = e1.podcast_id AND sl.source_episode = e1.id
        JOIN episodes e2 ON sl.target_podcast = e2.podcast_id AND sl.target_episode = e2.id
        WHERE sl.source_episode != sl.target_episode
        ORDER BY sl.score DESC
        LIMIT ?
    """, (LIMIT,)).fetchall()

    print(f"Classifying {len(rows)} pairs with {MODEL}…")

    # Already-processed pairs are skipped so the script can resume after an
    # interruption; '|' separators keep the concatenated keys unambiguous.
    existing = set()
    try:
        for r in db.execute(
            "SELECT source_podcast||'|'||source_episode||'|'||source_idx||'|'||"
            "target_podcast||'|'||target_episode||'|'||target_idx AS k FROM argument_links"
        ).fetchall():
            existing.add(r["k"])
    except Exception:
        pass

    processed = 0
    skipped = 0

    for row in rows:
        key = (f"{row['podcast_id']}|{row['source_episode']}|{row['source_idx']}|"
               f"{row['target_podcast']}|{row['target_episode']}|{row['target_idx']}")
        if key in existing:
            skipped += 1
            continue

        meta_a = f"{row['source_episode']}: {row['source_title']} — {row['source_guest']}"
        meta_b = f"{row['target_episode']}: {row['target_title']} — {row['target_guest']}"

        # Truncate long paragraphs to keep the prompt compact.
        result = classify_pair(
            client,
            row["source_text"][:800], meta_a,
            row["target_text"][:800], meta_b,
        )

        db.execute(
            "INSERT INTO argument_links (source_podcast, source_episode, source_idx, "
            "target_podcast, target_episode, target_idx, relation, confidence, explanation, score) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (row["podcast_id"], row["source_episode"], row["source_idx"],
             row["target_podcast"], row["target_episode"], row["target_idx"],
             result.get("relation", "error"), result.get("confidence", 0),
             result.get("explanation", ""), row["score"]),
        )

        processed += 1
        if processed % 10 == 0:
            db.commit()
            print(f"  {processed}/{len(rows) - skipped} classified…")

        # Rate limiting
        time.sleep(0.3)

    db.commit()

    # Stats
    stats = db.execute(
        "SELECT relation, COUNT(*) AS c FROM argument_links GROUP BY relation ORDER BY c DESC"
    ).fetchall()
    print(f"\nDone: {processed} new, {skipped} skipped.")
    print("Distribution:")
    for s in stats:
        print(f"  {s['relation']}: {s['c']}")

    db.close()


if __name__ == "__main__":
    main()