#13/#16/#17/#18 Qwen-Analyse-Scripts - Frontend: Wort-Level-Highlighting im Transkript — jedes Wort als <span> mit Timestamp, Karaoke-Style Sync bei Wiedergabe, CSS word-active/word-spoken - API: /api/.../words Endpoint liefert Wort-Timestamps - #14 detect_gaps.py: K-Means-Clustering über 3727 Embeddings, identifiziert Leerstellen (Themen die in einem Podcast fehlen). Ergebnis: gaps_analysis.json - #15 detect_narrative_shift.py: Embedding-Drift pro Thema über Episodenfolge, erkennt Framing-Wechsel. Ergebnis: narrative_shifts.json - #13 analyse_arguments.py: Qwen klassifiziert logische Relationen (erweitert, widerspricht, belegt, relativiert) zwischen semantisch ähnlichen Absätzen - #16 extract_claims.py: Qwen extrahiert prüfbare Behauptungen (Zahlen, Statistiken) - #17 extract_questions.py: Qwen extrahiert und klassifiziert Fragen - #18 curate_debates.py: Qwen kuratiert Cross-Podcast-Gegenüberstellungen - run_all_qwen.sh: Sequentielle Pipeline für alle Qwen-Tasks (vermeidet DB-Locks) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e678f75ee1
commit
78d66bef21
166
scripts/analyse_arguments.py
Normal file
166
scripts/analyse_arguments.py
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
#!/usr/bin/env python3
"""#13 Argument-chain tracker: classify logical relations between semantically similar paragraphs.

Takes the top-N rows of ``semantic_links`` and has Qwen classify the relation:
erweitert, widerspricht, belegt, relativiert, gleicher_punkt, kein_bezug.

Usage:
    DASHSCOPE_API_KEY=... python3 analyse_arguments.py [db-path] [limit]
"""

import json
import os
import sqlite3
import sys
import time

from openai import OpenAI

# CLI arguments: SQLite database path and how many link pairs to classify.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
LIMIT = int(sys.argv[2]) if len(sys.argv) > 2 else 500

# DashScope's OpenAI-compatible endpoint; the key comes from the environment.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-plus"

# Prompt is German because the transcripts and the desired labels are German.
SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst zwei Textabschnitte aus Podcast-Transkripten.
Klassifiziere die logische Relation zwischen ihnen. Antworte NUR mit einem JSON-Objekt:

{"relation": "...", "confidence": 0.0-1.0, "explanation": "Ein Satz Begründung"}

Mögliche Relationen:
- "erweitert": B baut auf A auf, ergänzt, vertieft
- "widerspricht": B widerspricht A, nennt Gegenargument
- "belegt": B liefert Evidenz/Daten für A's These
- "relativiert": B schränkt A ein, nennt Ausnahmen/Bedingungen
- "gleicher_punkt": A und B sagen im Kern dasselbe
- "kein_bezug": Trotz thematischer Nähe kein logischer Bezug"""
|
||||||
|
|
||||||
|
|
||||||
|
def classify_pair(client, text_a, meta_a, text_b, meta_b):
    """Ask the model which logical relation holds from paragraph A to B.

    Returns the model's parsed JSON dict; on any API or parse failure a
    fallback dict with ``relation == "error"`` is returned instead of raising,
    so the caller's batch loop keeps going.
    """
    prompt = f"""Absatz A ({meta_a}):
"{text_a}"

Absatz B ({meta_b}):
"{text_b}"

Welche logische Relation besteht von A zu B?"""

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,
            max_tokens=150,
        )
        raw = response.choices[0].message.content.strip()
        # The model sometimes wraps its JSON in a ``` fence; unwrap before parsing.
        if raw.startswith("```"):
            raw = raw.split("```")[1].strip()
            if raw.startswith("json"):
                raw = raw[4:].strip()
        return json.loads(raw)
    except Exception as exc:
        return {"relation": "error", "confidence": 0, "explanation": str(exc)}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Classify the strongest semantic links and persist them as argument_links.

    Reads the top-LIMIT cross-episode pairs from ``semantic_links``, asks the
    model for a relation per pair (resumable: already-stored pairs are
    skipped), commits every 10 rows, and prints a relation histogram.
    Exits with status 1 when no API key is configured.
    """
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row

    # Create output table (idempotent).
    db.executescript("""
    CREATE TABLE IF NOT EXISTS argument_links (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_podcast TEXT, source_episode TEXT, source_idx INTEGER,
        target_podcast TEXT, target_episode TEXT, target_idx INTEGER,
        relation TEXT, confidence REAL, explanation TEXT, score REAL
    );
    CREATE INDEX IF NOT EXISTS idx_arglinks ON argument_links(relation);
    """)

    # Get top semantic links (cross-episode only), strongest first.
    rows = db.execute("""
    SELECT sl.podcast_id, sl.source_episode, sl.source_idx,
    sl.target_podcast, sl.target_episode, sl.target_idx, sl.score,
    p1.text as source_text, p2.text as target_text,
    e1.title as source_title, e1.guest as source_guest,
    e2.title as target_title, e2.guest as target_guest
    FROM semantic_links sl
    JOIN paragraphs p1 ON sl.podcast_id = p1.podcast_id AND sl.source_episode = p1.episode_id AND sl.source_idx = p1.idx
    JOIN paragraphs p2 ON sl.target_podcast = p2.podcast_id AND sl.target_episode = p2.episode_id AND sl.target_idx = p2.idx
    JOIN episodes e1 ON sl.podcast_id = e1.podcast_id AND sl.source_episode = e1.id
    JOIN episodes e2 ON sl.target_podcast = e2.podcast_id AND sl.target_episode = e2.id
    WHERE sl.source_episode != sl.target_episode
    ORDER BY sl.score DESC
    LIMIT ?
    """, (LIMIT,)).fetchall()

    print(f"Klassifiziere {len(rows)} Paare mit {MODEL}…")

    # Resume support: collect keys of already-classified pairs.
    # BUGFIX: join the key parts with '/' — bare concatenation is ambiguous
    # (e.g. episode "1" + idx "12" collides with episode "11" + idx "2") and
    # could wrongly skip unprocessed pairs. The SQL side and the Python side
    # below use the same separator, so resumption stays consistent.
    existing = set()
    try:
        for r in db.execute(
            "SELECT source_podcast||'/'||source_episode||'/'||source_idx||'/'||"
            "target_podcast||'/'||target_episode||'/'||target_idx as k FROM argument_links"
        ).fetchall():
            existing.add(r["k"])
    except Exception:
        pass  # table may be empty/missing on first run; nothing to resume

    processed = 0
    skipped = 0

    for row in rows:
        key = (f"{row['podcast_id']}/{row['source_episode']}/{row['source_idx']}/"
               f"{row['target_podcast']}/{row['target_episode']}/{row['target_idx']}")
        if key in existing:
            skipped += 1
            continue

        meta_a = f"{row['source_episode']}: {row['source_title']} — {row['source_guest']}"
        meta_b = f"{row['target_episode']}: {row['target_title']} — {row['target_guest']}"

        # Truncate paragraphs to keep the prompt (and cost) bounded.
        result = classify_pair(
            client,
            row["source_text"][:800], meta_a,
            row["target_text"][:800], meta_b
        )

        db.execute(
            "INSERT INTO argument_links (source_podcast, source_episode, source_idx, "
            "target_podcast, target_episode, target_idx, relation, confidence, explanation, score) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (row["podcast_id"], row["source_episode"], row["source_idx"],
             row["target_podcast"], row["target_episode"], row["target_idx"],
             result.get("relation", "error"), result.get("confidence", 0),
             result.get("explanation", ""), row["score"])
        )

        processed += 1
        if processed % 10 == 0:
            db.commit()  # periodic commit so progress survives interruption
            print(f" {processed}/{len(rows) - skipped} klassifiziert…")

        # Rate limiting between API calls.
        time.sleep(0.3)

    db.commit()

    # Print distribution of classified relations.
    stats = db.execute("SELECT relation, COUNT(*) as c FROM argument_links GROUP BY relation ORDER BY c DESC").fetchall()
    print(f"\nFertig: {processed} neue, {skipped} übersprungen.")
    print("Verteilung:")
    for s in stats:
        print(f" {s['relation']}: {s['c']}")

    db.close()


if __name__ == "__main__":
    main()
|
||||||
154
scripts/curate_debates.py
Normal file
154
scripts/curate_debates.py
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
#!/usr/bin/env python3
"""#18 Cross-podcast debate: curate side-by-side comparisons on shared topics.

Takes the strongest cross-podcast pairs and has Qwen summarize agreements
and divergences.

Usage:
    DASHSCOPE_API_KEY=... python3 curate_debates.py [db-path] [limit]
"""

import json
import os
import sqlite3
import sys
import time

from openai import OpenAI

# CLI arguments: SQLite database path and how many pairs to curate.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
LIMIT = int(sys.argv[2]) if len(sys.argv) > 2 else 100
# DashScope's OpenAI-compatible endpoint; the key comes from the environment.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-plus"

# Prompt is German because transcripts and desired output are German.
SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst zwei Textabschnitte aus VERSCHIEDENEN Podcasts, die dasselbe Thema behandeln.

Erstelle eine kurze Gegenüberstellung. Antworte NUR mit JSON:

{
"topic": "Das gemeinsame Thema in 3-5 Wörtern",
"agreement": "Worin stimmen beide überein? (1-2 Sätze)",
"divergence": "Worin unterscheiden sie sich? (1-2 Sätze, oder 'keine wesentliche Divergenz')",
"insight": "Was lernt man durch die Gegenüberstellung, das man aus keinem der beiden allein lernen würde? (1 Satz)"
}"""
|
||||||
|
|
||||||
|
|
||||||
|
def curate_pair(client, text_a, meta_a, text_b, meta_b):
    """Ask the model for an agreement/divergence summary of two excerpts.

    Returns the model's parsed JSON dict; on API or parse failure a fallback
    dict with ``topic == "error"`` is returned so the caller's loop continues.
    """
    prompt = f"""Podcast A — {meta_a}:
"{text_a}"

Podcast B — {meta_b}:
"{text_b}"

Erstelle die Gegenüberstellung."""

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
            max_tokens=300,
        )
        raw = response.choices[0].message.content.strip()
        # Unwrap an optional ``` / ```json fence before JSON parsing.
        if raw.startswith("```"):
            raw = raw.split("```")[1].strip()
            if raw.startswith("json"):
                raw = raw[4:].strip()
        return json.loads(raw)
    except Exception as exc:
        return {"topic": "error", "agreement": "", "divergence": "", "insight": str(exc)}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Curate cross-podcast debates from the strongest semantic links.

    Reads the top-LIMIT links joining DIFFERENT podcasts, asks the model for
    an agreement/divergence summary per pair (resumable), stores rows in the
    ``debates`` table, and prints the most frequent topics.
    Exits with status 1 when no API key is configured.
    """
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row

    # Output table (idempotent).
    db.executescript("""
    CREATE TABLE IF NOT EXISTS debates (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        topic TEXT,
        source_podcast TEXT, source_episode TEXT, source_idx INTEGER,
        target_podcast TEXT, target_episode TEXT, target_idx INTEGER,
        agreement TEXT, divergence TEXT, insight TEXT, score REAL
    );
    CREATE INDEX IF NOT EXISTS idx_debates_topic ON debates(topic);
    """)

    # Get strongest cross-podcast links (source and target in different podcasts).
    rows = db.execute("""
    SELECT sl.podcast_id, sl.source_episode, sl.source_idx,
    sl.target_podcast, sl.target_episode, sl.target_idx, sl.score,
    p1.text as source_text, p2.text as target_text,
    pc1.name as source_podcast_name, pc2.name as target_podcast_name,
    e1.title as source_title, e1.guest as source_guest,
    e2.title as target_title, e2.guest as target_guest
    FROM semantic_links sl
    JOIN paragraphs p1 ON sl.podcast_id = p1.podcast_id AND sl.source_episode = p1.episode_id AND sl.source_idx = p1.idx
    JOIN paragraphs p2 ON sl.target_podcast = p2.podcast_id AND sl.target_episode = p2.episode_id AND sl.target_idx = p2.idx
    JOIN episodes e1 ON sl.podcast_id = e1.podcast_id AND sl.source_episode = e1.id
    JOIN episodes e2 ON sl.target_podcast = e2.podcast_id AND sl.target_episode = e2.id
    JOIN podcasts pc1 ON sl.podcast_id = pc1.id
    JOIN podcasts pc2 ON sl.target_podcast = pc2.id
    WHERE sl.podcast_id != sl.target_podcast
    ORDER BY sl.score DESC
    LIMIT ?
    """, (LIMIT,)).fetchall()

    print(f"Kuratiere {len(rows)} Cross-Podcast-Debatten mit {MODEL}…")

    # Resume support: keys of already-curated pairs.
    # BUGFIX: join key parts with '/' — bare concatenation is ambiguous
    # (episode "1" + idx "12" collides with episode "11" + idx "2") and could
    # wrongly skip unprocessed pairs. SQL and Python sides use the same
    # separator, so old rows re-key consistently on every run.
    existing = set()
    try:
        for r in db.execute(
            "SELECT source_podcast||'/'||source_episode||'/'||source_idx||'/'||"
            "target_podcast||'/'||target_episode||'/'||target_idx as k FROM debates"
        ).fetchall():
            existing.add(r["k"])
    except Exception:
        pass  # nothing to resume on first run

    processed = 0
    for row in rows:
        key = (f"{row['podcast_id']}/{row['source_episode']}/{row['source_idx']}/"
               f"{row['target_podcast']}/{row['target_episode']}/{row['target_idx']}")
        if key in existing:
            continue

        meta_a = f"{row['source_podcast_name']} / {row['source_episode']}: {row['source_title']} ({row['source_guest']})"
        meta_b = f"{row['target_podcast_name']} / {row['target_episode']}: {row['target_title']} ({row['target_guest']})"

        # Truncate paragraph text to keep the prompt bounded.
        result = curate_pair(client, row["source_text"][:800], meta_a, row["target_text"][:800], meta_b)

        db.execute(
            "INSERT INTO debates (topic, source_podcast, source_episode, source_idx, "
            "target_podcast, target_episode, target_idx, agreement, divergence, insight, score) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (result.get("topic", ""), row["podcast_id"], row["source_episode"], row["source_idx"],
             row["target_podcast"], row["target_episode"], row["target_idx"],
             result.get("agreement", ""), result.get("divergence", ""),
             result.get("insight", ""), row["score"])
        )

        processed += 1
        if processed % 10 == 0:
            db.commit()  # periodic commit so progress survives interruption
            print(f" {processed} kuratiert…")

        # Rate limiting between API calls.
        time.sleep(0.3)

    db.commit()

    # Print the 20 most frequent curated topics.
    topics = db.execute("SELECT topic, COUNT(*) as c FROM debates GROUP BY topic ORDER BY c DESC LIMIT 20").fetchall()
    print(f"\nFertig: {processed} Debatten kuratiert.")
    print("Top-Themen:")
    for t in topics:
        print(f" {t['topic']}: {t['c']}")

    db.close()


if __name__ == "__main__":
    main()
|
||||||
184
scripts/detect_gaps.py
Normal file
184
scripts/detect_gaps.py
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
#!/usr/bin/env python3
"""#14 Gap detector: embedding-cluster analysis to find discourse gaps.

Clusters all paragraphs, measures density per podcast, and identifies
asymmetric and empty clusters.

Usage:
    python3 detect_gaps.py [db-path] [output-json]
"""

import json
import sqlite3
import sys

import numpy as np

# CLI arguments: database path and where to write the JSON result.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "data/gaps_analysis.json"

# Number of K-Means clusters; higher gives a finer topical resolution.
N_CLUSTERS = 30
|
||||||
|
|
||||||
|
|
||||||
|
def load_embeddings(db_path):
    """Load all paragraph embeddings plus metadata from the SQLite db.

    Returns a tuple ``(vectors, meta)`` where ``vectors`` is an (n, dim)
    float32 array and ``meta`` is a list of dicts (text truncated to 200
    chars) aligned with the rows of ``vectors``.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    records = conn.execute(
        "SELECT p.id, p.podcast_id, p.episode_id, p.idx, p.text, p.embedding, e.title, e.guest "
        "FROM paragraphs p JOIN episodes e ON p.podcast_id = e.podcast_id AND p.episode_id = e.id "
        "WHERE p.embedding IS NOT NULL"
    ).fetchall()
    conn.close()

    # Metadata and raw float32 vectors stay index-aligned.
    meta = [
        {
            "id": rec["id"], "podcast_id": rec["podcast_id"],
            "episode_id": rec["episode_id"], "idx": rec["idx"],
            "text": rec["text"][:200], "title": rec["title"], "guest": rec["guest"]
        }
        for rec in records
    ]
    vectors = [np.frombuffer(rec["embedding"], dtype=np.float32) for rec in records]

    return np.array(vectors), meta
|
||||||
|
|
||||||
|
|
||||||
|
def kmeans_simple(vectors, k, max_iter=50):
    """Plain K-Means without the sklearn dependency.

    Deterministic: a fixed-seed generator picks the initial centroids.
    Returns ``(labels, centroids)`` with ``labels`` an int array of length
    ``len(vectors)`` and ``centroids`` of shape ``(k, dim)``.
    """
    n_points = len(vectors)

    # Initialize centroids from k distinct random points (seeded for reproducibility).
    rng = np.random.default_rng(42)
    seed_idx = rng.choice(n_points, k, replace=False)
    centroids = vectors[seed_idx].copy()

    labels = np.zeros(n_points, dtype=int)

    for _ in range(max_iter):
        # Assignment step: nearest centroid per point (Euclidean).
        distances = np.linalg.norm(vectors[:, None] - centroids[None, :], axis=2)
        assignment = np.argmin(distances, axis=1)

        if np.all(assignment == labels):
            break  # converged: no point changed cluster
        labels = assignment

        # Update step: move each non-empty cluster's centroid to its mean.
        for cluster in range(k):
            members = labels == cluster
            if members.sum() > 0:
                centroids[cluster] = vectors[members].mean(axis=0)

    return labels, centroids
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Cluster all paragraph embeddings and report per-podcast discourse gaps.

    Pipeline: load embeddings -> L2-normalize -> K-Means -> per-cluster
    podcast counts, representative texts, and a keyword label -> detect
    clusters that one podcast barely covers -> write everything to OUTPUT.
    """
    print("Lade Embeddings…")
    vectors, meta = load_embeddings(DB_PATH)
    print(f" {len(vectors)} Absätze geladen.")

    # Normalize to unit length so Euclidean distance tracks cosine similarity.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1  # guard against zero vectors
    vectors_norm = vectors / norms

    print(f"Clustere in {N_CLUSTERS} Gruppen…")
    labels, centroids = kmeans_simple(vectors_norm, N_CLUSTERS)

    # Analyze each cluster.
    podcasts = sorted(set(entry["podcast_id"] for entry in meta))
    clusters = []

    for cluster_id in range(N_CLUSTERS):
        mask = labels == cluster_id
        indices = np.where(mask)[0]
        members = [meta[i] for i in indices]

        # How many member paragraphs each podcast contributes.
        per_podcast = {p: 0 for p in podcasts}
        for member in members:
            per_podcast[member["podcast_id"]] += 1

        # Representative texts: the five members closest to the centroid.
        if len(indices) > 0:
            dists = np.linalg.norm(vectors_norm[indices] - centroids[cluster_id], axis=1)
            top_indices = indices[np.argsort(dists)[:5]]
            representative = [{"text": meta[i]["text"], "episode": meta[i]["episode_id"],
                              "podcast": meta[i]["podcast_id"], "guest": meta[i]["guest"]} for i in top_indices]
        else:
            representative = []

        # Derive a topic label via simple word frequency over member texts,
        # excluding common German stopwords.
        words = " ".join(member["text"][:100] for member in members[:20]).lower().split()
        stop = {"der", "die", "das", "und", "in", "von", "zu", "den", "ist", "ein", "eine", "es", "mit",
                "auf", "für", "an", "sich", "nicht", "auch", "dass", "wir", "man", "aber", "des", "dem",
                "werden", "oder", "als", "wie", "hat", "ich", "sind", "was", "so", "haben", "dann",
                "wenn", "noch", "schon", "kann", "wird", "hier", "über", "nach", "nur", "bei", "da",
                "diese", "dieser", "dieses", "einem", "einer", "also", "ja", "mal", "war", "sehr",
                "gibt", "aus", "zum", "zur", "mehr", "immer", "weil", "uns", "sie", "er", "vom"}
        word_freq = {}
        for word in words:
            word = word.strip(".,;:!?\"'()[]")
            if len(word) > 3 and word not in stop:
                word_freq[word] = word_freq.get(word, 0) + 1
        top_words = sorted(word_freq.items(), key=lambda item: -item[1])[:5]
        label = ", ".join(word for word, _ in top_words) if top_words else f"Cluster {cluster_id}"

        clusters.append({
            "id": cluster_id,
            "label": label,
            "size": int(mask.sum()),
            "per_podcast": per_podcast,
            "representative": representative,
        })

    # Biggest clusters first.
    clusters.sort(key=lambda entry: -entry["size"])

    # A "gap" = a non-trivial cluster that one podcast barely touches.
    gaps = []
    for cluster in clusters:
        total = cluster["size"]
        if total < 3:
            continue  # too small to call anything a gap

        for podcast in podcasts:
            count = cluster["per_podcast"].get(podcast, 0)
            other_total = total - count
            if other_total > 10 and count <= 2:
                gaps.append({
                    "cluster_label": cluster["label"],
                    "cluster_size": total,
                    "missing_in": podcast,
                    "present_in_count": other_total,
                    "representative": cluster["representative"][:3],
                })

    # Most asymmetric gaps first.
    gaps.sort(key=lambda entry: -entry["present_in_count"])

    result = {
        "total_paragraphs": len(meta),
        "podcasts": podcasts,
        "n_clusters": N_CLUSTERS,
        "clusters": clusters,
        "gaps": gaps[:30],
    }

    with open(OUTPUT, "w") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{len(clusters)} Cluster, {len(gaps)} Leerstellen identifiziert.")
    print(f"\nTop-Leerstellen:")
    for g in gaps[:10]:
        print(f" [{g['missing_in']}] fehlt: \"{g['cluster_label']}\" ({g['present_in_count']} Absätze im anderen Podcast)")

    print(f"\nCluster-Größen:")
    for cl in clusters[:15]:
        bar = " | ".join(f"{p}:{cl['per_podcast'][p]}" for p in podcasts)
        print(f" {cl['label'][:40]:40s} ({cl['size']:4d}) — {bar}")

    print(f"\nErgebnis: {OUTPUT}")


if __name__ == "__main__":
    main()
|
||||||
167
scripts/detect_narrative_shift.py
Normal file
167
scripts/detect_narrative_shift.py
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
#!/usr/bin/env python3
"""#15 Narrative shift detection: how does framing shift over time?

Computes embedding drift per topic cluster over the episode sequence.
Spikes = framing shifts.

Usage:
    python3 detect_narrative_shift.py [db-path] [output-json]
"""

import json
import sqlite3
import sys

import numpy as np

# CLI arguments: database path and where to write the JSON result.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "data/narrative_shifts.json"

# Topic keywords used to assign paragraphs to theme clusters (German corpus).
THEMES = {
    "klimaschutz": ["klima", "klimawandel", "co2", "emission", "erderwärmung", "klimaschutz", "temperatur", "paris"],
    "sicherheit": ["sicherheit", "verteidigung", "militär", "nato", "krieg", "frieden", "abschreckung", "bundeswehr"],
    "demokratie": ["demokratie", "demokratisch", "wahl", "parlament", "abstimmung", "beteiligung", "grundgesetz"],
    "ungleichheit": ["ungleichheit", "armut", "vermögen", "reichtum", "einkommen", "verteilung", "gini"],
    "digitalisierung": ["digital", "plattform", "algorithmus", "google", "meta", "tiktok", "internet", "daten"],
    "bildung": ["bildung", "schule", "universität", "lernen", "ausbildung", "studier", "lehre"],
    "gesundheit": ["gesundheit", "krankheit", "allergie", "medizin", "prävention", "gesundheitssystem"],
    "migration": ["migration", "flucht", "integration", "zuwanderung", "fachkräfte", "asyl"],
    "wirtschaft": ["wirtschaft", "wachstum", "bip", "konjunktur", "inflation", "arbeitsmarkt", "produktivität"],
    "freiheit": ["freiheit", "grundrecht", "diskriminierung", "gleichstellung", "meinungsfreiheit"],
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(db_path):
    """Fetch all embedded paragraphs joined with episode metadata.

    Rows come back ordered by podcast, season ("staffel"), episode and
    paragraph index so downstream drift analysis sees chronological order.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    records = conn.execute(
        "SELECT p.podcast_id, p.episode_id, p.idx, p.text, p.embedding, e.staffel "
        "FROM paragraphs p JOIN episodes e ON p.podcast_id = e.podcast_id AND p.episode_id = e.id "
        "WHERE p.embedding IS NOT NULL "
        "ORDER BY p.podcast_id, e.staffel, p.episode_id, p.idx"
    ).fetchall()
    conn.close()
    return records
|
||||||
|
|
||||||
|
|
||||||
|
def classify_theme(text, themes=None):
    """Assign a paragraph to a theme via keyword matching.

    Generalized: the theme lexicon is now an optional parameter (defaulting
    to the module-level THEMES), so the function can be reused/tested with
    other keyword sets; existing ``classify_theme(text)`` callers are
    unaffected.

    Args:
        text: paragraph text to classify.
        themes: optional mapping of theme name -> list of keywords.

    Returns:
        The theme whose keywords match most often (substring match on the
        lower-cased text), or None when no keyword matches at all.
    """
    if themes is None:
        themes = THEMES  # module-level default lexicon
    text_lower = text.lower()
    scores = {}
    for theme, keywords in themes.items():
        # One point per keyword that occurs anywhere in the text.
        score = sum(1 for kw in keywords if kw in text_lower)
        if score > 0:
            scores[theme] = score
    if not scores:
        return None
    return max(scores, key=scores.get)
|
||||||
|
|
||||||
|
|
||||||
|
def cosine_distance(a, b):
    """Return 1 - cosine similarity of two vectors.

    Degenerate case: if either vector has zero norm, the distance is
    defined as 1.0 (maximally dissimilar) instead of dividing by zero.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 1.0
    return 1.0 - np.dot(a, b) / (norm_a * norm_b)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Track per-theme embedding drift across the episode sequence.

    Groups embedded paragraphs by (podcast, episode, theme), computes a
    centroid per group, measures the cosine drift between consecutive
    episodes of the same theme, flags spikes (> 1.5x median), and writes
    the ranked result to OUTPUT.
    """
    print("Lade Daten…")
    rows = load_data(DB_PATH)
    print(f" {len(rows)} Absätze geladen.")

    # Group vectors: podcast -> episode -> theme -> list of vectors.
    podcasts = {}
    for row in rows:
        theme = classify_theme(row["text"])
        if theme is None:
            continue  # paragraph matches no theme lexicon

        vec = np.frombuffer(row["embedding"], dtype=np.float32)
        episode_bucket = podcasts.setdefault(row["podcast_id"], {}).setdefault(
            row["episode_id"], {"staffel": row["staffel"], "themes": {}}
        )
        episode_bucket["themes"].setdefault(theme, []).append(vec)

    # Per (podcast, theme): centroid trajectory over episodes + drift stats.
    shifts = {}

    for pid, episodes in podcasts.items():
        # Lexicographic sort = chronological for the SxEy id format.
        ep_list = sorted(episodes.keys())

        for theme in THEMES:
            centroids = []
            ep_labels = []

            for eid in ep_list:
                theme_vecs = episodes[eid]["themes"].get(theme)
                if theme_vecs is None:
                    continue
                centroids.append(np.array(theme_vecs).mean(axis=0))
                ep_labels.append(eid)

            if len(centroids) < 3:
                continue  # too few data points to talk about drift

            # Drift between consecutive episodes that cover the theme.
            drifts = []
            for pos in range(1, len(centroids)):
                step = cosine_distance(centroids[pos - 1], centroids[pos])
                drifts.append({
                    "from": ep_labels[pos - 1],
                    "to": ep_labels[pos],
                    "drift": round(float(step), 4),
                })

            # Spikes: drifts clearly above the typical (median) step.
            drift_vals = [d["drift"] for d in drifts]
            median = float(np.median(drift_vals))
            mean = float(np.mean(drift_vals))
            spikes = [d for d in drifts if d["drift"] > median * 1.5]

            shifts[f"{pid}/{theme}"] = {
                "podcast": pid,
                "theme": theme,
                "episodes": ep_labels,
                "n_episodes": len(ep_labels),
                "mean_drift": round(mean, 4),
                "median_drift": round(median, 4),
                "max_drift": round(max(drift_vals), 4),
                "drifts": drifts,
                "spikes": spikes,
            }

    # Largest framing shifts first.
    sorted_shifts = sorted(shifts.values(), key=lambda s: -s["max_drift"])

    result = {
        "total_themes_tracked": len(sorted_shifts),
        "themes": list(THEMES.keys()),
        "shifts": sorted_shifts,
    }

    with open(OUTPUT, "w") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n{len(sorted_shifts)} Themen-Verläufe berechnet.")
    print(f"\nGrößte Framing-Shifts:")
    for s in sorted_shifts[:10]:
        spike_info = ""
        if s["spikes"]:
            spike_info = " | Spikes: " + ", ".join(f"{sp['from']}→{sp['to']}({sp['drift']:.3f})" for sp in s["spikes"][:3])
        print(f" {s['podcast']}/{s['theme']:15s} — max_drift={s['max_drift']:.4f}, mean={s['mean_drift']:.4f}{spike_info}")

    print(f"\nErgebnis: {OUTPUT}")


if __name__ == "__main__":
    main()
|
||||||
140
scripts/extract_claims.py
Normal file
140
scripts/extract_claims.py
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
#!/usr/bin/env python3
"""#16 Claim-verification layer: extract checkable claims from transcripts.

Usage:
    DASHSCOPE_API_KEY=... python3 extract_claims.py [db-path]
"""

import json
import os
import sqlite3
import sys
import time

from openai import OpenAI

# CLI argument: SQLite database path.
DB_PATH = sys.argv[1] if len(sys.argv) > 1 else "data/db.sqlite"
# DashScope's OpenAI-compatible endpoint; key comes from the environment.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
# Cheaper model: this job processes the whole corpus.
MODEL = "qwen-turbo"
# Paragraphs per API call.
BATCH_SIZE = 3

# Prompt is German because the transcripts are German.
SYSTEM_PROMPT = """Du bist ein Faktenprüfer. Du erhältst Podcast-Transkript-Absätze.
Extrahiere ALLE prüfbaren faktischen Behauptungen (Zahlen, Statistiken, kausale Aussagen, Verweise auf Studien/Gesetze).
KEINE Meinungen, Bewertungen oder rhetorische Fragen.

Antworte NUR mit einem JSON-Array. Für jeden Absatz ein Objekt:
[{"paragraph_idx": 0, "claims": [{"text": "Die Behauptung", "type": "statistic|causal|reference|number", "verifiable": true}]}]

Wenn ein Absatz keine prüfbaren Claims enthält: {"paragraph_idx": 0, "claims": []}"""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_batch(client, paragraphs):
    """Extract verifiable claims from a batch of paragraphs in one API call.

    Returns the model's parsed JSON array (one entry per paragraph). On any
    API or parse failure, returns a list of empty-claim entries carrying the
    error message, so the caller can keep iterating.
    """
    # Number each paragraph so the model can reference it by index.
    prompt = "".join(
        f"\n--- Absatz {pos} ({para['episode_id']}, {para['start_time']:.0f}s) ---\n{para['text'][:600]}\n"
        for pos, para in enumerate(paragraphs)
    )

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,
            max_tokens=1000,
        )
        raw = response.choices[0].message.content.strip()
        # Unwrap an optional ``` / ```json fence before JSON parsing.
        if raw.startswith("```"):
            raw = raw.split("```")[1].strip()
            if raw.startswith("json"):
                raw = raw[4:].strip()
        return json.loads(raw)
    except Exception as exc:
        return [{"paragraph_idx": pos, "claims": [], "error": str(exc)} for pos in range(len(paragraphs))]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Extract verifiable claims from all transcript paragraphs into SQLite.

    Reads rows from the ``paragraphs`` table, sends them to Qwen in batches
    of BATCH_SIZE, and stores extracted claims in the ``claims`` table.
    Designed to be re-runnable: already-processed paragraphs are skipped.
    """
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row  # enable column access by name

    # Create output table
    db.executescript("""
    CREATE TABLE IF NOT EXISTS claims (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        podcast_id TEXT, episode_id TEXT, paragraph_idx INTEGER,
        claim_text TEXT, claim_type TEXT, verifiable BOOLEAN,
        start_time REAL
    );
    CREATE INDEX IF NOT EXISTS idx_claims_episode ON claims(podcast_id, episode_id);
    CREATE INDEX IF NOT EXISTS idx_claims_type ON claims(claim_type);
    """)

    # Check what's already processed.
    # NOTE(review): a paragraph that yielded zero claims inserts no rows and
    # therefore gets no resume key — it is re-sent to the API on every run.
    # Confirm whether that repeated cost is acceptable.
    processed_keys = set()
    try:
        for r in db.execute("SELECT DISTINCT podcast_id||'/'||episode_id||'/'||paragraph_idx as k FROM claims").fetchall():
            processed_keys.add(r["k"])
    except Exception:
        pass  # claims table may not exist yet on a fresh database

    # Get all paragraphs
    rows = db.execute(
        "SELECT id, podcast_id, episode_id, idx, start_time, text FROM paragraphs ORDER BY podcast_id, episode_id, idx"
    ).fetchall()

    # Filter already processed
    todo = [r for r in rows if f"{r['podcast_id']}/{r['episode_id']}/{r['idx']}" not in processed_keys]
    print(f"Extrahiere Claims: {len(todo)} Absätze zu verarbeiten ({len(rows) - len(todo)} bereits fertig)")

    total_claims = 0
    for i in range(0, len(todo), BATCH_SIZE):
        batch = todo[i:i + BATCH_SIZE]
        paras = [{"episode_id": r["episode_id"], "start_time": r["start_time"] or 0, "text": r["text"]} for r in batch]

        results = extract_batch(client, paras)

        for j, result in enumerate(results):
            # Guard: the model may return more objects than paragraphs sent.
            if j >= len(batch):
                break
            row = batch[j]
            for claim in result.get("claims", []):
                db.execute(
                    "INSERT INTO claims (podcast_id, episode_id, paragraph_idx, claim_text, claim_type, verifiable, start_time) "
                    "VALUES (?, ?, ?, ?, ?, ?, ?)",
                    (row["podcast_id"], row["episode_id"], row["idx"],
                     claim.get("text", ""), claim.get("type", "unknown"),
                     claim.get("verifiable", True), row["start_time"])
                )
                total_claims += 1

        # Commit and print progress every 20 batches.
        if (i // BATCH_SIZE) % 20 == 0:
            db.commit()
            print(f" {min(i + BATCH_SIZE, len(todo))}/{len(todo)} Absätze, {total_claims} Claims bisher")

        time.sleep(0.2)  # throttle between API calls

    db.commit()

    # Stats
    stats = db.execute("SELECT claim_type, COUNT(*) as c FROM claims GROUP BY claim_type ORDER BY c DESC").fetchall()
    podcast_stats = db.execute("SELECT podcast_id, COUNT(*) as c FROM claims GROUP BY podcast_id").fetchall()
    print(f"\nFertig: {total_claims} Claims extrahiert.")
    print("Nach Typ:")
    for s in stats:
        print(f" {s['claim_type']}: {s['c']}")
    print("Nach Podcast:")
    for s in podcast_stats:
        print(f" {s['podcast_id']}: {s['c']}")

    db.close()


if __name__ == "__main__":
    main()
||||||
143
scripts/extract_questions.py
Normal file
143
scripts/extract_questions.py
Normal file
@ -0,0 +1,143 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""#17 Frage-Antwort-Asymmetrie: Extrahiere Fragen aus Transkripten.
|
||||||
|
|
||||||
|
Nutzung:
|
||||||
|
DASHSCOPE_API_KEY=... python3 extract_questions.py [db-pfad]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# --- Runtime configuration ---------------------------------------------------
# argv[1] (optional): path to the SQLite database.
DB_PATH = (sys.argv[1:2] or ["data/db.sqlite"])[0]
API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
MODEL = "qwen-turbo"
BATCH_SIZE = 3  # paragraphs per API call
|
||||||
|
# System prompt for the Qwen question-extraction task (German, like the
# source transcripts). Instructs the model to reply ONLY with a JSON array,
# one object per paragraph, classifying every explicit/implicit question.
# The wording is runtime behavior — do not edit casually.
SYSTEM_PROMPT = """Du bist ein Diskursanalyst. Du erhältst Podcast-Transkript-Absätze.
Extrahiere ALLE Fragen (explizite und implizite). Klassifiziere jede Frage.

Antworte NUR mit einem JSON-Array:
[{"paragraph_idx": 0, "questions": [{"text": "Die Frage", "type": "genuine|rhetorical|follow_up|implicit", "answered": "yes|partial|no|self_answered"}]}]

- genuine: Echte Frage, die eine Antwort erwartet
- rhetorical: Rhetorische Frage zur Betonung
- follow_up: Rückfrage/Nachfrage
- implicit: Implizite Frage (z.B. "Da fragt man sich natürlich…")

- answered: Wird die Frage im selben Absatz beantwortet?

Wenn keine Fragen: {"paragraph_idx": 0, "questions": []}"""
|
||||||
|
|
||||||
|
def extract_batch(client, paragraphs):
    """Extract and classify questions from a batch of paragraphs via one chat call.

    Args:
        client: OpenAI-compatible client (DashScope endpoint).
        paragraphs: list of dicts with keys ``episode_id``, ``start_time``,
            ``text``.

    Returns:
        A list of per-paragraph result dicts, e.g.
        ``{"paragraph_idx": 0, "questions": [...]}``. On any API/parse error
        a fallback list with empty question lists is returned, so callers
        never have to handle exceptions.
    """
    user_msg = ""
    for i, p in enumerate(paragraphs):
        # Cap each paragraph at 600 chars to bound token usage.
        user_msg += f"\n--- Absatz {i} ({p['episode_id']}, {p['start_time']:.0f}s) ---\n{p['text'][:600]}\n"

    try:
        resp = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            temperature=0.1,  # near-deterministic classification
            max_tokens=1000,
        )
        content = resp.choices[0].message.content.strip()
        # Strip a Markdown code fence (``` or ```json) if the model added one.
        if content.startswith("```"):
            content = content.split("```")[1].strip()
            if content.startswith("json"):
                content = content[4:].strip()
        data = json.loads(content)
        # Robustness fix: the model occasionally returns a single JSON object
        # instead of an array — normalize so callers can always iterate a list
        # of dicts (the caller does `result.get("questions", [])`).
        return data if isinstance(data, list) else [data]
    except Exception:  # unused `as e` binding dropped
        # Best-effort: never abort the whole pipeline on one bad batch.
        return [{"paragraph_idx": i, "questions": []} for i in range(len(paragraphs))]
|
||||||
|
|
||||||
|
def main():
    """Extract and classify questions from all transcript paragraphs into SQLite.

    Reads rows from the ``paragraphs`` table, sends them to Qwen in batches
    of BATCH_SIZE, and stores the questions in the ``questions`` table.
    Designed to be re-runnable: already-processed paragraphs are skipped.
    """
    if not API_KEY:
        print("DASHSCOPE_API_KEY nicht gesetzt.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    db = sqlite3.connect(DB_PATH)
    db.row_factory = sqlite3.Row  # enable column access by name

    # Output table. The answered_by_* columns are never written here —
    # presumably reserved for a later pass that links questions to answering
    # paragraphs; verify against the rest of the pipeline.
    db.executescript("""
    CREATE TABLE IF NOT EXISTS questions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        podcast_id TEXT, episode_id TEXT, paragraph_idx INTEGER,
        question_text TEXT, question_type TEXT,
        answered TEXT DEFAULT 'unknown',
        answered_by_podcast TEXT, answered_by_episode TEXT, answered_by_idx INTEGER,
        start_time REAL
    );
    CREATE INDEX IF NOT EXISTS idx_questions_episode ON questions(podcast_id, episode_id);
    CREATE INDEX IF NOT EXISTS idx_questions_type ON questions(question_type);
    CREATE INDEX IF NOT EXISTS idx_questions_answered ON questions(answered);
    """)

    # Resume support: collect keys of paragraphs that already produced rows.
    # NOTE(review): paragraphs with zero questions insert no rows and thus
    # get no resume key — they are re-sent to the API on every run.
    processed_keys = set()
    try:
        for r in db.execute("SELECT DISTINCT podcast_id||'/'||episode_id||'/'||paragraph_idx as k FROM questions").fetchall():
            processed_keys.add(r["k"])
    except Exception:
        pass  # questions table may not exist yet on a fresh database

    rows = db.execute(
        "SELECT id, podcast_id, episode_id, idx, start_time, text FROM paragraphs ORDER BY podcast_id, episode_id, idx"
    ).fetchall()

    # Skip paragraphs that were already processed.
    todo = [r for r in rows if f"{r['podcast_id']}/{r['episode_id']}/{r['idx']}" not in processed_keys]
    print(f"Extrahiere Fragen: {len(todo)} Absätze ({len(rows) - len(todo)} bereits fertig)")

    total_questions = 0
    for i in range(0, len(todo), BATCH_SIZE):
        batch = todo[i:i + BATCH_SIZE]
        paras = [{"episode_id": r["episode_id"], "start_time": r["start_time"] or 0, "text": r["text"]} for r in batch]

        results = extract_batch(client, paras)

        for j, result in enumerate(results):
            # Guard: the model may return more objects than paragraphs sent.
            if j >= len(batch):
                break
            row = batch[j]
            for q in result.get("questions", []):
                db.execute(
                    "INSERT INTO questions (podcast_id, episode_id, paragraph_idx, question_text, question_type, answered, start_time) "
                    "VALUES (?, ?, ?, ?, ?, ?, ?)",
                    (row["podcast_id"], row["episode_id"], row["idx"],
                     q.get("text", ""), q.get("type", "unknown"),
                     q.get("answered", "unknown"), row["start_time"])
                )
                total_questions += 1

        # Commit and print progress every 20 batches.
        if (i // BATCH_SIZE) % 20 == 0:
            db.commit()
            print(f" {min(i + BATCH_SIZE, len(todo))}/{len(todo)} Absätze, {total_questions} Fragen bisher")

        time.sleep(0.2)  # throttle between API calls

    db.commit()

    # Summary statistics.
    stats = db.execute("SELECT question_type, COUNT(*) as c FROM questions GROUP BY question_type ORDER BY c DESC").fetchall()
    answered_stats = db.execute("SELECT answered, COUNT(*) as c FROM questions GROUP BY answered ORDER BY c DESC").fetchall()
    print(f"\nFertig: {total_questions} Fragen extrahiert.")
    print("Nach Typ:")
    for s in stats:
        print(f" {s['question_type']}: {s['c']}")
    print("Beantwortet:")
    for s in answered_stats:
        print(f" {s['answered']}: {s['c']}")

    db.close()


if __name__ == "__main__":
    main()
|
||||||
35
scripts/run_all_qwen.sh
Executable file
35
scripts/run_all_qwen.sh
Executable file
@ -0,0 +1,35 @@
|
|||||||
|
#!/bin/bash
# Sequential execution of all Qwen analysis scripts.
# Serializing the runs avoids SQLite lock contention on the shared DB.

set -e
cd "$(dirname "$0")/.."
DB="data/db.sqlite"

# Fetch the API key from the macOS keychain.
# Fix: `export VAR=$(cmd)` returns export's status, so a failing keychain
# lookup would NOT trip `set -e`. Assigning first makes the assignment's
# status the command substitution's status, which `set -e` does catch.
DASHSCOPE_API_KEY=$(security find-generic-password -s qwen-api -w)
export DASHSCOPE_API_KEY

echo "$(date): Starte Qwen-Analyse-Pipeline"
echo "================================================"

echo ""
echo "$(date): #18 Debatten kuratieren (100 Paare, schnellster Task)…"
python3 scripts/curate_debates.py "$DB" 100
echo "$(date): #18 fertig."

echo ""
echo "$(date): #13 Argumentketten klassifizieren (500 Paare)…"
python3 scripts/analyse_arguments.py "$DB" 500
echo "$(date): #13 fertig."

echo ""
echo "$(date): #17 Fragen extrahieren (3727 Absätze)…"
python3 scripts/extract_questions.py "$DB"
echo "$(date): #17 fertig."

echo ""
echo "$(date): #16 Claims extrahieren (3727 Absätze)…"
python3 scripts/extract_claims.py "$DB"
echo "$(date): #16 fertig."

echo ""
echo "$(date): Alle Qwen-Tasks abgeschlossen."
echo "DONE" > /tmp/qwen_pipeline_done
||||||
@ -182,6 +182,21 @@
|
|||||||
margin-right: 6px;
|
margin-right: 6px;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ── Word-level highlighting (#12) ── */
.word {
  transition: background 0.1s, color 0.1s;
  border-radius: 2px; padding: 0 1px;
  cursor: pointer;
}
.word:hover { background: var(--surface2); }
.word.word-active {
  /* Fix: `var(--accent-green)44` is invalid — a hex alpha digit pair cannot
     be appended to a var() reference. color-mix() at 27% (≈ 0x44/0xFF)
     produces the intended translucent highlight. */
  background: color-mix(in srgb, var(--accent-green) 27%, transparent);
  color: var(--text);
}
.word.word-spoken {
  color: var(--text);
}
|
||||||
|
|
||||||
/* ── Search Results ── */
|
/* ── Search Results ── */
|
||||||
.search-result {
|
.search-result {
|
||||||
background: var(--surface2); border-radius: 8px; padding: 12px;
|
background: var(--surface2); border-radius: 8px; padding: 12px;
|
||||||
@ -516,6 +531,8 @@ const TranscriptView = {
|
|||||||
paragraphs: null,
|
paragraphs: null,
|
||||||
userScrolled: false,
|
userScrolled: false,
|
||||||
activeIdx: -1,
|
activeIdx: -1,
|
||||||
|
words: null, // Word-level timestamps for current episode
|
||||||
|
activeWordIdx: -1, // Currently highlighted word
|
||||||
|
|
||||||
async show(episodeId, seekTime) {
|
async show(episodeId, seekTime) {
|
||||||
const epData = await this.loadEpisodeTranscript(episodeId);
|
const epData = await this.loadEpisodeTranscript(episodeId);
|
||||||
@ -540,10 +557,35 @@ const TranscriptView = {
|
|||||||
let html = `<h2 style="color:${staffel.color}">${ep.id}: ${ep.title} — Transkript</h2>`;
|
let html = `<h2 style="color:${staffel.color}">${ep.id}: ${ep.title} — Transkript</h2>`;
|
||||||
html += `<p class="subtitle">${ep.guest}</p>`;
|
html += `<p class="subtitle">${ep.guest}</p>`;
|
||||||
|
|
||||||
|
// Try to load word-level timestamps
|
||||||
|
this.words = null;
|
||||||
|
this.activeWordIdx = -1;
|
||||||
|
if (CURRENT_PODCAST) {
|
||||||
|
try {
|
||||||
|
const wr = await fetch(`${API_BASE}/api/podcasts/${CURRENT_PODCAST}/transcript/${episodeId}/words`);
|
||||||
|
if (wr.ok) {
|
||||||
|
const wd = await wr.json();
|
||||||
|
if (wd.available && wd.words.length > 0) this.words = wd.words;
|
||||||
|
}
|
||||||
|
} catch (e) {}
|
||||||
|
}
|
||||||
|
|
||||||
this.paragraphs.forEach((p, i) => {
|
this.paragraphs.forEach((p, i) => {
|
||||||
html += `<div class="transcript-para" data-idx="${i}" onclick="TranscriptView.seekTo(${p.start})">`;
|
html += `<div class="transcript-para" data-idx="${i}" onclick="TranscriptView.seekTo(${p.start})">`;
|
||||||
html += `<span class="ts">${fmtTime(p.start)}</span>`;
|
html += `<span class="ts">${fmtTime(p.start)}</span>`;
|
||||||
html += escHtml(p.text);
|
if (this.words) {
|
||||||
|
// Render words as clickable spans with timestamps
|
||||||
|
const paraWords = this.words.filter(w => w.start >= p.start - 0.5 && w.start < (p.end || p.start + 999));
|
||||||
|
if (paraWords.length > 0) {
|
||||||
|
paraWords.forEach(w => {
|
||||||
|
html += `<span class="word" data-ws="${w.start}" data-we="${w.end}" onclick="event.stopPropagation();TranscriptView.seekTo(${w.start})">${escHtml(w.word)} </span>`;
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
html += escHtml(p.text);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
html += escHtml(p.text);
|
||||||
|
}
|
||||||
html += `</div>`;
|
html += `</div>`;
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -557,21 +599,45 @@ const TranscriptView = {
|
|||||||
|
|
||||||
syncToTime(time) {
|
syncToTime(time) {
|
||||||
if (!this.visible || !this.paragraphs) return;
|
if (!this.visible || !this.paragraphs) return;
|
||||||
|
|
||||||
|
// Paragraph-level sync
|
||||||
let idx = -1;
|
let idx = -1;
|
||||||
for (let i = 0; i < this.paragraphs.length; i++) {
|
for (let i = 0; i < this.paragraphs.length; i++) {
|
||||||
if (time >= this.paragraphs[i].start) idx = i;
|
if (time >= this.paragraphs[i].start) idx = i;
|
||||||
else break;
|
else break;
|
||||||
}
|
}
|
||||||
if (idx === this.activeIdx) return;
|
if (idx !== this.activeIdx) {
|
||||||
this.activeIdx = idx;
|
this.activeIdx = idx;
|
||||||
|
document.querySelectorAll('.transcript-para.active').forEach(el => el.classList.remove('active'));
|
||||||
|
if (idx >= 0) {
|
||||||
|
const el = document.querySelector(`.transcript-para[data-idx="${idx}"]`);
|
||||||
|
if (el) {
|
||||||
|
el.classList.add('active');
|
||||||
|
if (!this.userScrolled) {
|
||||||
|
el.scrollIntoView({ behavior: 'smooth', block: 'center' });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
document.querySelectorAll('.transcript-para.active').forEach(el => el.classList.remove('active'));
|
// Word-level sync (#12)
|
||||||
if (idx >= 0) {
|
if (this.words) {
|
||||||
const el = document.querySelector(`.transcript-para[data-idx="${idx}"]`);
|
const prev = document.querySelector('.word.word-active');
|
||||||
if (el) {
|
if (prev) prev.classList.replace('word-active', 'word-spoken');
|
||||||
el.classList.add('active');
|
|
||||||
if (!this.userScrolled) {
|
// Find current word by time
|
||||||
el.scrollIntoView({ behavior: 'smooth', block: 'center' });
|
const wordEl = document.querySelector(`.word[data-ws]`);
|
||||||
|
if (wordEl) {
|
||||||
|
const allWords = document.querySelectorAll('.word[data-ws]');
|
||||||
|
for (const w of allWords) {
|
||||||
|
const ws = parseFloat(w.dataset.ws);
|
||||||
|
const we = parseFloat(w.dataset.we);
|
||||||
|
if (time >= ws && time < we) {
|
||||||
|
w.classList.add('word-active');
|
||||||
|
break;
|
||||||
|
} else if (time >= we) {
|
||||||
|
w.classList.add('word-spoken');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user