372 lines
11 KiB
Python
372 lines
11 KiB
Python
|
|
"""Themen × Anträge Matching fuer das Aktuelle-Themen-Dashboard
|
|||
|
|
(#170 Phase 2).
|
|||
|
|
|
|||
|
|
Verschneidet News-Artikel-Embeddings (aus news_articles.summary_embedding)
|
|||
|
|
mit Antrag-Embeddings (assessments.summary_embedding) per Cosine-Similarity.
|
|||
|
|
Liefert pro News-Artikel die Top-K-passendsten Anträge.
|
|||
|
|
|
|||
|
|
Reuse:
|
|||
|
|
- ``embeddings.cosine_similarity`` fuer den Vektor-Vergleich
|
|||
|
|
- Beide Tabellen nutzen denselben Embedding-Modell-Vektorraum (qwen v4),
|
|||
|
|
daher direkter Cross-Vergleich moeglich
|
|||
|
|
- Filter ueber ``embedding_model``-Spalte, falls Migration laeuft
|
|||
|
|
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
import sqlite3
|
|||
|
|
from collections import defaultdict
|
|||
|
|
from datetime import datetime, timezone
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Optional
|
|||
|
|
|
|||
|
|
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _load_embeddings(
    db_path: Path,
    table: str,
    select_cols: list[str],
    where_extra: str = "",
    params: tuple = (),
) -> list[dict]:
    """Load rows of a table that carries a ``summary_embedding`` column.

    Rows without an embedding are excluded by the query itself. Rows whose
    ``embedding_model`` differs from ``embeddings.EMBEDDING_MODEL_READ`` and
    rows whose embedding is not decodable JSON are dropped afterwards.

    Args:
        db_path: Location of the SQLite database file.
        table: Table to read from (interpolated into the SQL text).
        select_cols: Columns to fetch in addition to ``summary_embedding``
            and ``embedding_model`` (interpolated into the SQL text).
        where_extra: SQL fragment appended verbatim after the
            ``summary_embedding IS NOT NULL`` condition.
        params: Parameters handed to ``execute`` together with the statement.

    Returns:
        One dict per surviving row containing every selected column plus the
        decoded embedding under ``"_vec"``; an empty list when the database
        file does not exist.
    """
    from . import embeddings as emb

    if not Path(db_path).exists():
        return []

    column_list = ", ".join(select_cols)
    query = (
        f"SELECT {column_list}, summary_embedding, embedding_model "
        f"FROM {table} "
        f"WHERE summary_embedding IS NOT NULL {where_extra}"
    )

    connection = sqlite3.connect(str(db_path))
    connection.row_factory = sqlite3.Row
    try:
        fetched = connection.execute(query, params).fetchall()
    finally:
        connection.close()

    decoded = []
    for record in fetched:
        # Keep only vectors produced by the model that is currently read.
        if record["embedding_model"] == emb.EMBEDDING_MODEL_READ:
            try:
                vector = json.loads(record["summary_embedding"])
            except (json.JSONDecodeError, TypeError):
                continue
            decoded.append({**dict(record), "_vec": vector})
    return decoded
|
|||
|
|
|
|||
|
|
|
|||
|
|
def find_anträge_for_news(
    news_url: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the ``top_k`` assessments most similar to one news article.

    The article is looked up by URL in ``news_articles``; its embedding is
    compared against every assessment embedding via
    ``embeddings.cosine_similarity``.

    Args:
        news_url: URL identifying the news article.
        top_k: Maximum number of matches to return.
        min_similarity: Matches scoring below this value are discarded.
        db_path: SQLite file to use; falls back to ``settings.db_path``.

    Returns:
        Match dicts (``drucksache``, ``title``, ``bundesland``,
        ``fraktionen``, ``gwoe_score``, ``empfehlung``, ``themen``, ``datum``,
        ``similarity``) ordered by descending similarity. Empty when the
        database file is missing, the article is unknown, it has no embedding,
        its embedding belongs to a different model, or the embedding is not
        valid JSON.
    """
    # Only consult the project settings when no explicit path was supplied.
    if db_path:
        path = db_path
    else:
        from .config import settings

        path = settings.db_path
    if not Path(path).exists():
        return []

    from . import embeddings as emb

    def _decode_json_column(raw):
        """Decode a JSON text column; empty or malformed values become ``[]``."""
        try:
            return json.loads(raw or "[]")
        except (json.JSONDecodeError, TypeError):
            # One corrupt row must not abort the whole lookup.
            return []

    # 1. Load the vector of the news article.
    conn = sqlite3.connect(str(path))
    try:
        row = conn.execute(
            """SELECT summary_embedding, embedding_model
               FROM news_articles WHERE url=?""",
            (news_url,),
        ).fetchone()
    finally:
        conn.close()
    if not row or not row[0] or row[1] != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        news_vec = json.loads(row[0])
    except (json.JSONDecodeError, TypeError):
        return []

    # 2. Score every assessment that has an embedding.
    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "themen", "datum"],
    )
    ranked = []
    for a in assessments:
        sim = emb.cosine_similarity(news_vec, a["_vec"])
        if sim < min_similarity:
            continue
        ranked.append((round(sim, 3), a))
    # Rank on the rounded score that is reported; the stable sort keeps
    # database order for ties.
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    # 3. Build result dicts (and decode JSON columns) only for the winners.
    return [
        {
            "drucksache": a["drucksache"],
            "title": a["title"],
            "bundesland": a["bundesland"],
            "fraktionen": _decode_json_column(a["fraktionen"]),
            "gwoe_score": a["gwoe_score"],
            "empfehlung": a["empfehlung"],
            "themen": _decode_json_column(a["themen"]),
            "datum": a["datum"],
            "similarity": score,
        }
        for score, a in ranked[:top_k]
    ]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def find_news_for_antrag(
    drucksache: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    days_window: int = 90,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the ``top_k`` news articles most similar to one assessment.

    The assessment is looked up by ``drucksache``; its embedding is compared
    against the news embeddings via ``embeddings.cosine_similarity``. News
    older than ``days_window`` days are skipped; news whose ``datum`` cannot
    be parsed are deliberately let through.

    Args:
        drucksache: Identifier of the assessment in ``assessments``.
        top_k: Maximum number of matches to return.
        min_similarity: Matches scoring below this value are discarded.
        days_window: Size of the look-back window in days.
        db_path: SQLite file to use; falls back to ``settings.db_path``.

    Returns:
        News dicts (``url``, ``titel``, ``summary``, ``datum``, ``source``,
        ``ressort``, ``tags``, ``similarity``) ordered by descending
        similarity. Empty when the database file is missing, the assessment
        is unknown, it has no embedding, its embedding belongs to a different
        model, or the embedding is not valid JSON.
    """
    # Only consult the project settings when no explicit path was supplied.
    if db_path:
        path = db_path
    else:
        from .config import settings

        path = settings.db_path
    if not Path(path).exists():
        return []

    from . import embeddings as emb

    # 1. Load the vector of the assessment.
    conn = sqlite3.connect(str(path))
    try:
        row = conn.execute(
            """SELECT summary_embedding, embedding_model
               FROM assessments WHERE drucksache=?""",
            (drucksache,),
        ).fetchone()
    finally:
        conn.close()
    if not row or not row[0] or row[1] != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        antrag_vec = json.loads(row[0])
    except (json.JSONDecodeError, TypeError):
        return []

    # 2. Load news, apply the date window, then score.
    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400
    news = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )
    ranked = []
    for n in news:
        # Cheap date check first so stale articles never reach the
        # comparatively expensive similarity computation.
        try:
            news_ts = datetime.fromisoformat(
                n["datum"].replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, AttributeError):
            news_ts = None  # unparseable date: let the article through
        if news_ts is not None and news_ts < cutoff:
            continue
        sim = emb.cosine_similarity(antrag_vec, n["_vec"])
        if sim < min_similarity:
            continue
        ranked.append((round(sim, 3), n))
    # Rank on the rounded score that is reported; the stable sort keeps
    # database order for ties.
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    # 3. Build result dicts (and decode tags) only for the winners.
    results = []
    for score, n in ranked[:top_k]:
        try:
            tags = json.loads(n["tags"]) if n["tags"] else []
        except (json.JSONDecodeError, TypeError):
            tags = []
        results.append({
            "url": n["url"],
            "titel": n["titel"],
            "summary": n["summary"],
            "datum": n["datum"],
            "source": n["source"],
            "ressort": n["ressort"],
            "tags": tags,
            "similarity": score,
        })
    return results
|
|||
|
|
|
|||
|
|
|
|||
|
|
def aggregate_top_themen(
    days_window: int = 7,
    top_k: int = 10,
    min_similarity: float = 0.4,
    matches_per_news: int = 3,
    db_path: Optional[Path] = None,
) -> dict:
    """Return the newest news articles, each with its best-matching assessments.

    News from the last ``days_window`` days are sorted newest first and cut to
    ``top_k``; news whose ``datum`` cannot be parsed are skipped. Every
    remaining article is scored against all assessment embeddings via
    ``embeddings.cosine_similarity``.

    Args:
        days_window: Size of the look-back window in days.
        top_k: Maximum number of news buckets.
        min_similarity: Matches scoring below this value are discarded.
        matches_per_news: Maximum number of matches kept per news article.
        db_path: SQLite file to use; falls back to ``settings.db_path``.

    Returns:
        ``{
            "buckets": [{
                "news": {url, titel, summary, datum, source, ressort, tags},
                "matches": [{drucksache, title, bundesland, fraktionen,
                             gwoe_score, empfehlung, datum, similarity}]
            }, ...],
            "n_total_news": int,   # news rows loaded, before the date filter
            "filter": {days_window, top_k, min_similarity, matches_per_news}
        }``
    """
    # Echo of the effective parameters; identical on every return path.
    filter_info = {
        "days_window": days_window,
        "top_k": top_k,
        "min_similarity": min_similarity,
        "matches_per_news": matches_per_news,
    }

    # Only consult the project settings when no explicit path was supplied.
    if db_path:
        path = db_path
    else:
        from .config import settings

        path = settings.db_path
    if not Path(path).exists():
        return {"buckets": [], "n_total_news": 0, "filter": filter_info}

    from . import embeddings as emb

    def _decode_json_column(raw):
        """Decode a JSON text column; empty or malformed values become ``[]``."""
        try:
            return json.loads(raw) if raw else []
        except (json.JSONDecodeError, TypeError):
            # One corrupt row must not abort the whole dashboard request.
            return []

    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400

    news_rows = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )

    # Keep only news inside the window, remembering the parsed timestamp.
    dated = []
    for n in news_rows:
        try:
            news_ts = datetime.fromisoformat(
                n["datum"].replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, AttributeError):
            continue
        if news_ts < cutoff:
            continue
        dated.append((news_ts, n))
    # Newest first, then cut to top_k.
    dated.sort(key=lambda pair: pair[0], reverse=True)
    fresh = [n for _, n in dated[:top_k]]

    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "themen", "datum"],
    )

    buckets = []
    for n in fresh:
        ranked = []
        for a in assessments:
            sim = emb.cosine_similarity(n["_vec"], a["_vec"])
            if sim < min_similarity:
                continue
            ranked.append((round(sim, 3), a))
        # Rank on the rounded score that is reported; the stable sort keeps
        # database order for ties.
        ranked.sort(key=lambda pair: pair[0], reverse=True)

        # Build match dicts only for the entries that are actually returned.
        matches = [
            {
                "drucksache": a["drucksache"],
                "title": a["title"],
                "bundesland": a["bundesland"],
                "fraktionen": _decode_json_column(a["fraktionen"]),
                "gwoe_score": a["gwoe_score"],
                "empfehlung": a["empfehlung"],
                "datum": a["datum"],
                "similarity": score,
            }
            for score, a in ranked[:matches_per_news]
        ]
        buckets.append({
            "news": {
                "url": n["url"],
                "titel": n["titel"],
                "summary": n["summary"],
                "datum": n["datum"],
                "source": n["source"],
                "ressort": n["ressort"],
                "tags": _decode_json_column(n["tags"]),
            },
            "matches": matches,
        })

    return {
        "buckets": buckets,
        "n_total_news": len(news_rows),
        "filter": filter_info,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def aggregate_themen_zeitreihe(
    days_window: int = 30,
    db_path: Optional[Path] = None,
) -> dict:
    """Count news articles per (day, source) over the last ``days_window`` days.

    No assessment matching is involved; the result only describes how many
    articles each source produced per day (intended for a stacked-area chart).
    Rows with an empty or unparseable ``datum`` are ignored. Days without any
    article do not appear in the result.

    Args:
        days_window: Size of the look-back window in days.
        db_path: SQLite file to use; falls back to ``settings.db_path``.

    Returns:
        ``{"buckets": [day, ...], "sources": [source, ...],
        "series": {source: [count per day in bucket order]}}`` where ``day``
        is the first ten characters of ``datum`` (``YYYY-MM-DD``). All three
        are empty when the database file does not exist.
    """
    # Only consult the project settings when no explicit path was supplied.
    if db_path:
        path = db_path
    else:
        from .config import settings

        path = settings.db_path
    if not Path(path).exists():
        return {"buckets": [], "sources": [], "series": {}}

    cutoff_ts = datetime.now(timezone.utc).timestamp() - days_window * 86400
    conn = sqlite3.connect(str(path))
    try:
        rows = conn.execute(
            "SELECT datum, source FROM news_articles"
        ).fetchall()
    finally:
        conn.close()

    counts: defaultdict[tuple[str, str], int] = defaultdict(int)
    sources_seen: set[str] = set()
    days_seen: set[str] = set()
    for datum, source in rows:
        if not datum:
            continue
        try:
            ts = datetime.fromisoformat(datum.replace("Z", "+00:00")).timestamp()
        except (ValueError, AttributeError):
            continue
        if ts < cutoff_ts:
            continue
        day = datum[:10]  # YYYY-MM-DD
        sources_seen.add(source)
        days_seen.add(day)
        counts[(day, source)] += 1

    days_sorted = sorted(days_seen)
    # A NULL source cannot be compared with strings; sort it first instead of
    # letting sorted() raise TypeError. Order of string sources is unchanged.
    sources_sorted = sorted(
        sources_seen, key=lambda s: (s is not None, str(s))
    )
    series = {
        s: [counts[(d, s)] for d in days_sorted]
        for s in sources_sorted
    }
    return {
        "buckets": days_sorted,
        "sources": sources_sorted,
        "series": series,
    }
|