-
Lade Entwürfe …
+
+
+
+
+
+ News mit ähnlicher Thematik werden gebündelt — z.B. 4 Tagesschau- + 2 Bundestag-Artikel
+ zur gleichen Debatte ergeben einen Cluster mit gemeinsamem Antrags-Match.
+
+
+
+
+
+
+ Reverse-Sicht: GWÖ-bewertete Anträge mit Score ≥ 8, sortiert nach
+ aktueller Pressewirkung. Anträge ohne News-Match werden trotzdem gezeigt — als Hinweis
+ „Top-Antrag, aktuell ohne mediale Resonanz“.
+
+
+
+
+
+
+ News-Volumen pro Quelle (letzte 30 Tage)
+
+
+
+
+
+
+
+
+
+ Bisher generierte Pressemitteilungs-Entwürfe (zuletzt generiert oben).
+
+
@@ -217,6 +252,7 @@
{% block body_scripts %}
{% endblock %}
diff --git a/app/themen_matching.py b/app/themen_matching.py
index 5bb0b6f..8614be3 100644
--- a/app/themen_matching.py
+++ b/app/themen_matching.py
@@ -205,11 +205,71 @@ def find_news_for_antrag(
return scored[:top_k]
def compute_relevance(matches: list[dict]) -> dict:
    """Aggregate a relevance score, level and explanation from a match list.

    The score is the maximum of ``gwoe_score * similarity`` over all matches;
    a missing or ``None`` value counts as 0. The level is derived from the
    score:

    - score >= 4.0 -> "high"
    - score >= 2.5 -> "mid"
    - score > 0    -> "low"
    - otherwise    -> "none"

    The reason is a short text naming the strongest match. It is assembled
    purely from the match data (no LLM call).

    Args:
        matches: Match dicts. The keys read are ``gwoe_score``,
            ``similarity``, ``title`` and ``fraktionen``.

    Returns:
        Dict with ``score`` (rounded to two decimals), ``level`` and
        ``reason``.
    """
    if not matches:
        return {
            "score": 0.0,
            "level": "none",
            "reason": "Keine GWÖ-bewerteten Anträge passen zu dieser News.",
        }

    def contribution(match: dict) -> float:
        # None/missing values are treated as 0 so they never win the max.
        return (match.get("gwoe_score") or 0.0) * (match.get("similarity") or 0.0)

    # max() returns the first maximal item, so ties resolve to the earliest
    # match in the input order.
    best_match = max(matches, key=contribution)
    best_score = contribution(best_match)

    if best_score >= 4.0:
        level = "high"
    elif best_score >= 2.5:
        level = "mid"
    elif best_score > 0:
        level = "low"
    else:
        level = "none"

    # Build the explanation around the strongest match.
    fraktionen = ", ".join(best_match.get("fraktionen") or [])
    fr_clause = f" ({fraktionen})" if fraktionen else ""
    titel = (best_match.get("title") or "").strip()
    if len(titel) > 70:
        titel = titel[:67] + "…"
    reason = (
        f"GWÖ-{best_match.get('gwoe_score')}/10-Antrag „{titel}“"
        f"{fr_clause} passt mit Similarity {best_match.get('similarity')}"
    )
    if len(matches) > 1:
        reason += f" — {len(matches) - 1} weitere(r) Match(es)."
    else:
        reason += "."

    return {
        "score": round(best_score, 2),
        "level": level,
        "reason": reason,
    }
+
+
def aggregate_top_themen(
days_window: int = 7,
top_k: int = 10,
min_similarity: float = 0.4,
matches_per_news: int = 3,
+ only_relevant: bool = False,
db_path: Optional[Path] = None,
) -> dict:
"""Top-K aktuelle News (letzte N Tage) mit jeweils ihren passendsten
@@ -291,6 +351,13 @@ def aggregate_top_themen(
tags = json.loads(n["tags"]) if n["tags"] else []
except (json.JSONDecodeError, TypeError):
tags = []
+ top_matches = scored[:matches_per_news]
+ relevance = compute_relevance(top_matches)
+
+ # Pre-Filter: optional alle non-high/-mid raus
+ if only_relevant and relevance["level"] not in ("high", "mid"):
+ continue
+
buckets.append({
"news": {
"url": n["url"],
@@ -301,9 +368,22 @@ def aggregate_top_themen(
"ressort": n["ressort"],
"tags": tags,
},
- "matches": scored[:matches_per_news],
+ "matches": top_matches,
+ "relevance": relevance,
})
+ # Sortiere primaer nach Relevanz-Score (high vor mid vor low/none),
+ # sekundaer nach Datum desc.
+ level_rank = {"high": 3, "mid": 2, "low": 1, "none": 0}
+ buckets.sort(
+ key=lambda b: (
+ level_rank.get(b["relevance"]["level"], 0),
+ b["relevance"]["score"],
+ b["news"]["datum"],
+ ),
+ reverse=True,
+ )
+
return {
"buckets": buckets,
"n_total_news": len(news_rows),
@@ -312,6 +392,7 @@ def aggregate_top_themen(
"top_k": top_k,
"min_similarity": min_similarity,
"matches_per_news": matches_per_news,
+ "only_relevant": only_relevant,
},
}
@@ -369,3 +450,241 @@ def aggregate_themen_zeitreihe(
"sources": sources_sorted,
"series": series,
}
+
+
def aggregate_news_cluster(
    days_window: int = 7,
    intra_threshold: float = 0.55,
    antrag_threshold: float = 0.4,
    min_cluster_size: int = 2,
    db_path: Optional[Path] = None,
) -> dict:
    """Cluster recent news articles by embedding similarity.

    Greedy single pass: news are ordered newest first; every article that is
    not yet assigned becomes a cluster seed and absorbs all later, still
    unassigned articles whose cosine similarity to the seed is
    >= ``intra_threshold``. Clusters with fewer than ``min_cluster_size``
    members are dropped.

    For each remaining cluster the assessments are compared against the
    cluster centroid (element-wise mean of the member embeddings); up to
    three matches with similarity >= ``antrag_threshold`` are kept, best
    first.

    Args:
        days_window: Only news whose ``datum`` lies within this many days
            before now are considered.
        intra_threshold: Minimum cosine similarity to the seed for joining
            a cluster.
        antrag_threshold: Minimum cosine similarity between centroid and
            assessment for an assessment match.
        min_cluster_size: Minimum number of members a cluster needs to be
            reported.
        db_path: Database file; falls back to ``settings.db_path``.

    Returns:
        Dict with ``clusters`` (sorted by size, then by best match
        similarity, both descending), ``n_total_news`` (news inside the
        window) and ``filter`` (echo of the parameters). If the database
        file does not exist, ``clusters`` is empty and ``n_total_news`` is 0.
    """
    from .config import settings
    from . import embeddings as emb

    filter_info = {
        "days_window": days_window,
        "intra_threshold": intra_threshold,
        "antrag_threshold": antrag_threshold,
        "min_cluster_size": min_cluster_size,
    }

    path = db_path or settings.db_path
    if not Path(path).exists():
        # Same shape as the regular result so callers can always read "filter".
        return {"clusters": [], "n_total_news": 0, "filter": filter_info}

    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400
    news_rows = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )
    fresh = []
    for n in news_rows:
        try:
            ts = datetime.fromisoformat(n["datum"].replace("Z", "+00:00")).timestamp()
        except (ValueError, AttributeError):
            # Unparseable or missing date: the row cannot be placed in the window.
            continue
        if ts < cutoff:
            continue
        n["_ts"] = ts
        fresh.append(n)
    fresh.sort(key=lambda x: x["_ts"], reverse=True)

    # Greedy clustering: each unassigned article seeds a cluster and claims
    # every later unassigned article that is similar enough to the seed.
    assigned = [False] * len(fresh)
    clusters = []
    for i, seed in enumerate(fresh):
        if assigned[i]:
            continue
        assigned[i] = True
        members = [seed]
        for j in range(i + 1, len(fresh)):
            if assigned[j]:
                continue
            sim = emb.cosine_similarity(seed["_vec"], fresh[j]["_vec"])
            if sim >= intra_threshold:
                members.append(fresh[j])
                assigned[j] = True
        if len(members) >= min_cluster_size:
            clusters.append(members)

    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "datum"],
    )

    out_clusters = []
    for cluster in clusters:
        # A cluster always contains at least its seed, so it is never empty.
        dim = len(cluster[0]["_vec"])
        centroid = [
            sum(m["_vec"][k] for m in cluster) / len(cluster)
            for k in range(dim)
        ]

        # Assessments close enough to the centroid, best similarity first.
        antrag_matches = []
        for a in assessments:
            sim = emb.cosine_similarity(centroid, a["_vec"])
            if sim < antrag_threshold:
                continue
            antrag_matches.append({
                "drucksache": a["drucksache"],
                "title": a["title"],
                "bundesland": a["bundesland"],
                "fraktionen": json.loads(a["fraktionen"] or "[]"),
                "gwoe_score": a["gwoe_score"],
                "empfehlung": a["empfehlung"],
                "datum": a["datum"],
                "similarity": round(sim, 3),
            })
        antrag_matches.sort(key=lambda x: x["similarity"], reverse=True)

        # Count tags across all members; unparseable tag JSON counts as no tags.
        tag_counts = defaultdict(int)
        for m in cluster:
            try:
                tags = json.loads(m["tags"]) if m["tags"] else []
            except (json.JSONDecodeError, TypeError):
                tags = []
            for t in tags:
                tag_counts[t] += 1
        ranked_tags = sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)
        top_tags = [t for t, _ in ranked_tags[:5]]

        out_clusters.append({
            "size": len(cluster),
            "top_tags": top_tags,
            "members": [
                {
                    "url": m["url"], "titel": m["titel"],
                    "datum": m["datum"], "source": m["source"],
                    "ressort": m["ressort"],
                }
                for m in cluster
            ],
            "antrag_matches": antrag_matches[:3],
        })

    # Largest clusters first; ties broken by the best assessment similarity.
    out_clusters.sort(
        key=lambda c: (
            c["size"],
            c["antrag_matches"][0]["similarity"] if c["antrag_matches"] else 0,
        ),
        reverse=True,
    )
    return {
        "clusters": out_clusters,
        "n_total_news": len(fresh),
        "filter": filter_info,
    }
+
+
def aggregate_top_antraege_with_news(
    min_gwoe_score: float = 8.0,
    days_window: int = 14,
    min_similarity: float = 0.4,
    top_k_news: int = 5,
    db_path: Optional[Path] = None,
) -> dict:
    """Reverse view: highly GWÖ-rated assessments with their recent news.

    For every assessment with ``gwoe_score >= min_gwoe_score`` the news of
    the last ``days_window`` days are matched by embedding cosine
    similarity. Assessments without any matching news are still listed
    with ``news_count == 0``.

    Args:
        min_gwoe_score: Lower bound on ``gwoe_score`` for assessments.
        days_window: Only news whose ``datum`` lies within this many days
            before now are considered.
        min_similarity: Minimum cosine similarity for a news match.
        top_k_news: Maximum number of news entries returned per assessment
            (``news_count`` still reports all matches).
        db_path: Database file; falls back to ``settings.db_path``.

    Returns:
        Dict with ``antraege`` (assessments that have news first, then by
        number of news and by ``gwoe_score``, all descending) and ``filter``
        (echo of the parameters). If the database file does not exist,
        ``antraege`` is empty.
    """
    from .config import settings
    from . import embeddings as emb

    filter_info = {
        "min_gwoe_score": min_gwoe_score,
        "days_window": days_window,
        "min_similarity": min_similarity,
        "top_k_news": top_k_news,
    }

    path = db_path or settings.db_path
    if not Path(path).exists():
        # Same shape as the regular result so callers can always read "filter".
        return {"antraege": [], "filter": filter_info}

    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400

    # Assessments at or above the GWÖ score threshold.
    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "datum", "antrag_zusammenfassung"],
        where_extra=" AND gwoe_score >= ?",
        params=(min_gwoe_score,),
    )

    news_rows = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )

    # Keep only news inside the window and prepare their output fields once:
    # the tag JSON depends on the news row alone, not on the assessment it is
    # compared with. The parsed tag list is shared by every result entry that
    # refers to the same news article.
    news_cards = []
    for n in news_rows:
        try:
            ts = datetime.fromisoformat(n["datum"].replace("Z", "+00:00")).timestamp()
        except (ValueError, AttributeError):
            # Unparseable or missing date: the row cannot be placed in the window.
            continue
        if ts < cutoff:
            continue
        try:
            tags = json.loads(n["tags"]) if n["tags"] else []
        except (json.JSONDecodeError, TypeError):
            tags = []
        news_cards.append((n["_vec"], {
            "url": n["url"], "titel": n["titel"],
            "summary": n["summary"], "datum": n["datum"],
            "source": n["source"], "ressort": n["ressort"],
            "tags": tags,
        }))

    out = []
    for a in assessments:
        scored = []
        for vec, card in news_cards:
            sim = emb.cosine_similarity(a["_vec"], vec)
            if sim < min_similarity:
                continue
            scored.append({**card, "similarity": round(sim, 3)})
        scored.sort(key=lambda x: x["similarity"], reverse=True)
        out.append({
            "drucksache": a["drucksache"],
            "title": a["title"],
            "bundesland": a["bundesland"],
            "fraktionen": json.loads(a["fraktionen"] or "[]"),
            "gwoe_score": a["gwoe_score"],
            "empfehlung": a["empfehlung"],
            "datum": a["datum"],
            "antrag_zusammenfassung": a["antrag_zusammenfassung"],
            "news_count": len(scored),
            "top_news": scored[:top_k_news],
        })

    # Assessments with news first, then more news, then higher GWÖ score.
    out.sort(
        key=lambda x: (x["news_count"] > 0, x["news_count"], x["gwoe_score"] or 0),
        reverse=True,
    )
    return {
        "antraege": out,
        "filter": filter_info,
    }
diff --git a/tests/test_themen_matching.py b/tests/test_themen_matching.py
index 6a64c41..8881afa 100644
--- a/tests/test_themen_matching.py
+++ b/tests/test_themen_matching.py
@@ -10,8 +10,11 @@ from unittest.mock import patch
import pytest
from app.themen_matching import (
+ aggregate_news_cluster,
aggregate_themen_zeitreihe,
+ aggregate_top_antraege_with_news,
aggregate_top_themen,
+ compute_relevance,
find_anträge_for_news,
find_news_for_antrag,
)
@@ -276,6 +279,48 @@ class TestAggregateTopThemen:
# ─────────────────────────────────────────────────────────────────────────────
class TestComputeRelevance:
    """Unit tests for the score, level and reason produced by compute_relevance."""

    @staticmethod
    def _single_match(gwoe_score, similarity, fraktionen):
        # One-element match list with fixed drucksache/title, as used by the
        # single-match tests below.
        return [{
            "drucksache": "x", "title": "T", "fraktionen": fraktionen,
            "gwoe_score": gwoe_score, "similarity": similarity,
        }]

    def test_empty_returns_none_level(self):
        relevance = compute_relevance([])
        assert relevance["level"] == "none"
        assert relevance["score"] == 0.0

    def test_high_score_high_sim_high_level(self):
        # 8.0 * 0.6 = 4.8 -> high
        relevance = compute_relevance(self._single_match(8.0, 0.6, ["GRÜNE"]))
        assert relevance["level"] == "high"
        assert relevance["score"] == 4.8
        assert "GWÖ-8.0" in relevance["reason"]

    def test_low_score_low_level(self):
        # 3.0 * 0.5 = 1.5 -> low
        relevance = compute_relevance(self._single_match(3.0, 0.5, []))
        assert relevance["level"] == "low"

    def test_mid_level(self):
        # 6.0 * 0.5 = 3.0 -> mid
        relevance = compute_relevance(self._single_match(6.0, 0.5, []))
        assert relevance["level"] == "mid"

    def test_takes_best_match(self):
        weak = {"gwoe_score": 5.0, "similarity": 0.4, "title": "Schwach", "fraktionen": []}
        strong = {"gwoe_score": 9.0, "similarity": 0.55, "title": "Stark", "fraktionen": []}
        relevance = compute_relevance([weak, strong])
        # max(2.0, 4.95) = 4.95 -> high
        assert relevance["score"] == 4.95
        assert "Stark" in relevance["reason"]
+
+
class TestAggregateZeitreihe:
def test_structure(self, populated_db):
result = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
@@ -295,3 +340,115 @@ class TestAggregateZeitreihe:
result = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
for source in result["sources"]:
assert len(result["series"][source]) == len(result["buckets"])
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# aggregate_top_themen mit Relevance + only_relevant Filter
+# ─────────────────────────────────────────────────────────────────────────────
+
+
class TestRelevanceInTopThemen:
    """Tests for the relevance data and ordering added to aggregate_top_themen."""

    def test_each_bucket_has_relevance(self, populated_db):
        result = aggregate_top_themen(db_path=populated_db, min_similarity=0.5)
        for bucket in result["buckets"]:
            assert "relevance" in bucket
            for key in ("level", "score", "reason"):
                assert key in bucket["relevance"]

    def test_only_relevant_filters_out_low_or_none(self, populated_db):
        result = aggregate_top_themen(
            db_path=populated_db, min_similarity=0.0, only_relevant=True,
        )
        for bucket in result["buckets"]:
            assert bucket["relevance"]["level"] in ("high", "mid")

    def test_buckets_sorted_high_first(self, populated_db):
        result = aggregate_top_themen(db_path=populated_db, min_similarity=0.0)
        rank = {"high": 3, "mid": 2, "low": 1, "none": 0}
        ranks = [
            rank.get(bucket["relevance"]["level"], 0)
            for bucket in result["buckets"]
        ]
        # The level ranks must be non-increasing from first to last bucket.
        assert ranks == sorted(ranks, reverse=True)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# aggregate_news_cluster
+# ─────────────────────────────────────────────────────────────────────────────
+
+
class TestNewsCluster:
    """Tests for aggregate_news_cluster against the populated_db fixture."""

    def test_structure(self, populated_db):
        # A very strict intra_threshold (near-identical vectors only); this
        # test only checks the top-level keys of the result.
        result = aggregate_news_cluster(
            db_path=populated_db, min_cluster_size=2,
            intra_threshold=0.99,
        )
        assert "clusters" in result
        assert "n_total_news" in result

    def test_loose_threshold_creates_cluster(self, populated_db):
        # A very lax threshold should put almost everything into one cluster.
        result = aggregate_news_cluster(
            db_path=populated_db, min_cluster_size=2,
            intra_threshold=0.0, days_window=30,
        )
        # At least one cluster must exist; the former `>= 0` check could
        # never fail.
        # NOTE(review): assumes the fixture holds at least two news inside the
        # 30-day window with non-negative cosine similarity — confirm against
        # populated_db.
        assert len(result["clusters"]) >= 1
        for cluster in result["clusters"]:
            assert cluster["size"] >= 2
            assert "members" in cluster
            assert "antrag_matches" in cluster
            assert "top_tags" in cluster

    def test_min_cluster_size_filter(self, populated_db):
        result = aggregate_news_cluster(
            db_path=populated_db, min_cluster_size=5,
        )
        # Per the original author's note the fixture DB holds only 3 news
        # articles, so no cluster can reach size >= 5.
        assert result["clusters"] == []
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# aggregate_top_antraege_with_news
+# ─────────────────────────────────────────────────────────────────────────────
+
+
class TestTopAntraegeWithNews:
    """Tests for aggregate_top_antraege_with_news against the populated_db fixture."""

    def test_only_high_gwoe(self, populated_db):
        """Only assessments with gwoe_score >= min_gwoe_score may appear."""
        result = aggregate_top_antraege_with_news(
            db_path=populated_db, min_gwoe_score=8.0,
        )
        antraege = result["antraege"]
        for antrag in antraege:
            assert antrag["gwoe_score"] >= 8.0
        # Fixture scores per the original comment: 18/A = 8.0, 18/B = 7.0,
        # 18/C = 5.0 -> only 18/A passes the threshold.
        drucksachen = [antrag["drucksache"] for antrag in antraege]
        assert "18/A" in drucksachen
        assert "18/B" not in drucksachen
        assert "18/C" not in drucksachen

    def test_news_count_per_antrag(self, populated_db):
        result = aggregate_top_antraege_with_news(
            db_path=populated_db, min_gwoe_score=7.0, min_similarity=0.5,
            days_window=30,
        )
        # 18/A is expected to match news n1 (housing topic), so news_count >= 1.
        antrag_a = next(
            antrag for antrag in result["antraege"]
            if antrag["drucksache"] == "18/A"
        )
        assert antrag_a["news_count"] >= 1

    def test_sort_news_first(self, populated_db):
        result = aggregate_top_antraege_with_news(
            db_path=populated_db, min_gwoe_score=7.0, min_similarity=0.5,
            days_window=30,
        )
        # Every assessment with news must come before every one without:
        # the has-news flags have to be all True followed by all False.
        has_news = [antrag["news_count"] > 0 for antrag in result["antraege"]]
        assert has_news == sorted(has_news, reverse=True)