feat(#170): Aktuelle-Themen-Dashboard — News × Anträge × Pressemitteilungen

Vollständiges 4-Phasen-Feature:

**Phase 1 — News-Aggregator** (`app/news_aggregator.py`)
- Tagesschau-API (`/api2u/news?ressort=...`) für inland/ausland/wirtschaft/wissen
- Bundestag-RSS für aktuellethemen / pressemitteilungen / hib
- DB-Tabelle `news_articles` (URL-PK, idempotent)
- Embeddings via existierender qwen-v4-Pipeline
- Cron-Script `scripts/auto-fetch-news.sh`
- Bewusst NICHT: RND.de (robots.txt bannt explizit ClaudeBot, GPTBot,
  CCBot, ChatGPT-User, Google-Extended). Nur AI-erlaubende, öffentlich-
  rechtliche/parlamentarische Quellen
- Volltexte werden NICHT persistiert (nur Titel + erster Satz)

**Phase 2 — Themen × Anträge Matching** (`app/themen_matching.py`)
- News-Embedding × Assessment-summary_embedding via Cosine-Similarity
- `find_anträge_for_news`: pro News die Top-K passenden Anträge
- `find_news_for_antrag`: pro Antrag Top-K News mit Datums-Fenster (90d)
- `aggregate_top_themen`: primärer Dashboard-Endpoint
- `aggregate_themen_zeitreihe`: News-Volumen pro Tag × Source

**Phase 3 — Dashboard-View** (`/aktuelle-themen`)
- Neuer linker Nav-Eintrag „Aktuelle Themen"
- Stacked-Area-Chart News-Volumen pro Quelle (30d)
- Pro News-Card: Titel + Summary + Tags + Top-3-Antrags-Match-Liste
  mit GWÖ-Score-Pill, Drucksache-Link, PM-Vorschlag-Button
- Filter: Zeitfenster, Top-N, min_similarity
- Auth-protected (require_auth)

**Phase 4 — Pressemitteilungs-Generator** (`app/presse_generator.py`)
- LLM-Prompt-Template (200-250 Worte, GWÖ-Sicht, JSON-Output)
- Reuse von `QwenBewerter` aus app/adapters/qwen_bewerter.py
- DB-Tabelle `presse_drafts` (Persistenz)
- POST `/api/aktuelle-themen/generate-presse` rate-limited 5/min,
  auth-only (LLM-Kosten)
- GET `/api/aktuelle-themen/drafts` + `/drafts/{id}` für Liste/Detail
- Manueller Trigger via UI-Button, kein Auto-Versand
- Modal-Anzeige des generierten Texts

**Compliance:**
- robots.txt-respektierend (ClaudeBot-Bann von RND vermieden, AI-
  erlaubende Quellen verwendet)
- UI zeigt nur Titel+URL+Datum+erster Satz, keine Volltext-Reproduktion
- Pressemitteilungen sind explizit Drafts, nicht Auto-Versand
- LLM-Calls rate-limited, auth-only

**Tests:** 43 neue Tests (19 news_aggregator + 16 themen_matching +
8 presse_generator). Suite jetzt 1048 grün.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dotty Dotter 2026-05-03 12:39:36 +02:00
parent 1e381d23ab
commit d54ce23e42
11 changed files with 2358 additions and 0 deletions

View File

@ -285,6 +285,55 @@ async def init_db():
"ON plenum_vote_results(bundesland, drucksache)" "ON plenum_vote_results(bundesland, drucksache)"
) )
# News articles from public-service / parliamentary sources (#170 phase 1).
# Tagesschau API + Bundestag RSS — NO source material whose robots.txt bans
# AI crawlers (RND is explicitly excluded for that reason).
# Full texts are NOT persisted — only title + summary, used for
# embeddings and UI display (copyright).
await db.execute("""
CREATE TABLE IF NOT EXISTS news_articles (
url TEXT PRIMARY KEY,
titel TEXT NOT NULL,
summary TEXT,
datum TEXT NOT NULL,
source TEXT NOT NULL,
ressort TEXT,
tags TEXT,
summary_embedding BLOB,
embedding_model TEXT,
fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
)
""")
await db.execute(
"CREATE INDEX IF NOT EXISTS idx_news_datum "
"ON news_articles(datum)"
)
await db.execute(
"CREATE INDEX IF NOT EXISTS idx_news_source "
"ON news_articles(source)"
)
# Pressemitteilungs-Drafts (#170 Phase 4). LLM-generierte Vorschlaege,
# die einen Antrag in den Kontext eines News-Artikels stellen.
# Manueller Trigger, kein Auto-Versand.
await db.execute("""
CREATE TABLE IF NOT EXISTS presse_drafts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
drucksache TEXT NOT NULL,
bundesland TEXT NOT NULL,
news_url TEXT NOT NULL,
news_titel TEXT NOT NULL,
titel TEXT NOT NULL,
body TEXT NOT NULL,
model TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
)
""")
await db.execute(
"CREATE INDEX IF NOT EXISTS idx_presse_created "
"ON presse_drafts(created_at DESC)"
)
await db.commit() await db.commit()

View File

@ -2008,6 +2008,116 @@ async def auswertungen_page(request: Request, current_user: dict = Depends(requi
}) })
# ─── Aktuelle-Themen-Dashboard (#170) ──────────────────────────────────────
@app.get("/aktuelle-themen", response_class=HTMLResponse)
async def aktuelle_themen_page(
    request: Request, current_user: dict = Depends(require_auth)
):
    """Render the "Aktuelle Themen" dashboard page.

    The page combines news, matching motions and press-release drafts; it is
    only reachable for authenticated users (``require_auth``).
    """
    context = {
        "request": request,
        "app_name": settings.app_name,
        "v2_active_nav": "aktuelle-themen",
    }
    # Merged last so the shared v2 context can override the keys above,
    # exactly like the trailing ``**`` spread it replaces.
    context.update(_v2_template_context(current_user))
    return templates.TemplateResponse("v2/screens/aktuelle-themen.html", context)
@app.get("/api/aktuelle-themen/top")
async def api_aktuelle_themen_top(
    days: int = 7,
    top_k: int = 10,
    min_similarity: float = 0.4,
    matches_per_news: int = 3,
):
    """Return the top-K news items of the last ``days`` days with matching motions.

    All query parameters are forwarded unchanged to
    ``themen_matching.aggregate_top_themen``.
    """
    from .themen_matching import aggregate_top_themen as _aggregate

    params = {
        "days_window": days,
        "top_k": top_k,
        "min_similarity": min_similarity,
        "matches_per_news": matches_per_news,
    }
    return _aggregate(**params)
@app.get("/api/aktuelle-themen/zeitreihe")
async def api_aktuelle_themen_zeitreihe(days: int = 30):
    """Return the news volume per day and source (feeds the stacked-area chart)."""
    from .themen_matching import aggregate_themen_zeitreihe as _zeitreihe

    series = _zeitreihe(days_window=days)
    return series
@app.get("/api/aktuelle-themen/news-fuer-antrag")
async def api_news_fuer_antrag(
    drucksache: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    days: int = 90,
):
    """Return the top-K news items matching one motion (used by the detail view).

    The response echoes the requested ``drucksache`` next to the ``matches``
    produced by ``themen_matching.find_news_for_antrag``.
    """
    from .themen_matching import find_news_for_antrag as _find_news

    matches = _find_news(
        drucksache=drucksache,
        top_k=top_k,
        min_similarity=min_similarity,
        days_window=days,
    )
    return {"drucksache": drucksache, "matches": matches}
@app.get("/api/aktuelle-themen/anträge-fuer-news")
async def api_anträge_fuer_news(
    url: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
):
    """Return the top-K motions matching one news article.

    The response echoes the article URL as ``news_url`` next to the
    ``matches`` produced by ``themen_matching.find_anträge_for_news``.
    """
    from .themen_matching import find_anträge_for_news as _find_motions

    matches = _find_motions(
        news_url=url,
        top_k=top_k,
        min_similarity=min_similarity,
    )
    return {"news_url": url, "matches": matches}
# ─── Pressemitteilungs-Drafts (#170 Phase 4) ──────────────────────────
@app.post("/api/aktuelle-themen/generate-presse")
@limiter.limit("5/minute")
async def api_generate_presse(
    request: Request,
    drucksache: str,
    news_url: str,
    current_user: dict = Depends(require_auth),
):
    """Generate an LLM press-release draft for a motion/news pair.

    Auth-only and rate-limited (5/min) because every call costs an LLM
    request.

    Raises:
        HTTPException 404: ``generate_draft`` raised ``ValueError`` (its
            messages are application-authored and safe to show).
        HTTPException 500: any other failure. The full traceback is logged
            server-side; the client only receives a generic message so that
            internal details (paths, upstream API errors) are not leaked.
    """
    from .presse_generator import generate_draft

    try:
        return await generate_draft(drucksache=drucksache, news_url=news_url)
    except ValueError as e:
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        logger.exception("generate_draft failed")
        raise HTTPException(
            status_code=500,
            detail="Pressemitteilung konnte nicht generiert werden",
        ) from e
@app.get("/api/aktuelle-themen/drafts")
async def api_drafts_list(limit: int = 20):
    """Return the most recently generated press-release drafts.

    NOTE(review): unlike the generate endpoint, this route declares no
    ``require_auth`` dependency — confirm that exposing drafts is intended.
    """
    from .presse_generator import list_drafts as _list_drafts

    drafts = _list_drafts(limit=limit)
    return {"drafts": drafts}
@app.get("/api/aktuelle-themen/drafts/{draft_id}")
async def api_draft_detail(draft_id: int):
    """Return a single press-release draft, or 404 if it does not exist.

    NOTE(review): this route declares no ``require_auth`` dependency —
    confirm that exposing drafts is intended.
    """
    from .presse_generator import get_draft as _get_draft

    draft = _get_draft(draft_id)
    if draft:
        return draft
    raise HTTPException(status_code=404, detail="Draft nicht gefunden")
@app.get("/api/auswertungen/matrix") @app.get("/api/auswertungen/matrix")
async def auswertungen_matrix( async def auswertungen_matrix(
wahlperiode: Optional[str] = None, wahlperiode: Optional[str] = None,

347
app/news_aggregator.py Normal file
View File

@ -0,0 +1,347 @@
"""News-Aggregator fuer das Aktuelle-Themen-Dashboard (#170 Phase 1).
Fetcht regelmaessig News-Headlines aus AI-erlaubenden, oeffentlich-rechtlichen
oder parlamentarischen Quellen:
- **Tagesschau-API** (https://www.tagesschau.de/api2u/news/) strukturiertes
JSON mit ressort, tags, firstSentence pro Artikel.
- **Bundestag-Aktuellethemen-RSS**
(https://www.bundestag.de/static/appdata/includes/rss/aktuellethemen.rss)
RSS mit Titel + Beschreibung pro Artikel.
**Bewusst NICHT verwendet:** RND.de (robots.txt bannt explizit ClaudeBot,
GPTBot, ChatGPT-User, CCBot, Google-Extended). RSS-Feeds privat-publizierter
Verlage werden nur dann angebunden, wenn AI-Verarbeitung explizit erlaubt ist.
**Compliance:**
- Volltexte werden NICHT persistiert. Nur Titel + erster Satz / Description.
- Kein User-Agent, der einen AI-Bot vortaeuscht (kein "ClaudeBot").
- Rate-Limiting: 1 Request pro Quelle pro Aufruf (kein Loop, kein Hammer).
Datenbank-Tabelle ``news_articles`` (siehe app/database.py):
url PK, titel, summary, datum (ISO), source, ressort, tags JSON,
summary_embedding BLOB, embedding_model, fetched_at.
"""
from __future__ import annotations
import json
import logging
import re
import urllib.error
import urllib.request
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)

# Self-identifying User-Agent sent with every request made by this module.
USER_AGENT = "GWOeAntragspruefer/1.0 (+https://gwoe.toppyr.de)"
# Passed as ``timeout`` to ``urllib.request.urlopen`` (seconds).
TIMEOUT = 20

# ─────────────────────────────────────────────────────────────────────────────
# Sources
# ─────────────────────────────────────────────────────────────────────────────
TAGESSCHAU_API = "https://www.tagesschau.de/api2u/news"
# Political Tagesschau sections — sport/panorama are left out because they
# rarely match parliamentary motions.
TAGESSCHAU_RESSORTS = ["inland", "ausland", "wirtschaft", "wissen"]
# Maps the ``source`` label stored with each article to its RSS feed URL.
BUNDESTAG_RSS = {
    "bundestag-aktuell": (
        "https://www.bundestag.de/static/appdata/includes/rss/aktuellethemen.rss"
    ),
    "bundestag-presse": (
        "https://www.bundestag.de/static/appdata/includes/rss/pressemitteilungen.rss"
    ),
    "bundestag-hib": (
        "https://www.bundestag.de/static/appdata/includes/rss/hib.rss"
    ),
}
# ─────────────────────────────────────────────────────────────────────────────
# HTTP-Helper
# ─────────────────────────────────────────────────────────────────────────────
def _http_get(url: str) -> Optional[bytes]:
    """Fetch ``url`` via HTTP GET and return the raw response body.

    The request carries the module's self-identifying ``USER_AGENT`` and is
    bounded by ``TIMEOUT`` seconds. Transport-level failures are logged as a
    warning and reported as ``None`` so that a single unreachable source
    does not abort the whole aggregator run.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response body as bytes, or ``None`` if the request failed.
    """
    # Local import: only needed for the exception type caught below.
    import http.client

    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            return r.read()
    # URLError, HTTPError and TimeoutError are all OSError subclasses; OSError
    # additionally covers connection resets while reading the body.
    # http.client.HTTPException covers truncated responses (IncompleteRead),
    # which urllib does not wrap in URLError.
    except (OSError, http.client.HTTPException) as e:
        # Separator added: URL and error text were previously fused together.
        logger.warning("news fetch failed: %s: %s", url, e)
        return None
def _strip_html(text: str) -> str:
"""Entfernt HTML-Tags + CDATA fuer Plaintext-Summaries."""
if not text:
return ""
text = re.sub(r"<!\[CDATA\[(.*?)\]\]>", r"\1", text, flags=re.DOTALL)
text = re.sub(r"<[^>]+>", " ", text)
text = text.replace("&amp;", "&").replace("&nbsp;", " ").replace("&quot;", '"')
return re.sub(r"\s+", " ", text).strip()
# ─────────────────────────────────────────────────────────────────────────────
# Parser
# ─────────────────────────────────────────────────────────────────────────────
def fetch_tagesschau(ressorts: Optional[list[str]] = None) -> list[dict]:
    """Fetch headline metadata from the Tagesschau API.

    One request is made per ressort. Articles that appear in several
    ressorts are returned once (de-duplicated by link). Article full texts
    (``content``) are deliberately not taken over — only the API's
    ``firstSentence`` is used as summary.

    Args:
        ressorts: Ressort names to query; defaults to ``TAGESSCHAU_RESSORTS``.

    Returns:
        A list of dicts with the keys ``url``, ``titel``, ``summary``,
        ``datum``, ``source`` (always ``"tagesschau"``), ``ressort`` and
        ``tags``. A ressort whose request fails or whose payload cannot be
        parsed contributes nothing.
    """
    from urllib.parse import quote

    ressorts = ressorts or TAGESSCHAU_RESSORTS
    out: list[dict] = []
    seen: set[str] = set()
    for ressort in ressorts:
        # Encode the ressort so an unusual caller-supplied value cannot
        # break the query string (a no-op for the default ressorts).
        url = f"{TAGESSCHAU_API}?ressort={quote(str(ressort), safe='')}"
        raw = _http_get(url)
        if not raw:
            continue
        try:
            data = json.loads(raw.decode("utf-8"))
        except ValueError:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            logger.warning("tagesschau JSON parse failed: %s", url)
            continue
        if not isinstance(data, dict):
            # Unexpected top-level shape (e.g. a JSON list): skip the ressort
            # instead of crashing on ``.get``.
            logger.warning("tagesschau JSON parse failed: %s", url)
            continue
        for item in data.get("news") or []:
            if not isinstance(item, dict):
                continue
            link = item.get("shareURL") or item.get("detailsweb")
            if not link or link in seen:
                continue
            seen.add(link)
            titel = (item.get("title") or "").strip()
            if not titel:
                continue
            summary = (item.get("firstSentence") or "").strip()
            datum = item.get("date") or ""
            tags = [
                t.get("tag")
                for t in (item.get("tags") or [])
                if isinstance(t, dict) and t.get("tag")
            ]
            out.append({
                "url": link,
                "titel": titel,
                "summary": summary,
                "datum": datum,
                "source": "tagesschau",
                "ressort": item.get("ressort") or ressort,
                "tags": tags,
            })
    return out
# Regex-based extraction of RSS 2.0 item fields. Every pattern captures the
# element's inner text in group 1.
# Only a bare ``<item>`` opening tag (no attributes) is matched.
_RSS_ITEM_RE = re.compile(r"<item>(.*?)</item>", re.DOTALL)
_RSS_TITLE_RE = re.compile(r"<title>(.*?)</title>", re.DOTALL)
# No DOTALL here: link and pubDate only match when they sit on one line.
_RSS_LINK_RE = re.compile(r"<link>(.*?)</link>")
_RSS_DESC_RE = re.compile(r"<description>(.*?)</description>", re.DOTALL)
_RSS_PUB_RE = re.compile(r"<pubDate>(.*?)</pubDate>")
def _parse_rss_date(s: str) -> str:
"""Konvertiere RSS-pubDate (RFC 822) → ISO-8601-Datum."""
if not s:
return ""
try:
dt = parsedate_to_datetime(s.strip())
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt.astimezone(timezone.utc).isoformat()
except (TypeError, ValueError):
return ""
def fetch_rss(source: str, url: str, max_items: int = 50) -> list[dict]:
    """Fetch an RSS 2.0 feed and return its items as article dicts.

    The result has the same shape as ``fetch_tagesschau``: ``ressort`` is
    always ``None`` and ``tags`` always empty. At most ``max_items`` items
    are inspected; items without a title or link are dropped.
    """
    payload = _http_get(url)
    if not payload:
        return []

    def _inner(pattern, chunk):
        # Inner text of the first match, or "" when the element is absent.
        found = pattern.search(chunk)
        return found.group(1) if found else ""

    document = payload.decode("utf-8", errors="replace")
    articles: list[dict] = []
    for chunk in _RSS_ITEM_RE.findall(document)[:max_items]:
        titel = _strip_html(_inner(_RSS_TITLE_RE, chunk))
        link = _strip_html(_inner(_RSS_LINK_RE, chunk))
        if not (titel and link):
            continue
        articles.append({
            "url": link,
            "titel": titel,
            "summary": _strip_html(_inner(_RSS_DESC_RE, chunk)),
            "datum": _parse_rss_date(_inner(_RSS_PUB_RE, chunk)),
            "source": source,
            "ressort": None,
            "tags": [],
        })
    return articles
def fetch_all() -> list[dict]:
    """Collect articles from every configured source (Tagesschau first, then RSS).

    No caching and no automatic retry happen here.
    """
    collected: list[dict] = list(fetch_tagesschau())
    for label, feed_url in BUNDESTAG_RSS.items():
        collected += fetch_rss(label, feed_url)
    return collected
# ─────────────────────────────────────────────────────────────────────────────
# DB-Persistierung
# ─────────────────────────────────────────────────────────────────────────────
def upsert_articles(
    articles: list[dict],
    db_path: Optional[Path] = None,
    embed: bool = True,
) -> dict:
    """Insert or update news articles in the ``news_articles`` table.

    Idempotent via the URL primary key. Existing rows get fresh metadata and
    a new ``fetched_at``; their embedding columns are left untouched so that
    a cron run does not re-embed known articles.

    Articles lacking ``url``, ``titel`` or ``source`` are skipped with a
    warning instead of aborting (and rolling back) the whole batch.

    Args:
        articles: Dicts as produced by the fetchers in this module.
        db_path: SQLite file to write to; defaults to ``settings.db_path``.
        embed: When true, ``embed_pending_articles`` runs afterwards.

    Returns:
        ``{"inserted": int, "updated": int, "embedded": int}``. All zeros if
        the database file does not exist.
    """
    import sqlite3
    from contextlib import closing

    if db_path:
        path = db_path
    else:
        # Project settings are only needed when no explicit path was given.
        from .config import settings
        path = settings.db_path
    if not Path(path).exists():
        return {"inserted": 0, "updated": 0, "embedded": 0}

    inserted = 0
    updated = 0
    embedded = 0
    # ``closing`` guarantees close(); the inner ``with conn`` commits on
    # success and rolls back if any statement raises.
    with closing(sqlite3.connect(str(path))) as conn, conn:
        for art in articles:
            url = art.get("url")
            titel = art.get("titel")
            source = art.get("source")
            if not url or titel is None or source is None:
                logger.warning("upsert_articles: skipping malformed article %r", url)
                continue
            values = (
                titel,
                art.get("summary") or "",
                art.get("datum") or "",
                source,
                art.get("ressort"),
                json.dumps(art.get("tags") or []),
            )
            exists = conn.execute(
                "SELECT 1 FROM news_articles WHERE url=?", (url,)
            ).fetchone()
            if exists is None:
                conn.execute(
                    """INSERT INTO news_articles
                       (url, titel, summary, datum, source, ressort, tags, fetched_at)
                       VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))""",
                    (url, *values),
                )
                inserted += 1
            else:
                conn.execute(
                    """UPDATE news_articles
                       SET titel=?, summary=?, datum=?, source=?, ressort=?, tags=?,
                           fetched_at=datetime('now')
                       WHERE url=?""",
                    (*values, url),
                )
                updated += 1

    if embed:
        embedded = embed_pending_articles(db_path=db_path)
    return {"inserted": inserted, "updated": updated, "embedded": embedded}
def embed_pending_articles(
    db_path: Optional[Path] = None,
    limit: int = 100,
) -> int:
    """Create embeddings for news articles that have no ``summary_embedding``.

    The embedded text is title, summary and tags joined by newlines. If the
    embedding call fails for an article, it is logged and skipped; the row
    stays pending and is picked up by a later run.

    Each embedding is committed immediately. This keeps write transactions
    short (no write lock is held while waiting for the embedding API) and
    ensures finished embeddings survive a failure on a later row.

    Args:
        db_path: SQLite file; defaults to ``settings.db_path``.
        limit: Maximum number of pending articles handled per call, newest
            ``datum`` first.

    Returns:
        Number of articles that received an embedding in this call.
    """
    import sqlite3
    from contextlib import closing
    from .config import settings
    from . import embeddings as emb

    path = db_path or settings.db_path
    if not Path(path).exists():
        return 0

    embedded = 0
    with closing(sqlite3.connect(str(path))) as conn:
        rows = conn.execute(
            """SELECT url, titel, summary, tags FROM news_articles
               WHERE summary_embedding IS NULL ORDER BY datum DESC LIMIT ?""",
            (limit,),
        ).fetchall()
        for url, titel, summary, tags_raw in rows:
            try:
                tags = json.loads(tags_raw) if tags_raw else []
            except (json.JSONDecodeError, TypeError):
                tags = []
            parts = [titel or ""]
            if summary:
                parts.append(summary)
            if tags and isinstance(tags, list):
                # str() guards against non-string JSON values in the tag list.
                parts.append(", ".join(str(t) for t in tags))
            text = "\n".join(p for p in parts if p).strip()
            if not text:
                continue
            try:
                vec = emb.create_embedding(text, model=emb.EMBEDDING_MODEL)
            except Exception:
                logger.exception("embed_pending_articles: API error for %s", url)
                continue
            conn.execute(
                """UPDATE news_articles
                   SET summary_embedding=?, embedding_model=?
                   WHERE url=?""",
                (json.dumps(vec).encode(), emb.EMBEDDING_MODEL, url),
            )
            # Commit per article: see the docstring for the rationale.
            conn.commit()
            embedded += 1
    return embedded
def run_aggregator(db_path: Optional[Path] = None, embed: bool = True) -> dict:
    """Top-level entry point: fetch all sources, persist them, then embed.

    Intended for cron use; returns the statistics dict produced by
    ``upsert_articles``.
    """
    fetched = fetch_all()
    stats = upsert_articles(fetched, db_path=db_path, embed=embed)
    return stats

256
app/presse_generator.py Normal file
View File

@ -0,0 +1,256 @@
"""Pressemitteilungs-Generator fuer #170 Phase 4.
Erzeugt einen LLM-generierten Pressemitteilungs-Vorschlag, der einen
GWÖ-bewerteten Antrag in den Kontext eines aktuellen News-Artikels stellt.
Manueller Trigger via UI-Button kein Auto-Versand. Drafts werden in
``presse_drafts`` persistiert und in der UI als Liste sichtbar.
Tonalitaet:
- GWÖ-Sicht (Gemeinwohl-orientiert, nicht parteipolitisch)
- Faktenbasiert, keine Lobbying-Sprache
- 200-250 Worte, presseaehnlicher Aufbau (Lead-Paragraph + Begruendung)
"""
from __future__ import annotations
import json
import logging
import sqlite3
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
# System prompt (German) for the press-release LLM call in generate_draft.
# It fixes tone and length and demands a JSON object with the keys "titel"
# and "body", which generate_draft reads from the response.
SYSTEM_PROMPT = """Du bist ein politischer Redakteur, der für eine
Gemeinwohl-Ökonomie-Initiative Pressemitteilungen schreibt. Deine Stil-
Richtlinien:
- 200-250 Worte
- Sachlicher, präziser Stil keine Werbesprache, keine Polemik
- Faktenbasiert: Daten aus dem Antrag und dem News-Kontext explizit nennen
- GWÖ-Werte (Würde, Solidarität, Nachhaltigkeit, Gerechtigkeit, Demokratie)
als Bewertungsmaßstab nicht parteipolitische Linie
- Klare Struktur: Titel, Lead-Paragraph (Wer? Was? Wann? Warum jetzt?),
Begründung mit Bezug auf GWÖ-Bewertung, Schluss mit Forderung oder
Einladung zum Dialog
- Niemals den Anbieter der News-Quelle (Tagesschau, Bundestag) zitieren
nur den Sachverhalt aufgreifen, der dort beschrieben ist
Antworte NUR mit gültigem JSON in dieser Struktur:
{
"titel": "<knackiger Titel, max 100 Zeichen>",
"body": "<Pressemitteilungs-Volltext, 200-250 Wörter>"
}"""
def _build_user_prompt(
    drucksache: str,
    bundesland: str,
    antrag_titel: str,
    antrag_zusammenfassung: str,
    gwoe_score: float,
    gwoe_begruendung: str,
    empfehlung: str,
    news_titel: str,
    news_summary: str,
    news_url: str,
) -> str:
    """Build the user prompt from motion and news data.

    The values are interpolated into a fixed German template with three
    sections (motion, news context, task). Empty values for the motion
    summary, the GWÖ reasoning, the recommendation and the news summary are
    replaced by a German placeholder text.

    Returns:
        The rendered prompt string.
    """
    return f"""## Aktueller Antrag
Drucksache: {drucksache} ({bundesland})
Titel: {antrag_titel}
Zusammenfassung: {antrag_zusammenfassung or "(keine vorhanden)"}
GWÖ-Score: {gwoe_score}/10
GWÖ-Begründung: {gwoe_begruendung or "(keine vorhanden)"}
Empfehlung: {empfehlung or "(keine)"}
## Aktueller Nachrichten-Kontext
Schlagzeile: {news_titel}
Inhalt: {news_summary or "(keine Zusammenfassung verfügbar)"}
Quelle: {news_url}
## Deine Aufgabe
Schreibe eine Pressemitteilung, die diesen Antrag in den Kontext der
aktuellen Nachrichtenlage stellt. Begründe aus GWÖ-Sicht, warum der
Antrag gerade jetzt relevant ist (oder warum er die aktuelle Debatte
ergänzt/korrigiert). Wenn der GWÖ-Score niedrig ist (< 5), sei dabei
kritisch die PM kann auch eine Ablehnung des Antrags begründen.
"""
async def generate_draft(
    drucksache: str,
    news_url: str,
    db_path: Optional[Path] = None,
    bewerter=None,
) -> dict:
    """Generate a press-release draft via the LLM and persist it.

    Args:
        drucksache: ID of the motion; looked up in ``assessments``.
        news_url: URL of the news article; looked up in ``news_articles``.
        db_path: Optional database override (for tests).
        bewerter: Optional injected bewerter (for tests). If ``None``, a
            default ``QwenBewerter`` is instantiated.

    Returns:
        ``{"id": int, "drucksache": ..., "bundesland": ...,
        "news_url": ..., "news_titel": ...,
        "titel": str, "body": str, "model": str, "created_at": ...}``

    Raises:
        ValueError: if the motion or the news article is not found, or if
            the LLM response is not a JSON object with non-empty string
            ``titel`` and ``body``.
    """
    from contextlib import closing
    from .config import settings
    from .adapters.qwen_bewerter import LlmRequest

    path = db_path or settings.db_path
    with closing(sqlite3.connect(str(path))) as conn:
        # NOTE(review): the lookup uses ``drucksache`` alone and takes the
        # first row; presumably numbers can repeat across Bundesländer —
        # confirm against the assessments schema.
        antrag = conn.execute(
            """SELECT bundesland, title, antrag_zusammenfassung, gwoe_score,
                      gwoe_begruendung, empfehlung
               FROM assessments WHERE drucksache=?""",
            (drucksache,),
        ).fetchone()
        news = conn.execute(
            "SELECT titel, summary FROM news_articles WHERE url=?",
            (news_url,),
        ).fetchone()
    if not antrag:
        raise ValueError(f"Drucksache {drucksache} nicht in assessments")
    if not news:
        raise ValueError(f"News-URL {news_url} nicht in news_articles")

    user_prompt = _build_user_prompt(
        drucksache=drucksache,
        bundesland=antrag[0],
        antrag_titel=antrag[1] or "",
        antrag_zusammenfassung=antrag[2] or "",
        gwoe_score=antrag[3] or 0.0,
        gwoe_begruendung=antrag[4] or "",
        empfehlung=antrag[5] or "",
        news_titel=news[0],
        news_summary=news[1] or "",
        news_url=news_url,
    )

    if bewerter is None:
        from .adapters.qwen_bewerter import QwenBewerter
        bewerter = QwenBewerter()
    req = LlmRequest(
        system_prompt=SYSTEM_PROMPT,
        user_prompt=user_prompt,
        model=settings.llm_model_default,
        base_temperature=0.3,
        max_tokens=1500,
        max_retries=2,
    )
    result = await bewerter.bewerte(req)

    # Validate the response shape first: a non-dict result or non-string
    # fields must end in the documented ValueError, not an AttributeError.
    raw_titel = result.get("titel") if isinstance(result, dict) else None
    raw_body = result.get("body") if isinstance(result, dict) else None
    titel = raw_titel.strip()[:200] if isinstance(raw_titel, str) else ""
    body = raw_body.strip() if isinstance(raw_body, str) else ""
    if not titel or not body:
        raise ValueError("LLM-Response unvollständig (titel oder body leer)")

    # Persist the draft and read it back to obtain id and created_at.
    # ``with conn`` commits on success and rolls back on error.
    with closing(sqlite3.connect(str(path))) as conn, conn:
        cur = conn.execute(
            """INSERT INTO presse_drafts
               (drucksache, bundesland, news_url, news_titel, titel, body, model)
               VALUES (?, ?, ?, ?, ?, ?, ?)""",
            (drucksache, antrag[0], news_url, news[0], titel, body,
             settings.llm_model_default),
        )
        draft_id = cur.lastrowid
        row = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts WHERE id=?""",
            (draft_id,),
        ).fetchone()
    return {
        "id": row[0], "drucksache": row[1], "bundesland": row[2],
        "news_url": row[3], "news_titel": row[4],
        "titel": row[5], "body": row[6], "model": row[7],
        "created_at": row[8],
    }
def list_drafts(
    limit: int = 20,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the most recently generated drafts, newest first.

    Args:
        limit: Maximum number of drafts to return (default 20). The value
            is clamped to ``0..200``: SQLite treats a negative LIMIT as
            "no limit", which would otherwise return the whole table.
        db_path: Optional database override; defaults to
            ``settings.db_path``.

    Returns:
        A list of dicts with the keys ``id``, ``drucksache``,
        ``bundesland``, ``news_url``, ``news_titel``, ``titel``, ``body``,
        ``model`` and ``created_at``. Empty if the database file does not
        exist.
    """
    from contextlib import closing

    if db_path:
        path = db_path
    else:
        # Project settings are only needed when no explicit path was given.
        from .config import settings
        path = settings.db_path
    if not Path(path).exists():
        return []

    max_limit = 200  # every row carries the full draft body
    bounded = max(0, min(limit, max_limit))
    with closing(sqlite3.connect(str(path))) as conn:
        # sqlite3.Row lets each row convert straight into a dict keyed by
        # the selected column names, in SELECT order.
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts
               ORDER BY id DESC LIMIT ?""",
            (bounded,),
        ).fetchall()
    return [dict(r) for r in rows]
def get_draft(
    draft_id: int,
    db_path: Optional[Path] = None,
) -> Optional[dict]:
    """Fetch a single draft by its ID.

    Returns ``None`` if the database file does not exist or no draft has
    the given ID; otherwise a dict keyed by the selected column names.
    """
    from .config import settings

    columns = (
        "id", "drucksache", "bundesland", "news_url", "news_titel",
        "titel", "body", "model", "created_at",
    )
    path = db_path or settings.db_path
    if not Path(path).exists():
        return None
    conn = sqlite3.connect(str(path))
    try:
        record = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts WHERE id=?""",
            (draft_id,),
        ).fetchone()
    finally:
        conn.close()
    # Keys are paired with the values in SELECT order.
    return dict(zip(columns, record)) if record else None

View File

@ -56,6 +56,7 @@
<div class="v2-nav-group"> <div class="v2-nav-group">
<div class="v2-nav-label">— Daten</div> <div class="v2-nav-label">— Daten</div>
<a href="/auswertungen" class="v2-nav-item {% if v2_active_nav == 'auswertungen' %}active{% endif %}">{{ icon("chart-bar", 14) }} Auswertungen</a> <a href="/auswertungen" class="v2-nav-item {% if v2_active_nav == 'auswertungen' %}active{% endif %}">{{ icon("chart-bar", 14) }} Auswertungen</a>
<a href="/aktuelle-themen" class="v2-nav-item {% if v2_active_nav == 'aktuelle-themen' %}active{% endif %}">{{ icon("book-open", 14) }} Aktuelle Themen</a>
<a href="/api/auswertungen/export.csv" class="v2-nav-item">{{ icon("file-csv", 14) }} Export · API</a> <a href="/api/auswertungen/export.csv" class="v2-nav-item">{{ icon("file-csv", 14) }} Export · API</a>
<a href="/v2/feed" class="v2-nav-item {% if v2_active_nav == 'feed' %}active{% endif %}">{{ icon("rss", 14) }} Atom-Feed</a> <a href="/v2/feed" class="v2-nav-item {% if v2_active_nav == 'feed' %}active{% endif %}">{{ icon("rss", 14) }} Atom-Feed</a>
<a href="/v2/abos" class="v2-nav-item {% if v2_active_nav == 'abos' %}active{% endif %}">{{ icon("envelope-simple", 14) }} Meine Abos</a> <a href="/v2/abos" class="v2-nav-item {% if v2_active_nav == 'abos' %}active{% endif %}">{{ icon("envelope-simple", 14) }} Meine Abos</a>

View File

@ -0,0 +1,417 @@
{% extends "v2/base.html" %}
{% block title %}Aktuelle Themen — GWÖ-Antragsprüfer{% endblock %}
{% set v2_active_nav = "aktuelle-themen" %}
{% block head_extra %}
<script src="/static/chart.umd.min.js"></script>
<style>
.at-controls {
display: flex;
gap: 8px;
align-items: center;
flex-wrap: wrap;
margin-bottom: 1rem;
font-family: var(--font-mono);
font-size: 11px;
}
.at-controls select, .at-controls input[type="number"] {
font-family: var(--font-mono);
font-size: 11px;
padding: 5px 8px;
border: 1px solid var(--ecg-border);
border-radius: 3px;
background: var(--ecg-card-bg);
color: var(--ecg-dark);
}
.at-controls button {
font-family: var(--font-mono);
font-size: 11px;
padding: 5px 12px;
border: 1px solid var(--ecg-border);
border-radius: 3px;
cursor: pointer;
background: var(--ecg-teal);
color: #fff;
}
.at-news-card {
background: var(--ecg-card-bg);
border: 1px solid var(--ecg-border);
border-radius: 6px;
padding: 14px 16px;
margin-bottom: 14px;
}
.at-news-head {
font-family: var(--font-mono);
font-size: 10px;
text-transform: uppercase;
letter-spacing: 0.05em;
opacity: 0.6;
margin-bottom: 4px;
}
.at-news-title {
font-family: var(--font-display);
font-size: 15px;
color: var(--ecg-teal);
margin: 0 0 6px;
line-height: 1.3;
}
.at-news-title a { color: inherit; text-decoration: none; }
.at-news-title a:hover { text-decoration: underline; }
.at-news-summary {
font-size: 12px;
line-height: 1.5;
margin: 0 0 10px;
opacity: 0.85;
}
.at-news-tags {
font-family: var(--font-mono);
font-size: 10px;
opacity: 0.55;
margin-bottom: 8px;
}
.at-tag {
display: inline-block;
padding: 1px 6px;
background: var(--ecg-bg-subtle);
border-radius: 3px;
margin-right: 4px;
}
.at-matches {
border-top: 1px solid var(--ecg-border);
margin-top: 10px;
padding-top: 10px;
}
.at-matches-label {
font-family: var(--font-mono);
font-size: 10px;
text-transform: uppercase;
letter-spacing: 0.05em;
opacity: 0.6;
margin-bottom: 6px;
}
.at-match {
display: flex;
align-items: center;
gap: 10px;
padding: 5px 0;
font-size: 12px;
border-bottom: 1px dotted var(--ecg-border);
}
.at-match:last-child { border-bottom: none; }
.at-score-pill {
display: inline-block;
padding: 1px 7px;
border-radius: 10px;
font-family: var(--font-mono);
font-size: 10px;
font-weight: 700;
background: var(--ecg-bg-subtle);
min-width: 28px;
text-align: center;
}
.at-score-pill.s-high { background: rgba(136,158,51,0.25); color: #44570a; }
.at-score-pill.s-mid { background: rgba(247,148,29,0.18); color: #875e10; }
.at-score-pill.s-low { background: rgba(200,0,0,0.15); color: #931515; }
.at-sim {
font-family: var(--font-mono);
font-size: 10px;
opacity: 0.5;
}
.at-presse-btn {
background: var(--ecg-card-bg);
color: var(--ecg-teal);
border: 1px solid var(--ecg-teal);
border-radius: 3px;
font-family: var(--font-mono);
font-size: 10px;
padding: 3px 8px;
cursor: pointer;
margin-left: auto;
}
.at-presse-btn:hover { background: var(--ecg-teal); color: #fff; }
</style>
{% endblock %}
{% block main %}
<div style="padding:0 0 1.5rem;">
<h1 style="font-family:var(--font-display);font-size:22px;color:var(--ecg-teal);margin:0 0 4px;">Aktuelle Themen</h1>
<p style="font-size:12px;font-family:var(--font-mono);color:var(--ecg-dark);opacity:0.6;">
Tagesschau + Bundestag-RSS · gematcht mit deinen Anträgen ·
Pressemitteilungs-Vorschläge
</p>
</div>
<div class="v2-kasten outline-blue" style="margin-bottom:1rem;">
<p style="font-size:12px;line-height:1.5;margin:0 0 0.5rem;">
Die täglich aktuellen politischen Top-Themen aus
<strong>öffentlich-rechtlichen + parlamentarischen Quellen</strong>
(Tagesschau-API + Bundestag-RSS) werden semantisch mit den von dir
bewerteten Anträgen verschnitten. Pro News-Artikel siehst du die
GWÖ-Bewertung der dazu passendsten Anträge — und kannst per Klick
eine Pressemitteilung generieren lassen.
</p>
<p style="font-size:11px;line-height:1.5;opacity:0.75;margin:0;">
Bewusst <strong>nicht</strong> verwendet: Quellen mit AI-Bann in
robots.txt (z.B. RND.de). Die UI zeigt nur Titel + URL + erste Sätze
— Volltexte werden nicht persistiert.
</p>
</div>
<div class="at-controls">
<label for="at-days">Zeitfenster:</label>
<select id="at-days" onchange="loadThemen()">
<option value="3">3 Tage</option>
<option value="7" selected>7 Tage</option>
<option value="14">14 Tage</option>
<option value="30">30 Tage</option>
</select>
<label for="at-topk">Top-N News:</label>
<input type="number" id="at-topk" value="15" min="3" max="50" style="width:60px;" onchange="loadThemen()" />
<label for="at-minsim">Min. Similarity:</label>
<select id="at-minsim" onchange="loadThemen()">
<option value="0.30">0.30 (locker)</option>
<option value="0.40" selected>0.40 (default)</option>
<option value="0.50">0.50 (streng)</option>
</select>
<button onclick="loadThemen()">Aktualisieren</button>
</div>
<!-- News-Volumen-Chart -->
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:1.5rem 0 0.5rem;">
News-Volumen pro Quelle (letzte 30 Tage)
</h3>
<div class="matrix-wrap" style="background:var(--ecg-card-bg);border:1px solid var(--ecg-border);border-radius:4px;padding:14px;">
<canvas id="at-zeitreihe-chart" style="max-height:280px;"></canvas>
</div>
<div id="at-zeitreihe-meta" class="meta-line" style="font-family:var(--font-mono);font-size:11px;opacity:0.6;margin:8px 0 1.5rem;"></div>
<!-- Top-Themen + Matches -->
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:1.5rem 0 0.5rem;">
Top-Themen × passende Anträge
</h3>
<div id="at-themen-list">
<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Lade …</div>
</div>
<!-- Drafts-Liste -->
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:2rem 0 0.5rem;">
Pressemitteilungs-Entwürfe (zuletzt generiert)
</h3>
<div id="at-drafts-list">
<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Lade Entwürfe …</div>
</div>
<!-- Modal für Draft-Anzeige -->
<div class="v2-modal-backdrop" id="at-modal-backdrop" onclick="atCloseModal(event)" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,0.45);z-index:500;align-items:center;justify-content:center;">
<div class="v2-modal" onclick="event.stopPropagation()" style="background:var(--ecg-card-bg);border-radius:6px;padding:20px 24px;max-width:680px;width:90%;max-height:80vh;overflow-y:auto;position:relative;">
<button class="v2-modal-close" onclick="atCloseModal()" style="position:absolute;top:12px;right:14px;background:none;border:none;font-size:18px;cursor:pointer;opacity:0.5;">&times;</button>
<h2 id="at-modal-title" style="font-family:var(--font-display);font-size:16px;color:var(--ecg-teal);margin:0 0 12px;">Pressemitteilung</h2>
<div id="at-modal-body" style="font-size:13px;line-height:1.5;">Generiere …</div>
</div>
</div>
{% endblock %}
{% block body_scripts %}
<script>
// Chart.js instance of the news-volume chart (null until first render);
// loadZeitreihe() destroys the previous instance before creating a new one.
let _atZeitreiheChart = null;
/**
 * Map a GWÖ score to the CSS modifier class of the score pill.
 *
 * @param {?number} score - Score value; null/undefined means "no score".
 * @returns {string} '' for a missing score, otherwise 's-high' (>= 7),
 *   's-mid' (>= 4) or 's-low'.
 */
function atScoreClass(score) {
    if (score === null || score === undefined) {
        return '';
    }
    return score >= 7 ? 's-high' : (score >= 4 ? 's-mid' : 's-low');
}
/**
 * Reduce a timestamp string to its first 10 characters (the date part of
 * an ISO-style value).
 *
 * @param {?string} s - Timestamp string.
 * @returns {string} The first 10 characters, or '' when the value is
 *   empty/missing or shorter than 10 characters.
 */
function atFmtDatum(s) {
    const hasFullDate = Boolean(s) && !(s.length < 10);
    return hasFullDate ? s.slice(0, 10) : '';
}
/**
 * Load the top news items together with their matching motions and render
 * them as cards into #at-themen-list.
 *
 * Reads the filter controls #at-days, #at-topk and #at-minsim, queries
 * /api/aktuelle-themen/top and replaces the list content. Every value that
 * comes from the API is HTML-escaped before it is written via innerHTML,
 * link targets are limited to http(s) URLs, and the "PM-Vorschlag" button
 * passes its arguments through data-* attributes instead of an inline
 * JavaScript string. HTTP error responses are reported as errors instead of
 * being rendered as "no news".
 */
async function loadThemen() {
    // Escape a value for HTML text content and quoted attribute values.
    const ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const esc = (value) => String(value == null ? '' : value).replace(/[&<>"']/g, (ch) => ENTITIES[ch]);
    // Only http(s) URLs may become an href; anything else (e.g. javascript:) is neutralised.
    const safeHref = (url) => (/^https?:\/\//i.test(String(url || '')) ? esc(url) : '#');

    const days = document.getElementById('at-days').value;
    const topk = document.getElementById('at-topk').value;
    const minsim = document.getElementById('at-minsim').value;
    const list = document.getElementById('at-themen-list');
    list.innerHTML = '<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Lade …</div>';
    try {
        const query = `days=${encodeURIComponent(days)}&top_k=${encodeURIComponent(topk)}`
            + `&min_similarity=${encodeURIComponent(minsim)}&matches_per_news=3`;
        const r = await fetch(`/api/aktuelle-themen/top?${query}`);
        if (!r.ok) {
            throw new Error(`HTTP ${r.status} ${r.statusText}`);
        }
        const data = await r.json();
        if (!data.buckets || !data.buckets.length) {
            list.innerHTML = '<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Keine News im Zeitfenster oder noch nicht embedded.</div>';
            return;
        }
        let html = '';
        for (const b of data.buckets) {
            const n = b.news;
            const tags = (n.tags || []).map(t => `<span class="at-tag">${esc(t)}</span>`).join('');
            html += '<div class="at-news-card">';
            html += `<div class="at-news-head">${esc(atFmtDatum(n.datum))} · ${esc(n.source)}${n.ressort ? ' / ' + esc(n.ressort) : ''}</div>`;
            html += `<h4 class="at-news-title"><a href="${safeHref(n.url)}" target="_blank" rel="noopener">${esc(n.titel)}</a></h4>`;
            if (n.summary) html += `<div class="at-news-summary">${esc(n.summary)}</div>`;
            if (tags) html += `<div class="at-news-tags">${tags}</div>`;
            if (b.matches && b.matches.length) {
                html += '<div class="at-matches">';
                html += '<div class="at-matches-label">Passende Anträge:</div>';
                for (const m of b.matches) {
                    const sc = m.gwoe_score != null ? m.gwoe_score.toFixed(1) : '—';
                    const fr = (m.fraktionen || []).join(', ');
                    html += '<div class="at-match">';
                    html += `<span class="at-score-pill ${atScoreClass(m.gwoe_score)}">${esc(sc)}</span>`;
                    html += `<a href="/antrag/${encodeURIComponent(m.drucksache)}" style="color:var(--ecg-teal);text-decoration:none;font-weight:500;">${esc(m.drucksache)}</a>`;
                    html += `<span style="opacity:0.85;">${esc(m.title || '')}</span>`;
                    if (fr) html += `<span style="opacity:0.6;font-size:11px;">— ${esc(fr)}</span>`;
                    html += `<span class="at-sim">sim ${esc(m.similarity)}</span>`;
                    // Arguments travel as escaped data attributes, so no API text is ever
                    // spliced into executable JavaScript.
                    html += `<button class="at-presse-btn" data-drucksache="${esc(m.drucksache)}" data-news-url="${esc(encodeURIComponent(n.url))}" onclick="generatePresse(this.dataset.drucksache, this.dataset.newsUrl, this)">PM-Vorschlag</button>`;
                    html += '</div>';
                }
                html += '</div>';
            } else {
                html += '<div class="at-matches"><div class="at-matches-label">Keine GWÖ-bewerteten Anträge passen — wäre ein Kandidat für eine neue Bewertung.</div></div>';
            }
            html += '</div>';
        }
        list.innerHTML = html;
    } catch (e) {
        list.innerHTML = `<div style="color:#c00;font-family:var(--font-mono);font-size:12px;">Fehler: ${esc(e)}</div>`;
    }
}
/**
 * Load the 30-day news volume per source and draw it as a stacked line/area
 * chart on the #at-zeitreihe-chart canvas.
 *
 * A summary (or an error message) is written to #at-zeitreihe-meta. Any
 * previously created chart is destroyed first and its reference cleared, so
 * an empty result does not leave a stale, already-destroyed instance behind.
 * HTTP error responses are reported as errors rather than as "no articles".
 */
async function loadZeitreihe() {
    const meta = document.getElementById('at-zeitreihe-meta');
    try {
        const r = await fetch('/api/aktuelle-themen/zeitreihe?days=30');
        if (!r.ok) {
            throw new Error(`HTTP ${r.status} ${r.statusText}`);
        }
        const data = await r.json();
        if (_atZeitreiheChart) {
            _atZeitreiheChart.destroy();
            _atZeitreiheChart = null;
        }
        if (!data.buckets || !data.buckets.length) {
            meta.textContent = 'Noch keine News-Artikel in der DB.';
            return;
        }
        // Fixed palette, reused cyclically when there are more sources than colours.
        const colors = ['rgba(0,157,165,0.7)', 'rgba(247,148,29,0.7)', 'rgba(136,158,51,0.7)',
            'rgba(200,30,30,0.7)', 'rgba(150,100,200,0.7)'];
        const datasets = data.sources.map((s, i) => ({
            label: s,
            data: data.series[s],
            backgroundColor: colors[i % colors.length],
            // Same colour as the fill, but fully opaque for the outline.
            borderColor: colors[i % colors.length].replace('0.7', '1'),
            fill: true,
            tension: 0.2,
        }));
        const ctx = document.getElementById('at-zeitreihe-chart');
        _atZeitreiheChart = new Chart(ctx, {
            type: 'line',
            data: { labels: data.buckets, datasets: datasets },
            options: {
                responsive: true,
                scales: {
                    y: { beginAtZero: true, stacked: true, title: { display: true, text: 'Artikel/Tag' } },
                    x: { title: { display: true, text: 'Datum' } },
                },
                plugins: {
                    legend: { position: 'bottom' }
                }
            }
        });
        // Total article count across all sources and days.
        const total = Object.values(data.series).reduce((s, arr) => s + arr.reduce((a, b) => a + b, 0), 0);
        meta.textContent = `${total} News-Artikel über ${data.buckets.length} Tage, ${data.sources.length} Quellen.`;
    } catch (e) {
        meta.textContent = 'Fehler: ' + e;
    }
}
/**
 * Load the ten most recent press-release drafts and render them as
 * clickable cards into #at-drafts-list.
 *
 * Draft fields are HTML-escaped before insertion via innerHTML, the id used
 * in the inline click handler is coerced to a number, and HTTP error
 * responses are reported as errors instead of "no drafts yet".
 */
async function loadDrafts() {
    // Escape a value for HTML text content and quoted attribute values.
    const ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const esc = (value) => String(value == null ? '' : value).replace(/[&<>"']/g, (ch) => ENTITIES[ch]);

    const wrap = document.getElementById('at-drafts-list');
    try {
        const r = await fetch('/api/aktuelle-themen/drafts?limit=10');
        if (!r.ok) {
            throw new Error(`HTTP ${r.status} ${r.statusText}`);
        }
        const data = await r.json();
        if (!data.drafts || !data.drafts.length) {
            wrap.innerHTML = '<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Noch keine Pressemitteilungen generiert.</div>';
            return;
        }
        let html = '';
        for (const d of data.drafts) {
            // Number() guarantees that only a numeric literal reaches the inline handler.
            html += '<div class="at-news-card" style="cursor:pointer;" onclick="showDraft(' + Number(d.id) + ')">';
            html += `<div class="at-news-head">${esc(atFmtDatum(d.created_at))} · DS ${esc(d.drucksache)} (${esc(d.bundesland)})</div>`;
            html += `<h4 class="at-news-title">${esc(d.titel)}</h4>`;
            html += `<div class="at-news-tags">Bezug: ${esc(d.news_titel)}</div>`;
            html += '</div>';
        }
        wrap.innerHTML = html;
    } catch (e) {
        wrap.innerHTML = `<div style="color:#c00;font-family:var(--font-mono);font-size:12px;">Fehler: ${esc(e)}</div>`;
    }
}
/**
 * Ask the backend to generate a press-release draft for one motion/news
 * pair, show the result in the modal and refresh the drafts list.
 *
 * @param {string} drucksache - Motion identifier (unencoded).
 * @param {string} newsUrlEnc - News URL, already passed through encodeURIComponent.
 * @param {HTMLButtonElement} btn - Triggering button; disabled while the
 *   request runs and always restored afterwards.
 */
async function generatePresse(drucksache, newsUrlEnc, btn) {
    if (!confirm(`Pressemitteilung generieren für Drucksache ${drucksache}?\n\nDas erzeugt einen LLM-Call (~2 Cent).`)) return;
    btn.textContent = '…';
    btn.disabled = true;
    try {
        const r = await fetch(`/api/aktuelle-themen/generate-presse?drucksache=${encodeURIComponent(drucksache)}&news_url=${newsUrlEnc}`, {
            method: 'POST',
        });
        if (!r.ok) {
            // Error bodies are not guaranteed to be JSON, and `detail` is not
            // guaranteed to be a string — degrade gracefully in both cases.
            let detail = '';
            try {
                const err = await r.json();
                if (err && err.detail) {
                    detail = typeof err.detail === 'string' ? err.detail : JSON.stringify(err.detail);
                }
            } catch (parseError) {
                detail = '';
            }
            alert('Fehler: ' + (detail || r.statusText || ('HTTP ' + r.status)));
            return;
        }
        const data = await r.json();
        showDraftFromData(data);
        loadDrafts();
    } catch (e) {
        alert('Fehler: ' + e);
    } finally {
        // Runs on every exit path (including the early return above).
        btn.textContent = 'PM-Vorschlag';
        btn.disabled = false;
    }
}
/**
 * Fill the draft modal with one press-release draft and display it.
 *
 * All draft fields are HTML-escaped before they are written via innerHTML
 * (the body is LLM-generated and the news fields come from external feeds),
 * the news link is limited to http(s) URLs, and missing fields render as
 * empty text instead of throwing.
 *
 * @param {Object} d - Draft object with titel, drucksache, bundesland,
 *   news_url, news_titel and body.
 */
function showDraftFromData(d) {
    // Escape a value for HTML text content and quoted attribute values.
    const ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const esc = (value) => String(value == null ? '' : value).replace(/[&<>"']/g, (ch) => ENTITIES[ch]);
    // Only http(s) URLs may become an href; anything else (e.g. javascript:) is neutralised.
    const safeHref = (url) => (/^https?:\/\//i.test(String(url || '')) ? esc(url) : '#');

    const backdrop = document.getElementById('at-modal-backdrop');
    document.getElementById('at-modal-title').textContent = d.titel || 'Pressemitteilung';
    document.getElementById('at-modal-body').innerHTML =
        `<div style="font-family:var(--font-mono);font-size:11px;opacity:0.6;margin-bottom:10px;">
DS ${esc(d.drucksache)} (${esc(d.bundesland)}) · Bezug zu: <a href="${safeHref(d.news_url)}" target="_blank" rel="noopener" style="color:var(--ecg-teal);">${esc(d.news_titel)}</a>
</div>
<div style="white-space:pre-wrap;">${esc(d.body)}</div>`;
    backdrop.style.display = 'flex';
}
/**
 * Fetch a stored press-release draft by id and show it in the modal.
 *
 * A non-2xx response (e.g. unknown id, expired session) is reported as an
 * error instead of being rendered as if it were a draft.
 *
 * @param {number} id - Draft id as used by /api/aktuelle-themen/drafts/{id}.
 */
async function showDraft(id) {
    try {
        const r = await fetch(`/api/aktuelle-themen/drafts/${encodeURIComponent(id)}`);
        if (!r.ok) {
            throw new Error(`HTTP ${r.status} ${r.statusText}`);
        }
        const d = await r.json();
        showDraftFromData(d);
    } catch (e) {
        alert('Fehler: ' + e);
    }
}
/**
 * Hide the draft modal.
 *
 * Called without an event (close button) it always hides the modal; called
 * with an event it only hides the modal when the event target is the
 * backdrop element itself.
 *
 * @param {Event} [ev] - Optional click event.
 */
function atCloseModal(ev) {
    const targetIsNotBackdrop = ev && ev.target.id !== 'at-modal-backdrop';
    if (targetIsNotBackdrop) {
        return;
    }
    document.getElementById('at-modal-backdrop').style.display = 'none';
}
// Pressing Escape hides the draft modal.
document.addEventListener('keydown', (event) => {
    if (event.key !== 'Escape') {
        return;
    }
    document.getElementById('at-modal-backdrop').style.display = 'none';
});

// Initial page load: start all three dashboard sections.
loadZeitreihe();
loadThemen();
loadDrafts();
</script>
{% endblock %}

371
app/themen_matching.py Normal file
View File

@ -0,0 +1,371 @@
"""Themen × Anträge Matching fuer das Aktuelle-Themen-Dashboard
(#170 Phase 2).
Verschneidet News-Artikel-Embeddings (aus news_articles.summary_embedding)
mit Antrag-Embeddings (assessments.summary_embedding) per Cosine-Similarity.
Liefert pro News-Artikel die Top-K-passendsten Anträge.
Reuse:
- ``embeddings.cosine_similarity`` fuer den Vektor-Vergleich
- Beide Tabellen nutzen denselben Embedding-Modell-Vektorraum (qwen v4),
daher direkter Cross-Vergleich moeglich
- Filter ueber ``embedding_model``-Spalte, falls Migration laueft
"""
from __future__ import annotations
import json
import logging
import sqlite3
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
def _load_embeddings(
    db_path: Path,
    table: str,
    select_cols: list[str],
    where_extra: str = "",
    params: tuple = (),
) -> list[dict]:
    """Load rows of a table that has a ``summary_embedding`` column.

    Args:
        db_path: SQLite database file; a missing file yields ``[]``.
        table: Table name, interpolated into the SQL text as-is.
        select_cols: Columns to select in addition to ``summary_embedding``
            and ``embedding_model``.
        where_extra: SQL fragment appended verbatim after the
            ``summary_embedding IS NOT NULL`` condition.
        params: Values bound to the placeholders of the statement.

    Returns:
        One dict per usable row: all selected columns plus ``_vec`` holding
        the JSON-decoded embedding. Rows whose ``embedding_model`` differs
        from ``embeddings.EMBEDDING_MODEL_READ`` or whose embedding cannot be
        decoded are left out.
    """
    from . import embeddings as emb

    if not Path(db_path).exists():
        return []

    connection = sqlite3.connect(str(db_path))
    try:
        connection.row_factory = sqlite3.Row
        query = (
            f"SELECT {', '.join(select_cols)}, summary_embedding, embedding_model "
            f"FROM {table} "
            f"WHERE summary_embedding IS NOT NULL {where_extra}"
        )
        fetched = connection.execute(query, params).fetchall()
    finally:
        connection.close()

    usable: list[dict] = []
    for row in fetched:
        if row["embedding_model"] == emb.EMBEDDING_MODEL_READ:
            try:
                vector = json.loads(row["summary_embedding"])
            except (json.JSONDecodeError, TypeError):
                continue
            record = dict(row)
            record["_vec"] = vector
            usable.append(record)
    return usable
def find_anträge_for_news(
    news_url: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the ``top_k`` assessments most similar to one news article.

    The article is looked up by URL in ``news_articles``; its embedding is
    compared against every assessment embedding via cosine similarity.

    Args:
        news_url: Primary key of the news article.
        top_k: Maximum number of matches to return.
        min_similarity: Matches below this cosine similarity are dropped.
            The original author notes 0.4 as the empirical threshold for
            semantically relevant qwen-v4 matches.
        db_path: Database file; defaults to ``settings.db_path``.

    Returns:
        Match dicts sorted by descending ``similarity`` (rounded to three
        decimals). Empty when the database or article is missing, or when the
        article has no embedding for the current read model. Malformed JSON
        in the ``fraktionen``/``themen`` columns degrades to ``[]`` instead
        of failing the whole lookup.
    """
    from .config import settings
    from . import embeddings as emb

    def _json_list(raw):
        # One corrupt row must not take down the endpoint; treat it as empty.
        try:
            return json.loads(raw or "[]")
        except (json.JSONDecodeError, TypeError):
            return []

    path = db_path or settings.db_path
    if not Path(path).exists():
        return []

    # 1. Load the embedding of the news article.
    conn = sqlite3.connect(str(path))
    try:
        row = conn.execute(
            "SELECT summary_embedding, embedding_model\n"
            "FROM news_articles WHERE url=?",
            (news_url,),
        ).fetchone()
    finally:
        conn.close()
    if not row or not row[0] or row[1] != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        news_vec = json.loads(row[0])
    except (json.JSONDecodeError, TypeError):
        return []

    # 2. Score every assessment that has an embedding.
    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "themen", "datum"],
    )
    scored = []
    for a in assessments:
        sim = emb.cosine_similarity(news_vec, a["_vec"])
        if sim < min_similarity:
            continue
        scored.append({
            "drucksache": a["drucksache"],
            "title": a["title"],
            "bundesland": a["bundesland"],
            "fraktionen": _json_list(a["fraktionen"]),
            "gwoe_score": a["gwoe_score"],
            "empfehlung": a["empfehlung"],
            "themen": _json_list(a["themen"]),
            "datum": a["datum"],
            "similarity": round(sim, 3),
        })
    scored.sort(key=lambda x: x["similarity"], reverse=True)
    return scored[:top_k]
def find_news_for_antrag(
    drucksache: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    days_window: int = 90,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the ``top_k`` news articles most similar to one assessment.

    Args:
        drucksache: Primary key of the assessment.
        top_k: Maximum number of articles to return.
        min_similarity: Articles below this cosine similarity are dropped.
        days_window: Articles older than this many days are dropped;
            articles whose date cannot be parsed are kept.
        db_path: Database file; defaults to ``settings.db_path``.

    Returns:
        Article dicts sorted by descending ``similarity`` (rounded to three
        decimals). Empty when the database or assessment is missing, or when
        the assessment has no embedding for the current read model.
    """
    from .config import settings
    from . import embeddings as emb

    path = db_path or settings.db_path
    if not Path(path).exists():
        return []

    # Step 1: fetch the embedding of the assessment itself.
    connection = sqlite3.connect(str(path))
    try:
        record = connection.execute(
            "SELECT summary_embedding, embedding_model\n"
            "FROM assessments WHERE drucksache=?",
            (drucksache,),
        ).fetchone()
    finally:
        connection.close()
    if not record:
        return []
    raw_embedding, model_name = record[0], record[1]
    if not raw_embedding or model_name != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        antrag_vec = json.loads(raw_embedding)
    except (json.JSONDecodeError, TypeError):
        return []

    # Step 2: score all embedded news articles, then apply the date window.
    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400

    def _is_too_old(datum) -> bool:
        # Unparsable or missing dates count as "not too old" (article kept).
        try:
            published = datetime.fromisoformat(datum.replace("Z", "+00:00"))
            return published.timestamp() < cutoff
        except (ValueError, AttributeError):
            return False

    def _decode_tags(raw) -> list:
        try:
            return json.loads(raw) if raw else []
        except (json.JSONDecodeError, TypeError):
            return []

    candidates = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )
    hits = []
    for article in candidates:
        score = emb.cosine_similarity(antrag_vec, article["_vec"])
        if score < min_similarity or _is_too_old(article["datum"]):
            continue
        hits.append({
            "url": article["url"],
            "titel": article["titel"],
            "summary": article["summary"],
            "datum": article["datum"],
            "source": article["source"],
            "ressort": article["ressort"],
            "tags": _decode_tags(article["tags"]),
            "similarity": round(score, 3),
        })
    return sorted(hits, key=lambda hit: hit["similarity"], reverse=True)[:top_k]
def aggregate_top_themen(
    days_window: int = 7,
    top_k: int = 10,
    min_similarity: float = 0.4,
    matches_per_news: int = 3,
    db_path: Optional[Path] = None,
) -> dict:
    """Return the newest news articles with their best-matching assessments.

    This is the primary dashboard endpoint.

    Args:
        days_window: Only articles from the last N days are considered;
            articles with an unparsable date are skipped.
        top_k: Number of (newest) articles to return.
        min_similarity: Assessments below this cosine similarity are dropped.
        matches_per_news: Maximum number of assessments kept per article.
        db_path: Database file; defaults to ``settings.db_path``.

    Returns:
        ``{
            "buckets": [{
                "news": {url, titel, summary, datum, source, ressort, tags},
                "matches": [{drucksache, title, gwoe_score, similarity, ...}]
            }, ...],
            "n_total_news": int,
            "filter": {days_window, top_k, min_similarity, matches_per_news}
        }``

        ``n_total_news`` counts all embedded articles that were loaded, not
        only those inside the time window. The ``filter`` dict has the same
        keys on every return path, including the missing-database case.
    """
    from .config import settings
    from . import embeddings as emb

    def _json_list(raw):
        # One corrupt row must not take down the endpoint; treat it as empty.
        try:
            return json.loads(raw or "[]")
        except (json.JSONDecodeError, TypeError):
            return []

    filter_info = {
        "days_window": days_window,
        "top_k": top_k,
        "min_similarity": min_similarity,
        "matches_per_news": matches_per_news,
    }

    path = db_path or settings.db_path
    if not Path(path).exists():
        return {"buckets": [], "n_total_news": 0, "filter": filter_info}

    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400
    news_rows = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )

    # Keep only articles inside the time window.
    fresh = []
    for n in news_rows:
        try:
            news_ts = datetime.fromisoformat(
                n["datum"].replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, AttributeError):
            continue
        if news_ts < cutoff:
            continue
        n["_ts"] = news_ts
        fresh.append(n)

    # Newest first, then cut to top_k.
    fresh.sort(key=lambda x: x["_ts"], reverse=True)
    fresh = fresh[:top_k]

    # Per article: score all assessments, keep the best matches_per_news.
    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "themen", "datum"],
    )
    buckets = []
    for n in fresh:
        scored = []
        for a in assessments:
            sim = emb.cosine_similarity(n["_vec"], a["_vec"])
            if sim < min_similarity:
                continue
            scored.append({
                "drucksache": a["drucksache"],
                "title": a["title"],
                "bundesland": a["bundesland"],
                "fraktionen": _json_list(a["fraktionen"]),
                "gwoe_score": a["gwoe_score"],
                "empfehlung": a["empfehlung"],
                "datum": a["datum"],
                "similarity": round(sim, 3),
            })
        scored.sort(key=lambda x: x["similarity"], reverse=True)
        buckets.append({
            "news": {
                "url": n["url"],
                "titel": n["titel"],
                "summary": n["summary"],
                "datum": n["datum"],
                "source": n["source"],
                "ressort": n["ressort"],
                "tags": _json_list(n["tags"]),
            },
            "matches": scored[:matches_per_news],
        })

    return {
        "buckets": buckets,
        "n_total_news": len(news_rows),
        "filter": filter_info,
    }
def aggregate_themen_zeitreihe(
    days_window: int = 30,
    db_path: Optional[Path] = None,
) -> dict:
    """Count news articles per (day, source) over the last N days.

    Feeds the stacked-area chart of the dashboard. No assessment matching is
    involved; only the publishing activity per source is reported.

    Args:
        days_window: Articles older than this many days are ignored.
        db_path: Database file; defaults to ``settings.db_path``.

    Returns:
        ``{"buckets": [day, ...], "sources": [source, ...],
        "series": {source: [count per day, ...]}}`` with days (the first ten
        characters of ``datum``) and sources sorted ascending. Only days that
        have at least one article appear. Rows with an empty or unparsable
        date are skipped. All three values are empty when the database file
        is missing.
    """
    from .config import settings

    path = db_path or settings.db_path
    if not Path(path).exists():
        return {"buckets": [], "sources": [], "series": {}}

    cutoff_ts = datetime.now(timezone.utc).timestamp() - days_window * 86400

    connection = sqlite3.connect(str(path))
    try:
        rows = connection.execute(
            "SELECT datum, source FROM news_articles"
        ).fetchall()
    finally:
        connection.close()

    tally = defaultdict(int)
    for datum, source in rows:
        if not datum:
            continue
        try:
            published = datetime.fromisoformat(datum.replace("Z", "+00:00"))
            ts = published.timestamp()
        except (ValueError, AttributeError):
            continue
        if ts >= cutoff_ts:
            # Day bucket is the YYYY-MM-DD prefix of the stored date string.
            tally[(datum[:10], source)] += 1

    days_sorted = sorted({day for day, _ in tally})
    sources_sorted = sorted({src for _, src in tally})
    series = {}
    for src in sources_sorted:
        series[src] = [tally.get((day, src), 0) for day in days_sorted]

    return {
        "buckets": days_sorted,
        "sources": sources_sorted,
        "series": series,
    }

24
scripts/auto-fetch-news.sh Executable file
View File

@ -0,0 +1,24 @@
#!/bin/bash
# Aktuelle-Themen dashboard: news aggregator cron job (#170 phase 1).
#
# Runs app.news_aggregator.run_aggregator inside the application container
# and prints its inserted/updated/embedded counters. According to the
# original author's notes the job is idempotent (URL primary key), safe to
# re-run after failures, and scheduled daily in the morning before
# auto-ingest-protocols.sh.
#
# Usage:
#   auto-fetch-news.sh [CONTAINER]
set -euo pipefail

# Run the aggregator in the given container, framed by timestamped markers.
run_news_fetch() {
    local target_container="$1"
    echo "=== auto-fetch-news $(date -Iseconds) ==="
    docker exec -i "$target_container" python <<'EOF'
from app.news_aggregator import run_aggregator
stats = run_aggregator()
print(f"News-Aggregator: inserted={stats['inserted']} updated={stats['updated']} embedded={stats['embedded']}")
EOF
    echo "=== auto-fetch-news done $(date -Iseconds) ==="
}

# Container name defaults to the production container.
run_news_fetch "${1:-gwoe-antragspruefer}"

View File

@ -0,0 +1,262 @@
"""Tests fuer app.news_aggregator (#170 Phase 1).
Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne
Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt).
"""
from __future__ import annotations
import json
import sqlite3
from pathlib import Path
from unittest.mock import patch
import pytest
from app.news_aggregator import (
_parse_rss_date,
_strip_html,
fetch_rss,
fetch_tagesschau,
upsert_articles,
)
# ─────────────────────────────────────────────────────────────────────────────
# Helper
# ─────────────────────────────────────────────────────────────────────────────
class TestStripHtml:
    """Checks for the ``_strip_html`` text-cleaning helper."""

    def test_removes_tags(self):
        cleaned = _strip_html("<p>Hello <b>world</b></p>")
        assert cleaned == "Hello world"

    def test_decodes_cdata(self):
        cleaned = _strip_html("<![CDATA[Test]]>")
        assert "Test" in cleaned

    def test_decodes_entities(self):
        cleaned = _strip_html("a &amp; b")
        assert cleaned == "a & b"

    def test_collapses_whitespace(self):
        cleaned = _strip_html("<p>a b\n c</p>")
        assert cleaned == "a b c"

    def test_empty(self):
        cleaned = _strip_html("")
        assert cleaned == ""
class TestParseRssDate:
    """Checks for the ``_parse_rss_date`` helper."""

    def test_rfc822_to_iso(self):
        parsed = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT")
        assert parsed.startswith("2026-04-28")

    def test_invalid_returns_empty(self):
        # Both unparsable and empty input yield an empty string.
        for raw in ("garbage", ""):
            assert _parse_rss_date(raw) == ""
# ─────────────────────────────────────────────────────────────────────────────
# fetch_tagesschau (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────
# UTF-8 encoded JSON payload used as the mocked HTTP response for
# fetch_tagesschau: two complete news items plus one item without shareURL.
SAMPLE_TAGESSCHAU_JSON = json.dumps({
"news": [
{
"title": "Bundestag berät über Wohnungsbau",
"firstSentence": "Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.",
"shareURL": "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html",
"date": "2026-04-28T10:00:00.000+02:00",
"ressort": "inland",
"tags": [{"tag": "Wohnungsbau"}, {"tag": "Bundestag"}],
},
{
"title": "EU-Kommission stellt Klimapaket vor",
"firstSentence": "Die EU plant ehrgeizige Klimaziele.",
"shareURL": "https://www.tagesschau.de/ausland/eu-klima-100.html",
"date": "2026-04-28T11:00:00.000+02:00",
"ressort": "ausland",
"tags": [{"tag": "Klima"}, {"tag": "EU"}],
},
{
# This item has no shareURL — it is expected to be skipped
"title": "Kein Link",
"firstSentence": "Skip mich",
},
],
}).encode("utf-8")
class TestFetchTagesschau:
    """``fetch_tagesschau`` against a mocked ``_http_get`` (no live HTTP)."""

    @staticmethod
    def _fetch(payload, ressorts):
        """Run fetch_tagesschau with ``_http_get`` patched to return ``payload``."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_tagesschau(ressorts=ressorts)

    def test_parses_news_array(self):
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland"])
        # The item without a link is dropped, leaving 2 unique URLs.
        assert len(articles) == 2
        first = articles[0]
        assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html"
        assert first["titel"] == "Bundestag berät über Wohnungsbau"
        assert "Wohnungsbau" in first["summary"]
        assert first["source"] == "tagesschau"
        assert first["ressort"] == "inland"
        assert "Wohnungsbau" in first["tags"]

    def test_skips_items_without_link(self):
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland"])
        assert all(article["url"] for article in articles)

    def test_returns_empty_on_http_error(self):
        articles = self._fetch(None, ["inland"])
        assert articles == []

    def test_dedup_across_ressorts(self):
        """An item that shows up in two ressorts is delivered only once."""
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland", "ausland"])
        urls = [article["url"] for article in articles]
        assert len(urls) == len(set(urls))
# ─────────────────────────────────────────────────────────────────────────────
# fetch_rss (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────
# UTF-8 encoded RSS 2.0 document used as the mocked HTTP response for
# fetch_rss: one item with CDATA-wrapped fields and one plain-text item.
SAMPLE_RSS = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>BT Aktuell</title>
<item>
<title><![CDATA[Bundestag berät Antrag zum Wohnungsbau]]></title>
<link>https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388</link>
<description><![CDATA[Der Bundestag hat heute den Antrag zum Wohnungsbau beraten.]]></description>
<pubDate>Tue, 28 Apr 2026 10:45:12 GMT</pubDate>
</item>
<item>
<title>Antrag zur Klimapolitik</title>
<link>https://www.bundestag.de/klima</link>
<description>Klimaschutz im Bundestag</description>
<pubDate>Mon, 27 Apr 2026 10:00:00 GMT</pubDate>
</item>
</channel></rss>""".encode("utf-8")
class TestFetchRss:
    """``fetch_rss`` against a mocked ``_http_get`` (no live HTTP)."""

    @staticmethod
    def _fetch(payload, source):
        """Run fetch_rss with ``_http_get`` patched to return ``payload``."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_rss(source, "https://example.com/rss")

    def test_parses_rss_items(self):
        articles = self._fetch(SAMPLE_RSS, "bundestag-aktuell")
        assert len(articles) == 2
        first = articles[0]
        assert "Wohnungsbau" in first["titel"]
        assert first["url"].startswith("https://www.bundestag.de")
        assert first["source"] == "bundestag-aktuell"
        assert first["datum"].startswith("2026-04-28")
        assert "Bundestag" in first["summary"]

    def test_strips_cdata_and_html(self):
        articles = self._fetch(SAMPLE_RSS, "bundestag-aktuell")
        for article in articles:
            for field in ("titel", "summary"):
                assert "<![CDATA[" not in article[field]

    def test_empty_on_http_error(self):
        articles = self._fetch(None, "x")
        assert articles == []

    def test_skips_items_without_title_or_link(self):
        # One item has only a title, the other only a link: both are dropped.
        bad = (
            b'<?xml version="1.0"?><rss><channel>\n'
            b"<item><title>Nur Titel</title></item>\n"
            b"<item><link>nur-link</link></item>\n"
            b"</channel></rss>"
        )
        articles = self._fetch(bad, "x")
        assert articles == []
# ─────────────────────────────────────────────────────────────────────────────
# upsert_articles
# ─────────────────────────────────────────────────────────────────────────────
@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a temporary SQLite file with an empty ``news_articles`` table."""
    db_file = tmp_path / "test_news.db"
    schema = """
        CREATE TABLE news_articles (
            url TEXT PRIMARY KEY,
            titel TEXT NOT NULL,
            summary TEXT,
            datum TEXT NOT NULL,
            source TEXT NOT NULL,
            ressort TEXT,
            tags TEXT,
            summary_embedding BLOB,
            embedding_model TEXT,
            fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
    """
    connection = sqlite3.connect(str(db_file))
    connection.execute(schema)
    connection.commit()
    connection.close()
    return db_file
# Two article dicts in the shape passed to upsert_articles by the tests
# below; tags are plain lists of strings.
SAMPLE_ARTICLES = [
{
"url": "https://example.com/a",
"titel": "Wohnungsbau",
"summary": "Heute im Bundestag",
"datum": "2026-04-28",
"source": "tagesschau",
"ressort": "inland",
"tags": ["Wohnungsbau"],
},
{
"url": "https://example.com/b",
"titel": "Klima",
"summary": "EU plant Klimaziele",
"datum": "2026-04-28",
"source": "tagesschau",
"ressort": "ausland",
"tags": ["Klima", "EU"],
},
]
class TestUpsertArticles:
    """``upsert_articles`` against a temporary SQLite database (no embedding)."""

    @staticmethod
    def _read_column(db_file, column, url):
        """Return one column of the ``news_articles`` row with the given URL."""
        connection = sqlite3.connect(str(db_file))
        row = connection.execute(
            f"SELECT {column} FROM news_articles WHERE url=?",
            (url,),
        ).fetchone()
        connection.close()
        return row[0]

    def test_inserts_new_articles(self, empty_db):
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert stats["inserted"] == 2
        assert stats["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Second run: same URLs, changed titles.
        modified = []
        for article in SAMPLE_ARTICLES:
            modified.append({**article, "titel": article["titel"] + " (neu)"})
        stats = upsert_articles(modified, db_path=empty_db, embed=False)
        assert stats["updated"] == 2
        assert stats["inserted"] == 0
        # The stored title must reflect the update.
        stored_title = self._read_column(empty_db, "titel", SAMPLE_ARTICLES[0]["url"])
        assert stored_title.endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        stored_tags = self._read_column(empty_db, "tags", SAMPLE_ARTICLES[0]["url"])
        assert json.loads(stored_tags) == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        missing = tmp_path / "missing.db"
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=missing, embed=False)
        assert stats == {"inserted": 0, "updated": 0, "embedded": 0}

View File

@ -0,0 +1,224 @@
"""Tests fuer app.presse_generator (#170 Phase 4)."""
from __future__ import annotations
import json
import sqlite3
from pathlib import Path
from unittest.mock import patch
import pytest
from app.presse_generator import (
_build_user_prompt,
generate_draft,
get_draft,
list_drafts,
)
# ─────────────────────────────────────────────────────────────────────────────
# Fixture: DB mit Antrag + News
# ─────────────────────────────────────────────────────────────────────────────
@pytest.fixture
def db_with_antrag_and_news(tmp_path: Path) -> Path:
    """Create a temporary database with one assessment and one news article.

    The ``presse_drafts`` table exists but is empty.
    """
    db_file = tmp_path / "test_presse.db"
    table_definitions = (
        """
        CREATE TABLE assessments (
            drucksache TEXT PRIMARY KEY,
            title TEXT,
            bundesland TEXT,
            antrag_zusammenfassung TEXT,
            gwoe_score REAL,
            gwoe_begruendung TEXT,
            empfehlung TEXT
        )
        """,
        """
        CREATE TABLE news_articles (
            url TEXT PRIMARY KEY,
            titel TEXT NOT NULL,
            summary TEXT
        )
        """,
        """
        CREATE TABLE presse_drafts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            drucksache TEXT NOT NULL,
            bundesland TEXT NOT NULL,
            news_url TEXT NOT NULL,
            news_titel TEXT NOT NULL,
            titel TEXT NOT NULL,
            body TEXT NOT NULL,
            model TEXT NOT NULL,
            created_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
        """,
    )
    assessment_row = (
        "18/A", "Wohnungsbau-Reform-Antrag", "NRW",
        "Antrag fuer mehr sozialen Wohnungsbau",
        8.5, "Stark gemeinwohlorientiert",
        "Uneingeschränkt unterstützen",
    )
    news_row = (
        "https://example.com/wohnen",
        "Wohnungsmarkt im Umbruch",
        "Die Mietpreise steigen weiter, der Bundestag berät heute",
    )

    connection = sqlite3.connect(str(db_file))
    for ddl in table_definitions:
        connection.execute(ddl)
    connection.execute(
        """INSERT INTO assessments
           (drucksache, title, bundesland, antrag_zusammenfassung,
            gwoe_score, gwoe_begruendung, empfehlung)
           VALUES (?, ?, ?, ?, ?, ?, ?)""",
        assessment_row,
    )
    connection.execute(
        "INSERT INTO news_articles (url, titel, summary) VALUES (?, ?, ?)",
        news_row,
    )
    connection.commit()
    connection.close()
    return db_file
# ─────────────────────────────────────────────────────────────────────────────
# _build_user_prompt
# ─────────────────────────────────────────────────────────────────────────────
class TestBuildUserPrompt:
    """Checks for the prompt text produced by ``_build_user_prompt``."""

    def test_includes_drucksache(self):
        prompt_args = dict(
            drucksache="18/A", bundesland="NRW",
            antrag_titel="Test", antrag_zusammenfassung="Summary",
            gwoe_score=7.5, gwoe_begruendung="ok",
            empfehlung="Unterstützen",
            news_titel="News", news_summary="Lead",
            news_url="https://example.com",
        )
        prompt = _build_user_prompt(**prompt_args)
        for fragment in ("18/A", "NRW", "7.5", "News"):
            assert fragment in prompt

    def test_handles_missing_zusammenfassung(self):
        # An empty summary is replaced by a placeholder text in the prompt.
        prompt_args = dict(
            drucksache="x", bundesland="x", antrag_titel="x",
            antrag_zusammenfassung="", gwoe_score=5.0,
            gwoe_begruendung="", empfehlung="",
            news_titel="x", news_summary="", news_url="",
        )
        prompt = _build_user_prompt(**prompt_args)
        assert "(keine vorhanden)" in prompt
# ─────────────────────────────────────────────────────────────────────────────
# generate_draft (mocked QwenBewerter)
# ─────────────────────────────────────────────────────────────────────────────
class FakeBewerter:
    """Stand-in for QwenBewerter that returns a fixed, canned LLM response."""

    def __init__(self, response: dict):
        # The most recent request is kept so tests can inspect it.
        self.last_request = None
        self._response = response

    async def bewerte(self, request):
        """Record ``request`` and return the canned response unchanged."""
        self.last_request = request
        canned = self._response
        return canned
@pytest.mark.asyncio
async def test_generate_draft_persists_record(db_with_antrag_and_news, monkeypatch):
    """A complete LLM response yields a draft with the expected fields."""
    from app.config import settings as real_settings

    # Long body: the sample sentence repeated ten times.
    long_body = "Der vorliegende Antrag der Drucksache 18/A ..." * 10
    bewerter = FakeBewerter({"titel": "Wohnungsbau jetzt", "body": long_body})
    # Pin settings.llm_model_default to a test value for this run.
    monkeypatch.setattr(real_settings, "llm_model_default", "qwen-test")

    result = await generate_draft(
        drucksache="18/A",
        news_url="https://example.com/wohnen",
        db_path=db_with_antrag_and_news,
        bewerter=bewerter,
    )

    expected_fields = {
        "id": 1,
        "drucksache": "18/A",
        "bundesland": "NRW",
        "news_titel": "Wohnungsmarkt im Umbruch",
        "titel": "Wohnungsbau jetzt",
    }
    for field, expected in expected_fields.items():
        assert result[field] == expected
    assert "18/A" in result["body"]
@pytest.mark.asyncio
async def test_generate_draft_unknown_drucksache(db_with_antrag_and_news):
    """An unknown Drucksache makes generate_draft raise ValueError."""
    call_args = {
        "drucksache": "99/MISSING",
        "news_url": "https://example.com/wohnen",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "x", "body": "y"}),
    }
    with pytest.raises(ValueError, match="Drucksache"):
        await generate_draft(**call_args)
@pytest.mark.asyncio
async def test_generate_draft_unknown_news(db_with_antrag_and_news):
    """An unknown news URL makes generate_draft raise ValueError."""
    call_args = {
        "drucksache": "18/A",
        "news_url": "https://example.com/missing",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "x", "body": "y"}),
    }
    with pytest.raises(ValueError, match="News-URL"):
        await generate_draft(**call_args)
@pytest.mark.asyncio
async def test_generate_draft_empty_response_raises(db_with_antrag_and_news, monkeypatch):
    """An LLM response with empty title and body is rejected with ValueError."""
    from app.config import settings as real_settings

    monkeypatch.setattr(real_settings, "llm_model_default", "qwen-test")
    call_args = {
        "drucksache": "18/A",
        "news_url": "https://example.com/wohnen",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "", "body": ""}),
    }
    with pytest.raises(ValueError, match="unvollständig"):
        await generate_draft(**call_args)
# ─────────────────────────────────────────────────────────────────────────────
# list_drafts + get_draft
# ─────────────────────────────────────────────────────────────────────────────
class TestListAndGetDrafts:
    """``list_drafts`` and ``get_draft`` on an empty and a one-row table."""

    def test_empty(self, db_with_antrag_and_news):
        db_file = db_with_antrag_and_news
        assert list_drafts(db_path=db_file) == []
        assert get_draft(99, db_path=db_file) is None

    def test_after_insert(self, db_with_antrag_and_news):
        db_file = db_with_antrag_and_news
        # Test setup: write one draft row directly into the table.
        draft_row = (
            "18/A", "NRW", "https://x.de/n", "News-Titel",
            "PM-Titel", "PM-Body", "test-model",
        )
        connection = sqlite3.connect(str(db_file))
        connection.execute(
            """INSERT INTO presse_drafts
               (drucksache, bundesland, news_url, news_titel, titel, body, model)
               VALUES (?, ?, ?, ?, ?, ?, ?)""",
            draft_row,
        )
        connection.commit()
        connection.close()

        drafts = list_drafts(db_path=db_file)
        assert len(drafts) == 1
        listed = drafts[0]
        assert listed["drucksache"] == "18/A"
        assert listed["titel"] == "PM-Titel"

        detail = get_draft(listed["id"], db_path=db_file)
        assert detail is not None
        assert detail["body"] == "PM-Body"

View File

@ -0,0 +1,297 @@
"""Tests fuer app.themen_matching (#170 Phase 2)."""
from __future__ import annotations
import json
import sqlite3
from datetime import datetime, timezone, timedelta
from pathlib import Path
from unittest.mock import patch
import pytest
from app.themen_matching import (
aggregate_themen_zeitreihe,
aggregate_top_themen,
find_anträge_for_news,
find_news_for_antrag,
)
# ─────────────────────────────────────────────────────────────────────────────
# Fixture: DB mit News + Assessments + Embeddings
# ─────────────────────────────────────────────────────────────────────────────
def _vec(dim: int = 8, val: float = 0.1) -> bytes:
"""Konstruiert einen einfachen Vektor als JSON-Bytes."""
return json.dumps([val] * dim).encode()
def _vec_from(values: list[float]) -> bytes:
return json.dumps(values).encode()
@pytest.fixture
def populated_db(tmp_path: Path) -> Path:
    """Build a temporary SQLite database holding news, assessments and embeddings.

    Contents:
      * ``news_articles``: n1 (housing, dated now), n2 (climate, dated now) and
        n3 (dated 200 days back, meant to fall outside date windows).
      * ``assessments``: 18/A points almost along n1's vector, 18/B almost along
        n2's, and 18/C only has a component that none of the news vectors use.

    Embeddings are 8-dimensional JSON-encoded vectors produced by ``_vec_from``
    and tagged with the model name ``qwen-embedding-v4``.

    Returns:
        Path to the created database file below ``tmp_path``.
    """
    db = tmp_path / "test_match.db"

    # One reference instant for all news dates.
    now_utc = datetime.now(timezone.utc)
    today = now_utc.isoformat()
    old = (now_utc - timedelta(days=200)).isoformat()
    # Naive local timestamp; in this fixture it is only written to
    # created_at / updated_at of the assessments.
    now_iso = datetime.now().isoformat()

    # News articles with differently oriented embeddings.
    news = [
        # Housing news (vector along [1, 0, 0, ...])
        ("https://example.com/n1", "Wohnungsbau-Reform",
         "Bundestag berät Wohnungsbau", today, "tagesschau", "inland",
         '["Wohnungsbau"]',
         _vec_from([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        # Climate news (vector along [0, 1, 0, ...])
        ("https://example.com/n2", "Klimaschutzgesetz",
         "EU plant Klimaziele", today, "tagesschau", "ausland",
         '["Klima"]',
         _vec_from([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        # Old news, expected to be dropped by date-window filters
        ("https://example.com/n3", "Alte News", "", old, "tagesschau", "inland",
         '[]', _vec_from([0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
    ]

    # Assessments with embeddings:
    # - 18/A lines up with the housing news (vector close to [1, 0, ...])
    # - 18/B lines up with the climate news
    # - 18/C is orthogonal to every news vector and should match nothing
    assessments = [
        ("18/A", "Wohnungsbau-Antrag", '["GRÜNE"]', "2026-04-15", "NRW",
         8.0, "Uneingeschränkt unterstützen",
         _vec_from([0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        ("18/B", "Klima-Antrag", '["SPD"]', "2026-04-16", "NRW",
         7.0, "Unterstützen mit Änderungen",
         _vec_from([0.0, 0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0])),
        ("18/C", "Sonstiges", '["CDU"]', "2026-04-17", "NRW",
         5.0, "Überarbeiten",
         _vec_from([0.0, 0.0, 0.0, 0.0, 0.95, 0.0, 0.0, 0.0])),
    ]
    # Splice the two timestamps in front of the embedding to match the
    # placeholder order of the INSERT below.
    assessment_params = [
        (ds, title, fr, dat, bl, sc, emp, now_iso, now_iso, vec)
        for ds, title, fr, dat, bl, sc, emp, vec in assessments
    ]

    conn = sqlite3.connect(str(db))
    try:
        conn.execute("""
            CREATE TABLE news_articles (
                url TEXT PRIMARY KEY,
                titel TEXT NOT NULL,
                summary TEXT,
                datum TEXT NOT NULL,
                source TEXT NOT NULL,
                ressort TEXT,
                tags TEXT,
                summary_embedding BLOB,
                embedding_model TEXT,
                fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        """)
        conn.execute("""
            CREATE TABLE assessments (
                drucksache TEXT PRIMARY KEY,
                title TEXT,
                fraktionen TEXT,
                datum TEXT,
                link TEXT,
                bundesland TEXT,
                gwoe_score REAL,
                gwoe_begruendung TEXT,
                gwoe_matrix TEXT,
                gwoe_schwerpunkt TEXT,
                wahlprogramm_scores TEXT,
                verbesserungen TEXT,
                staerken TEXT,
                schwaechen TEXT,
                empfehlung TEXT,
                empfehlung_symbol TEXT,
                verbesserungspotenzial TEXT,
                themen TEXT,
                antrag_zusammenfassung TEXT,
                antrag_kernpunkte TEXT,
                source TEXT,
                model TEXT,
                created_at TEXT,
                updated_at TEXT,
                summary_embedding BLOB,
                embedding_model TEXT
            )
        """)
        conn.executemany(
            """INSERT INTO news_articles
               (url, titel, summary, datum, source, ressort, tags,
                summary_embedding, embedding_model)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'qwen-embedding-v4')""",
            news,
        )
        conn.executemany(
            """INSERT INTO assessments
               (drucksache, title, fraktionen, datum, bundesland, gwoe_score,
                empfehlung, themen, source, model, created_at, updated_at,
                summary_embedding, embedding_model)
               VALUES (?, ?, ?, ?, ?, ?, ?, '[]', 'test', 'test', ?, ?,
                       ?, 'qwen-embedding-v4')""",
            assessment_params,
        )
        conn.commit()
    finally:
        # Release the file handle even if schema creation or an insert fails.
        conn.close()
    return db
@pytest.fixture(autouse=True)
def mock_embedding_model():
    """Pin ``app.embeddings.EMBEDDING_MODEL_READ`` to ``qwen-embedding-v4`` while a test runs."""
    # NOTE(review): only the attribute on app.embeddings is replaced; a module
    # that copied the value via ``from app.embeddings import ...`` would not see
    # it -- confirm how app.themen_matching reads the setting.
    model_patch = patch("app.embeddings.EMBEDDING_MODEL_READ", "qwen-embedding-v4")
    model_patch.start()
    try:
        yield
    finally:
        # Restore the original value once the test has finished.
        model_patch.stop()
# ─────────────────────────────────────────────────────────────────────────────
# find_anträge_for_news
# ─────────────────────────────────────────────────────────────────────────────
class TestFindAnträgeForNews:
    """News -> motion lookups through ``find_anträge_for_news``."""

    def test_wohnungsbau_news_matches_wohnungsbau_antrag(self, populated_db):
        """The housing article n1 ranks motion 18/A first with similarity above 0.9."""
        matches = find_anträge_for_news(
            "https://example.com/n1",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(matches) >= 1
        # The top match is expected to be 18/A.
        best = matches[0]
        assert best["drucksache"] == "18/A"
        assert best["similarity"] > 0.9

    def test_klima_news_matches_klima_antrag(self, populated_db):
        """The climate article n2 ranks motion 18/B first."""
        matches = find_anträge_for_news(
            "https://example.com/n2",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(matches) >= 1
        best = matches[0]
        assert best["drucksache"] == "18/B"

    def test_min_similarity_filters_orthogonal(self, populated_db):
        """With a high min_similarity cutoff no orthogonal motion may be returned."""
        matches = find_anträge_for_news(
            "https://example.com/n1",
            db_path=populated_db,
            min_similarity=0.9,
        )
        returned_ids = {entry["drucksache"] for entry in matches}
        # 18/C is orthogonal to every news vector in the fixture.
        assert "18/C" not in returned_ids

    def test_unknown_news_returns_empty(self, populated_db):
        """A URL that is not stored yields an empty list."""
        matches = find_anträge_for_news(
            "https://example.com/missing",
            db_path=populated_db,
        )
        assert matches == []

    def test_empty_db(self, tmp_path):
        """A database path that was never created yields an empty list."""
        missing_db = tmp_path / "missing.db"
        matches = find_anträge_for_news("x", db_path=missing_db)
        assert matches == []
# ─────────────────────────────────────────────────────────────────────────────
# find_news_for_antrag
# ─────────────────────────────────────────────────────────────────────────────
class TestFindNewsForAntrag:
    """Motion -> news lookups through ``find_news_for_antrag``."""

    def test_wohnungsbau_antrag_matches_wohnungsbau_news(self, populated_db):
        """Motion 18/A ranks the housing article n1 first."""
        hits = find_news_for_antrag(
            "18/A",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(hits) >= 1
        top_hit = hits[0]
        assert top_hit["url"] == "https://example.com/n1"

    def test_old_news_filtered_out(self, populated_db):
        """The 200-day-old article must not show up inside a 90-day window."""
        hits = find_news_for_antrag(
            "18/A",
            db_path=populated_db,
            min_similarity=0.0,
            days_window=90,
        )
        returned_urls = {hit["url"] for hit in hits}
        assert "https://example.com/n3" not in returned_urls

    def test_top_k_limits(self, populated_db):
        """top_k=1 returns at most the single best match."""
        hits = find_news_for_antrag(
            "18/A",
            db_path=populated_db,
            min_similarity=0.0,
            top_k=1,
        )
        # NOTE(review): an empty result also satisfies this upper bound.
        assert len(hits) <= 1

    def test_unknown_antrag(self, populated_db):
        """An unknown motion id yields an empty list."""
        hits = find_news_for_antrag("99/Missing", db_path=populated_db)
        assert hits == []
# ─────────────────────────────────────────────────────────────────────────────
# aggregate_top_themen
# ─────────────────────────────────────────────────────────────────────────────
class TestAggregateTopThemen:
    """Dashboard aggregation through ``aggregate_top_themen``."""

    def test_returns_buckets(self, populated_db):
        """Both articles dated today produce a bucket, and the total count is reported."""
        payload = aggregate_top_themen(
            db_path=populated_db,
            min_similarity=0.5,
        )
        # Two news articles are dated today, each with a matching motion.
        bucket_count = len(payload["buckets"])
        assert bucket_count == 2
        assert "n_total_news" in payload

    def test_each_bucket_has_news_and_matches(self, populated_db):
        """Every bucket carries a news entry (with url and titel) and a matches entry."""
        payload = aggregate_top_themen(
            db_path=populated_db,
            min_similarity=0.5,
        )
        for bucket in payload["buckets"]:
            for key in ("news", "matches"):
                assert key in bucket
            article = bucket["news"]
            for field in ("url", "titel"):
                assert field in article

    def test_days_window_filter(self, populated_db):
        """A small window keeps only fresh news; the old article is dropped."""
        payload = aggregate_top_themen(
            db_path=populated_db,
            days_window=7,
            min_similarity=0.5,
        )
        bucket_urls = [bucket["news"]["url"] for bucket in payload["buckets"]]
        assert "https://example.com/n3" not in bucket_urls

    def test_min_similarity_filter(self, populated_db):
        """A high min_similarity removes cross-topic matches."""
        payload = aggregate_top_themen(
            db_path=populated_db,
            min_similarity=0.99,
        )
        # Only near-exact matches are expected to survive the cutoff.
        surviving = (
            match
            for bucket in payload["buckets"]
            for match in bucket["matches"]
        )
        for match in surviving:
            assert match["similarity"] > 0.99
# ─────────────────────────────────────────────────────────────────────────────
# aggregate_themen_zeitreihe
# ─────────────────────────────────────────────────────────────────────────────
class TestAggregateZeitreihe:
    """Time-series aggregation through ``aggregate_themen_zeitreihe``."""

    def test_structure(self, populated_db):
        """The result exposes the keys buckets, sources and series."""
        payload = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        for key in ("buckets", "sources", "series"):
            assert key in payload

    def test_only_recent(self, populated_db):
        """With days_window=7 the old article must not be counted."""
        payload = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        # Only today's articles (n1, n2) count; n3 is 200 days old.
        counted = 0
        for per_source in payload["series"].values():
            counted += sum(per_source)
        assert counted == 2

    def test_series_aligned(self, populated_db):
        """Per source, the series list is exactly as long as the bucket list."""
        payload = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        expected_length = len(payload["buckets"])
        for source_name in payload["sources"]:
            assert len(payload["series"][source_name]) == expected_length