diff --git a/app/database.py b/app/database.py index ea70fb7..5d7dd00 100644 --- a/app/database.py +++ b/app/database.py @@ -285,6 +285,55 @@ async def init_db(): "ON plenum_vote_results(bundesland, drucksache)" ) + # News-Artikel aus oeffentlich-rechtlichen Quellen (#170 Phase 1). + # Tagesschau-API + Bundestag-RSS — KEIN AI-banntes Quellmaterial + # (RND ist explizit per robots.txt ausgeschlossen). + # Volltexte werden NICHT persistiert — nur Titel + Summary fuer + # Embeddings + UI-Anzeige (Urheberrecht). + await db.execute(""" + CREATE TABLE IF NOT EXISTS news_articles ( + url TEXT PRIMARY KEY, + titel TEXT NOT NULL, + summary TEXT, + datum TEXT NOT NULL, + source TEXT NOT NULL, + ressort TEXT, + tags TEXT, + summary_embedding BLOB, + embedding_model TEXT, + fetched_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + """) + await db.execute( + "CREATE INDEX IF NOT EXISTS idx_news_datum " + "ON news_articles(datum)" + ) + await db.execute( + "CREATE INDEX IF NOT EXISTS idx_news_source " + "ON news_articles(source)" + ) + + # Pressemitteilungs-Drafts (#170 Phase 4). LLM-generierte Vorschlaege, + # die einen Antrag in den Kontext eines News-Artikels stellen. + # Manueller Trigger, kein Auto-Versand. 
+ await db.execute(""" + CREATE TABLE IF NOT EXISTS presse_drafts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + drucksache TEXT NOT NULL, + bundesland TEXT NOT NULL, + news_url TEXT NOT NULL, + news_titel TEXT NOT NULL, + titel TEXT NOT NULL, + body TEXT NOT NULL, + model TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + """) + await db.execute( + "CREATE INDEX IF NOT EXISTS idx_presse_created " + "ON presse_drafts(created_at DESC)" + ) + await db.commit() diff --git a/app/main.py b/app/main.py index 2bc93fe..86e56a5 100644 --- a/app/main.py +++ b/app/main.py @@ -2008,6 +2008,116 @@ async def auswertungen_page(request: Request, current_user: dict = Depends(requi }) +# ─── Aktuelle-Themen-Dashboard (#170) ────────────────────────────────────── + + +@app.get("/aktuelle-themen", response_class=HTMLResponse) +async def aktuelle_themen_page( + request: Request, current_user: dict = Depends(require_auth) +): + """Aktuelle-Themen-Dashboard: News × Anträge × Pressemitteilungs-Drafts.""" + return templates.TemplateResponse("v2/screens/aktuelle-themen.html", { + "request": request, + "app_name": settings.app_name, + "v2_active_nav": "aktuelle-themen", + **_v2_template_context(current_user), + }) + + +@app.get("/api/aktuelle-themen/top") +async def api_aktuelle_themen_top( + days: int = 7, + top_k: int = 10, + min_similarity: float = 0.4, + matches_per_news: int = 3, +): + """Top-K News der letzten N Tage mit Antrags-Match.""" + from .themen_matching import aggregate_top_themen + return aggregate_top_themen( + days_window=days, + top_k=top_k, + min_similarity=min_similarity, + matches_per_news=matches_per_news, + ) + + +@app.get("/api/aktuelle-themen/zeitreihe") +async def api_aktuelle_themen_zeitreihe(days: int = 30): + """News-Volumen pro Tag × Source — Stacked-Area-Chart.""" + from .themen_matching import aggregate_themen_zeitreihe + return aggregate_themen_zeitreihe(days_window=days) + + +@app.get("/api/aktuelle-themen/news-fuer-antrag") +async def 
api_news_fuer_antrag( + drucksache: str, + top_k: int = 5, + min_similarity: float = 0.4, + days: int = 90, +): + """Top-K News, die zu einem gegebenen Antrag passen (für Detail-View).""" + from .themen_matching import find_news_for_antrag + return {"drucksache": drucksache, "matches": find_news_for_antrag( + drucksache=drucksache, top_k=top_k, + min_similarity=min_similarity, days_window=days, + )} + + +@app.get("/api/aktuelle-themen/anträge-fuer-news") +async def api_anträge_fuer_news( + url: str, + top_k: int = 5, + min_similarity: float = 0.4, +): + """Top-K Anträge, die zu einem gegebenen News-Artikel passen.""" + from .themen_matching import find_anträge_for_news + return {"news_url": url, "matches": find_anträge_for_news( + news_url=url, top_k=top_k, min_similarity=min_similarity, + )} + + +# ─── Pressemitteilungs-Drafts (#170 Phase 4) ────────────────────────── + + +@app.post("/api/aktuelle-themen/generate-presse") +@limiter.limit("5/minute") +async def api_generate_presse( + request: Request, + drucksache: str, + news_url: str, + current_user: dict = Depends(require_auth), +): + """Generiert einen LLM-Pressemitteilungs-Vorschlag. + + Auth-only + rate-limited (5/min) wegen LLM-Kosten. 
+ """ + from .presse_generator import generate_draft + try: + return await generate_draft(drucksache=drucksache, news_url=news_url) + except ValueError as e: + raise HTTPException(status_code=404, detail=str(e)) + except Exception as e: + logger.exception("generate_draft failed") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/api/aktuelle-themen/drafts") +async def api_drafts_list(limit: int = 20): + """Liste der zuletzt generierten Pressemitteilungs-Entwürfe.""" + from .presse_generator import list_drafts + return {"drafts": list_drafts(limit=limit)} + + +@app.get("/api/aktuelle-themen/drafts/{draft_id}") +async def api_draft_detail(draft_id: int): + """Einen einzelnen Pressemitteilungs-Entwurf.""" + from .presse_generator import get_draft + d = get_draft(draft_id) + if not d: + raise HTTPException(status_code=404, detail="Draft nicht gefunden") + return d + + @app.get("/api/auswertungen/matrix") async def auswertungen_matrix( wahlperiode: Optional[str] = None, diff --git a/app/news_aggregator.py b/app/news_aggregator.py new file mode 100644 index 0000000..ee1c324 --- /dev/null +++ b/app/news_aggregator.py @@ -0,0 +1,347 @@ +"""News-Aggregator fuer das Aktuelle-Themen-Dashboard (#170 Phase 1). + +Fetcht regelmaessig News-Headlines aus AI-erlaubenden, oeffentlich-rechtlichen +oder parlamentarischen Quellen: + +- **Tagesschau-API** (https://www.tagesschau.de/api2u/news/) — strukturiertes + JSON mit ressort, tags, firstSentence pro Artikel. +- **Bundestag-Aktuellethemen-RSS** + (https://www.bundestag.de/static/appdata/includes/rss/aktuellethemen.rss) + — RSS mit Titel + Beschreibung pro Artikel. + +**Bewusst NICHT verwendet:** RND.de (robots.txt bannt explizit ClaudeBot, +GPTBot, ChatGPT-User, CCBot, Google-Extended). RSS-Feeds privat-publizierter +Verlage werden nur dann angebunden, wenn AI-Verarbeitung explizit erlaubt ist. + +**Compliance:** +- Volltexte werden NICHT persistiert. Nur Titel + erster Satz / Description. 
+- Kein User-Agent, der einen AI-Bot vortaeuscht (kein "ClaudeBot"). +- Rate-Limiting: 1 Request pro Quelle pro Aufruf (kein Loop, kein Hammer). + +Datenbank-Tabelle ``news_articles`` (siehe app/database.py): + url PK, titel, summary, datum (ISO), source, ressort, tags JSON, + summary_embedding BLOB, embedding_model, fetched_at. +""" +from __future__ import annotations + +import json +import logging +import re +import urllib.error +import urllib.request +from datetime import datetime, timezone +from email.utils import parsedate_to_datetime +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + +USER_AGENT = "GWOeAntragspruefer/1.0 (+https://gwoe.toppyr.de)" +TIMEOUT = 20 + + +# ───────────────────────────────────────────────────────────────────────────── +# Quellen +# ───────────────────────────────────────────────────────────────────────────── + +TAGESSCHAU_API = "https://www.tagesschau.de/api2u/news" + +# Politische Tagesschau-Ressorts — Sport/Panorama/Sport rausgefiltert, +# weil sie selten zu parlamentarischen Antraegen passen. +TAGESSCHAU_RESSORTS = ["inland", "ausland", "wirtschaft", "wissen"] + +BUNDESTAG_RSS = { + "bundestag-aktuell": ( + "https://www.bundestag.de/static/appdata/includes/rss/aktuellethemen.rss" + ), + "bundestag-presse": ( + "https://www.bundestag.de/static/appdata/includes/rss/pressemitteilungen.rss" + ), + "bundestag-hib": ( + "https://www.bundestag.de/static/appdata/includes/rss/hib.rss" + ), +} + + +# ───────────────────────────────────────────────────────────────────────────── +# HTTP-Helper +# ───────────────────────────────────────────────────────────────────────────── + + +def _http_get(url: str) -> Optional[bytes]: + """GET mit ehrlichem User-Agent + Timeout. 
Gibt None bei Fehler.""" + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + try: + with urllib.request.urlopen(req, timeout=TIMEOUT) as r: + return r.read() + except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e: + logger.warning("news fetch failed: %s — %s", url, e) + return None + + +def _strip_html(text: str) -> str: + """Entfernt HTML-Tags + CDATA fuer Plaintext-Summaries.""" + if not text: + return "" + text = re.sub(r"<!\[CDATA\[(.*?)\]\]>", r"\1", text, flags=re.DOTALL) + text = re.sub(r"<[^>]+>", " ", text) + text = text.replace("&amp;", "&").replace("&nbsp;", " ").replace("&quot;", '"') + return re.sub(r"\s+", " ", text).strip() + + +# ───────────────────────────────────────────────────────────────────────────── +# Parser +# ───────────────────────────────────────────────────────────────────────────── + + +def fetch_tagesschau(ressorts: Optional[list[str]] = None) -> list[dict]: + """Holt News aus der Tagesschau-API. Liefert Liste von Dicts mit den + Feldern: url, titel, summary, datum, source, ressort, tags. + + Volltexte (``content``) werden bewusst nicht uebernommen — nur die in + der API verfuegbare ``firstSentence`` als Summary. 
+ """ + ressorts = ressorts or TAGESSCHAU_RESSORTS + out: list[dict] = [] + seen: set[str] = set() + for ressort in ressorts: + url = f"{TAGESSCHAU_API}?ressort={ressort}" + raw = _http_get(url) + if not raw: + continue + try: + data = json.loads(raw.decode("utf-8")) + except json.JSONDecodeError: + logger.warning("tagesschau JSON parse failed: %s", url) + continue + for item in data.get("news") or []: + link = item.get("shareURL") or item.get("detailsweb") + if not link or link in seen: + continue + seen.add(link) + titel = (item.get("title") or "").strip() + if not titel: + continue + summary = (item.get("firstSentence") or "").strip() + datum = item.get("date") or "" + tags = [t.get("tag") for t in (item.get("tags") or []) if t.get("tag")] + out.append({ + "url": link, + "titel": titel, + "summary": summary, + "datum": datum, + "source": "tagesschau", + "ressort": item.get("ressort") or ressort, + "tags": tags, + }) + return out + + +_RSS_ITEM_RE = re.compile(r"<item>(.*?)</item>", re.DOTALL) +_RSS_TITLE_RE = re.compile(r"<title>(.*?)</title>", re.DOTALL) +_RSS_LINK_RE = re.compile(r"<link>(.*?)</link>") +_RSS_DESC_RE = re.compile(r"<description>(.*?)</description>", re.DOTALL) +_RSS_PUB_RE = re.compile(r"<pubDate>(.*?)</pubDate>") + + +def _parse_rss_date(s: str) -> str: + """Konvertiere RSS-pubDate (RFC 822) → ISO-8601-Datum.""" + if not s: + return "" + try: + dt = parsedate_to_datetime(s.strip()) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc).isoformat() + except (TypeError, ValueError): + return "" + + +def fetch_rss(source: str, url: str, max_items: int = 50) -> list[dict]: + """Generischer RSS-2.0-Parser. 
Liefert dicts wie fetch_tagesschau.""" + raw = _http_get(url) + if not raw: + return [] + text = raw.decode("utf-8", errors="replace") + items_xml = _RSS_ITEM_RE.findall(text)[:max_items] + out: list[dict] = [] + for item in items_xml: + title_m = _RSS_TITLE_RE.search(item) + link_m = _RSS_LINK_RE.search(item) + desc_m = _RSS_DESC_RE.search(item) + pub_m = _RSS_PUB_RE.search(item) + titel = _strip_html(title_m.group(1)) if title_m else "" + link = _strip_html(link_m.group(1)) if link_m else "" + if not titel or not link: + continue + summary = _strip_html(desc_m.group(1)) if desc_m else "" + datum = _parse_rss_date(pub_m.group(1)) if pub_m else "" + out.append({ + "url": link, + "titel": titel, + "summary": summary, + "datum": datum, + "source": source, + "ressort": None, + "tags": [], + }) + return out + + +def fetch_all() -> list[dict]: + """Holt alle konfigurierten Quellen ein. Kein Caching, kein Auto-Retry.""" + out: list[dict] = [] + out.extend(fetch_tagesschau()) + for source, url in BUNDESTAG_RSS.items(): + out.extend(fetch_rss(source, url)) + return out + + +# ───────────────────────────────────────────────────────────────────────────── +# DB-Persistierung +# ───────────────────────────────────────────────────────────────────────────── + + +def upsert_articles( + articles: list[dict], + db_path: Optional[Path] = None, + embed: bool = True, +) -> dict: + """Schreibe oder aktualisiere News-Artikel in der DB. + + Idempotent ueber URL-PK. Existierende Eintraege bekommen ein neues + ``fetched_at``, aber Embedding bleibt persistent (sonst LLM-Kosten + pro Cron-Lauf). 
+ + Returns: + ``{"inserted": int, "updated": int, "embedded": int}`` + """ + import sqlite3 + from .config import settings + + path = db_path or settings.db_path + if not Path(path).exists(): + return {"inserted": 0, "updated": 0, "embedded": 0} + + conn = sqlite3.connect(str(path)) + inserted = 0 + updated = 0 + embedded = 0 + try: + for art in articles: + url = art["url"] + cur = conn.execute( + "SELECT summary_embedding IS NOT NULL FROM news_articles WHERE url=?", + (url,), + ) + row = cur.fetchone() + tags_json = json.dumps(art.get("tags") or []) + if row is None: + conn.execute( + """INSERT INTO news_articles + (url, titel, summary, datum, source, ressort, tags, fetched_at) + VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))""", + ( + url, art["titel"], art.get("summary") or "", + art.get("datum") or "", + art["source"], art.get("ressort"), tags_json, + ), + ) + inserted += 1 + else: + conn.execute( + """UPDATE news_articles + SET titel=?, summary=?, datum=?, source=?, ressort=?, tags=?, + fetched_at=datetime('now') + WHERE url=?""", + ( + art["titel"], art.get("summary") or "", + art.get("datum") or "", + art["source"], art.get("ressort"), tags_json, + url, + ), + ) + updated += 1 + conn.commit() + finally: + conn.close() + + if embed: + embedded = embed_pending_articles(db_path=db_path) + + return {"inserted": inserted, "updated": updated, "embedded": embedded} + + +def embed_pending_articles( + db_path: Optional[Path] = None, + limit: int = 100, +) -> int: + """Erzeuge Embeddings fuer alle News-Artikel ohne ``summary_embedding``. + + Embedded wird ein Stueck-Text aus Titel + Summary + Tags. Bei + Embedding-API-Fehler wird der Artikel uebersprungen — naechster Run + holt ihn nach. + """ + import sqlite3 + from .config import settings + from . 
import embeddings as emb + + path = db_path or settings.db_path + if not Path(path).exists(): + return 0 + + conn = sqlite3.connect(str(path)) + try: + rows = conn.execute( + """SELECT url, titel, summary, tags FROM news_articles + WHERE summary_embedding IS NULL ORDER BY datum DESC LIMIT ?""", + (limit,), + ).fetchall() + finally: + conn.close() + + if not rows: + return 0 + + embedded = 0 + conn = sqlite3.connect(str(path)) + try: + for url, titel, summary, tags_raw in rows: + try: + tags = json.loads(tags_raw) if tags_raw else [] + except (json.JSONDecodeError, TypeError): + tags = [] + parts = [titel or ""] + if summary: + parts.append(summary) + if tags: + parts.append(", ".join(tags)) + text = "\n".join(p for p in parts if p).strip() + if not text: + continue + try: + vec = emb.create_embedding(text, model=emb.EMBEDDING_MODEL) + except Exception: + logger.exception("embed_pending_articles: API error for %s", url) + continue + conn.execute( + """UPDATE news_articles + SET summary_embedding=?, embedding_model=? + WHERE url=?""", + (json.dumps(vec).encode(), emb.EMBEDDING_MODEL, url), + ) + embedded += 1 + conn.commit() + finally: + conn.close() + return embedded + + +def run_aggregator(db_path: Optional[Path] = None, embed: bool = True) -> dict: + """Top-Level: alle Quellen holen + persistieren + embedden. + + Sicher fuer Cron-Aufrufe — fehlende Quellen werden geloggt, nicht + geworfen. + """ + articles = fetch_all() + return upsert_articles(articles, db_path=db_path, embed=embed) diff --git a/app/presse_generator.py b/app/presse_generator.py new file mode 100644 index 0000000..b7eb53c --- /dev/null +++ b/app/presse_generator.py @@ -0,0 +1,256 @@ +"""Pressemitteilungs-Generator fuer #170 Phase 4. + +Erzeugt einen LLM-generierten Pressemitteilungs-Vorschlag, der einen +GWÖ-bewerteten Antrag in den Kontext eines aktuellen News-Artikels stellt. + +Manueller Trigger via UI-Button — kein Auto-Versand. 
Drafts werden in +``presse_drafts`` persistiert und in der UI als Liste sichtbar. + +Tonalitaet: +- GWÖ-Sicht (Gemeinwohl-orientiert, nicht parteipolitisch) +- Faktenbasiert, keine Lobbying-Sprache +- 200-250 Worte, presseaehnlicher Aufbau (Lead-Paragraph + Begruendung) +""" +from __future__ import annotations + +import json +import logging +import sqlite3 +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + + +SYSTEM_PROMPT = """Du bist ein politischer Redakteur, der für eine +Gemeinwohl-Ökonomie-Initiative Pressemitteilungen schreibt. Deine Stil- +Richtlinien: + +- 200-250 Worte +- Sachlicher, präziser Stil — keine Werbesprache, keine Polemik +- Faktenbasiert: Daten aus dem Antrag und dem News-Kontext explizit nennen +- GWÖ-Werte (Würde, Solidarität, Nachhaltigkeit, Gerechtigkeit, Demokratie) + als Bewertungsmaßstab — nicht parteipolitische Linie +- Klare Struktur: Titel, Lead-Paragraph (Wer? Was? Wann? Warum jetzt?), + Begründung mit Bezug auf GWÖ-Bewertung, Schluss mit Forderung oder + Einladung zum Dialog +- Niemals den Anbieter der News-Quelle (Tagesschau, Bundestag) zitieren — + nur den Sachverhalt aufgreifen, der dort beschrieben ist + +Antworte NUR mit gültigem JSON in dieser Struktur: +{ + "titel": "", + "body": "" +}""" + + +def _build_user_prompt( + drucksache: str, + bundesland: str, + antrag_titel: str, + antrag_zusammenfassung: str, + gwoe_score: float, + gwoe_begruendung: str, + empfehlung: str, + news_titel: str, + news_summary: str, + news_url: str, +) -> str: + """Konstruiert den User-Prompt aus Antrags- und News-Daten.""" + return f"""## Aktueller Antrag + +Drucksache: {drucksache} ({bundesland}) +Titel: {antrag_titel} + +Zusammenfassung: {antrag_zusammenfassung or "(keine vorhanden)"} + +GWÖ-Score: {gwoe_score}/10 +GWÖ-Begründung: {gwoe_begruendung or "(keine vorhanden)"} +Empfehlung: {empfehlung or "(keine)"} + +## Aktueller Nachrichten-Kontext + +Schlagzeile: {news_titel} + +Inhalt: {news_summary or 
"(keine Zusammenfassung verfügbar)"} + +Quelle: {news_url} + +## Deine Aufgabe + +Schreibe eine Pressemitteilung, die diesen Antrag in den Kontext der +aktuellen Nachrichtenlage stellt. Begründe aus GWÖ-Sicht, warum der +Antrag gerade jetzt relevant ist (oder warum er die aktuelle Debatte +ergänzt/korrigiert). Wenn der GWÖ-Score niedrig ist (< 5), sei dabei +kritisch — die PM kann auch eine Ablehnung des Antrags begründen. +""" + + +async def generate_draft( + drucksache: str, + news_url: str, + db_path: Optional[Path] = None, + bewerter=None, +) -> dict: + """Erzeugt einen Pressemitteilungs-Draft und persistiert ihn. + + Args: + drucksache: ID des Antrags (mit Bundesland-Kontext aus DB). + news_url: URL des News-Artikels (Lookup in news_articles). + db_path: optional override fuer Tests. + bewerter: optional injected QwenBewerter (fuer Tests). Wenn None, + wird der Default mit settings instanziiert. + + Returns: + ``{"id": int, "drucksache": ..., "bundesland": ..., + "news_url": ..., "news_titel": ..., + "titel": str, "body": str, "model": str, "created_at": ISO}`` + + Raises: + ValueError: wenn drucksache oder news_url nicht gefunden. 
+ """ + from .config import settings + from .adapters.qwen_bewerter import LlmRequest + + path = db_path or settings.db_path + conn = sqlite3.connect(str(path)) + try: + antrag = conn.execute( + """SELECT bundesland, title, antrag_zusammenfassung, gwoe_score, + gwoe_begruendung, empfehlung + FROM assessments WHERE drucksache=?""", + (drucksache,), + ).fetchone() + news = conn.execute( + "SELECT titel, summary FROM news_articles WHERE url=?", + (news_url,), + ).fetchone() + finally: + conn.close() + + if not antrag: + raise ValueError(f"Drucksache {drucksache} nicht in assessments") + if not news: + raise ValueError(f"News-URL {news_url} nicht in news_articles") + + user_prompt = _build_user_prompt( + drucksache=drucksache, + bundesland=antrag[0], + antrag_titel=antrag[1] or "", + antrag_zusammenfassung=antrag[2] or "", + gwoe_score=antrag[3] or 0.0, + gwoe_begruendung=antrag[4] or "", + empfehlung=antrag[5] or "", + news_titel=news[0], + news_summary=news[1] or "", + news_url=news_url, + ) + + if bewerter is None: + from .adapters.qwen_bewerter import QwenBewerter + bewerter = QwenBewerter() + + req = LlmRequest( + system_prompt=SYSTEM_PROMPT, + user_prompt=user_prompt, + model=settings.llm_model_default, + base_temperature=0.3, + max_tokens=1500, + max_retries=2, + ) + result = await bewerter.bewerte(req) + + titel = (result.get("titel") or "").strip()[:200] + body = (result.get("body") or "").strip() + if not titel or not body: + raise ValueError("LLM-Response unvollständig (titel oder body leer)") + + # Persist + conn = sqlite3.connect(str(path)) + try: + cur = conn.execute( + """INSERT INTO presse_drafts + (drucksache, bundesland, news_url, news_titel, titel, body, model) + VALUES (?, ?, ?, ?, ?, ?, ?)""", + (drucksache, antrag[0], news_url, news[0], titel, body, + settings.llm_model_default), + ) + draft_id = cur.lastrowid + row = conn.execute( + """SELECT id, drucksache, bundesland, news_url, news_titel, + titel, body, model, created_at + FROM presse_drafts 
WHERE id=?""", + (draft_id,), + ).fetchone() + conn.commit() + finally: + conn.close() + + return { + "id": row[0], "drucksache": row[1], "bundesland": row[2], + "news_url": row[3], "news_titel": row[4], + "titel": row[5], "body": row[6], "model": row[7], + "created_at": row[8], + } + + +def list_drafts( + limit: int = 20, + db_path: Optional[Path] = None, +) -> list[dict]: + """Liste der zuletzt generierten Drafts. Default-Limit 20.""" + from .config import settings + + path = db_path or settings.db_path + if not Path(path).exists(): + return [] + conn = sqlite3.connect(str(path)) + try: + rows = conn.execute( + """SELECT id, drucksache, bundesland, news_url, news_titel, + titel, body, model, created_at + FROM presse_drafts + ORDER BY id DESC LIMIT ?""", + (limit,), + ).fetchall() + finally: + conn.close() + return [ + { + "id": r[0], "drucksache": r[1], "bundesland": r[2], + "news_url": r[3], "news_titel": r[4], + "titel": r[5], "body": r[6], "model": r[7], + "created_at": r[8], + } + for r in rows + ] + + +def get_draft( + draft_id: int, + db_path: Optional[Path] = None, +) -> Optional[dict]: + """Einen Draft per ID abrufen.""" + from .config import settings + + path = db_path or settings.db_path + if not Path(path).exists(): + return None + conn = sqlite3.connect(str(path)) + try: + row = conn.execute( + """SELECT id, drucksache, bundesland, news_url, news_titel, + titel, body, model, created_at + FROM presse_drafts WHERE id=?""", + (draft_id,), + ).fetchone() + finally: + conn.close() + if not row: + return None + return { + "id": row[0], "drucksache": row[1], "bundesland": row[2], + "news_url": row[3], "news_titel": row[4], + "titel": row[5], "body": row[6], "model": row[7], + "created_at": row[8], + } diff --git a/app/templates/v2/base.html b/app/templates/v2/base.html index 462e771..71e47f3 100644 --- a/app/templates/v2/base.html +++ b/app/templates/v2/base.html @@ -56,6 +56,7 @@
— Daten
{{ icon("chart-bar", 14) }} Auswertungen + {{ icon("book-open", 14) }} Aktuelle Themen {{ icon("file-csv", 14) }} Export · API {{ icon("rss", 14) }} Atom-Feed {{ icon("envelope-simple", 14) }} Meine Abos diff --git a/app/templates/v2/screens/aktuelle-themen.html b/app/templates/v2/screens/aktuelle-themen.html new file mode 100644 index 0000000..30fd08e --- /dev/null +++ b/app/templates/v2/screens/aktuelle-themen.html @@ -0,0 +1,417 @@ +{% extends "v2/base.html" %} + +{% block title %}Aktuelle Themen — GWÖ-Antragsprüfer{% endblock %} + +{% set v2_active_nav = "aktuelle-themen" %} + +{% block head_extra %} + + +{% endblock %} + +{% block main %} +
+

Aktuelle Themen

+

+ Tagesschau + Bundestag-RSS · gematcht mit deinen Anträgen · + Pressemitteilungs-Vorschläge +

+
+ +
+

+ Die täglich aktuellen politischen Top-Themen aus + öffentlich-rechtlichen + parlamentarischen Quellen + (Tagesschau-API + Bundestag-RSS) werden semantisch mit den von dir + bewerteten Anträgen verschnitten. Pro News-Artikel siehst du die + GWÖ-Bewertung der dazu passendsten Anträge — und kannst per Klick + eine Pressemitteilung generieren lassen. +

+

+ Bewusst nicht verwendet: Quellen mit AI-Bann in + robots.txt (z.B. RND.de). Die UI zeigt nur Titel + URL + erste Sätze + — Volltexte werden nicht persistiert. +

+
+ +
+ + + + + + + +
+ + +

+ News-Volumen pro Quelle (letzte 30 Tage) +

+
+ +
+
+ + +

+ Top-Themen × passende Anträge +

+
+
Lade …
+
+ + +

+ Pressemitteilungs-Entwürfe (zuletzt generiert) +

+
+
Lade Entwürfe …
+
+ + + + +{% endblock %} + +{% block body_scripts %} + +{% endblock %} diff --git a/app/themen_matching.py b/app/themen_matching.py new file mode 100644 index 0000000..5bb0b6f --- /dev/null +++ b/app/themen_matching.py @@ -0,0 +1,371 @@ +"""Themen × Anträge Matching fuer das Aktuelle-Themen-Dashboard +(#170 Phase 2). + +Verschneidet News-Artikel-Embeddings (aus news_articles.summary_embedding) +mit Antrag-Embeddings (assessments.summary_embedding) per Cosine-Similarity. +Liefert pro News-Artikel die Top-K-passendsten Anträge. + +Reuse: +- ``embeddings.cosine_similarity`` fuer den Vektor-Vergleich +- Beide Tabellen nutzen denselben Embedding-Modell-Vektorraum (qwen v4), + daher direkter Cross-Vergleich moeglich +- Filter ueber ``embedding_model``-Spalte, falls Migration laeuft +""" +from __future__ import annotations + +import json +import logging +import sqlite3 +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + + +def _load_embeddings( + db_path: Path, + table: str, + select_cols: list[str], + where_extra: str = "", + params: tuple = (), +) -> list[dict]: + """Generischer Loader fuer Tabellen mit ``summary_embedding``-Spalte. + + Liefert Zeilen mit decoded Embedding-Vektor (oder filtert aus, wenn + Modell nicht zum aktuellen READ-Modell passt). + """ + from . 
import embeddings as emb + + if not Path(db_path).exists(): + return [] + conn = sqlite3.connect(str(db_path)) + try: + conn.row_factory = sqlite3.Row + cols = ", ".join(select_cols) + sql = ( + f"SELECT {cols}, summary_embedding, embedding_model " + f"FROM {table} " + f"WHERE summary_embedding IS NOT NULL {where_extra}" + ) + rows = conn.execute(sql, params).fetchall() + finally: + conn.close() + + out = [] + for r in rows: + if r["embedding_model"] != emb.EMBEDDING_MODEL_READ: + continue + try: + vec = json.loads(r["summary_embedding"]) + except (json.JSONDecodeError, TypeError): + continue + d = dict(r) + d["_vec"] = vec + out.append(d) + return out + + +def find_anträge_for_news( + news_url: str, + top_k: int = 5, + min_similarity: float = 0.4, + db_path: Optional[Path] = None, +) -> list[dict]: + """Pro gegebener News-URL: Top-K aehnlichste Antraege per Cosine-Match. + + Filter ``min_similarity`` haelt den Cut-Off fuer "passt einigermassen". + 0.4 ist empirisch der Punkt, ab dem qwen-v4-Embeddings semantisch + relevant matchen. + """ + from .config import settings + from . import embeddings as emb + + path = db_path or settings.db_path + if not Path(path).exists(): + return [] + + # 1. News-Vektor laden + conn = sqlite3.connect(str(path)) + try: + row = conn.execute( + """SELECT summary_embedding, embedding_model + FROM news_articles WHERE url=?""", + (news_url,), + ).fetchone() + finally: + conn.close() + if not row or not row[0] or row[1] != emb.EMBEDDING_MODEL_READ: + return [] + try: + news_vec = json.loads(row[0]) + except (json.JSONDecodeError, TypeError): + return [] + + # 2. 
_SECONDS_PER_DAY = 86400


def _resolve_db_path(db_path: Optional[Path]) -> Path:
    """Return the database path to use, falling back to ``settings.db_path``.

    The project settings are imported only when no explicit path was given,
    so callers that pass ``db_path`` never touch the configuration module.
    """
    if db_path:
        return Path(db_path)
    from .config import settings

    return Path(settings.db_path)


def _parse_news_datum(datum) -> Optional[datetime]:
    """Parse an ISO-8601 ``datum`` value into a timezone-aware datetime.

    ``Z`` is rewritten to ``+00:00`` before parsing. Values that carry no
    UTC offset (for example a bare ``YYYY-MM-DD``) are interpreted as UTC, so
    the comparison against a UTC cutoff does not depend on the server's local
    timezone.

    Returns:
        The parsed datetime, or ``None`` when ``datum`` is missing, not a
        string, or not parseable by ``datetime.fromisoformat``.
    """
    try:
        parsed = datetime.fromisoformat(datum.replace("Z", "+00:00"))
    except (ValueError, AttributeError):
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _json_or_empty(raw):
    """Decode a JSON column value; return ``[]`` for empty or invalid input."""
    if not raw:
        return []
    try:
        return json.loads(raw)
    except (ValueError, TypeError):
        # One malformed row must not take down the whole aggregation.
        return []


def find_news_for_antrag(
    drucksache: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    days_window: int = 90,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the news articles most similar to one Antrag (cosine similarity).

    The stored summary embedding of ``drucksache`` is compared against every
    row that ``_load_embeddings`` yields for ``news_articles``. News older
    than ``days_window`` days are skipped so that press releases refer to
    current coverage.

    Args:
        drucksache: Primary key of the Antrag in ``assessments``.
        top_k: Maximum number of results; values below zero are treated as 0.
        min_similarity: Rows scoring below this similarity are discarded.
        days_window: Size of the look-back window in days.
        db_path: SQLite file to read; defaults to ``settings.db_path``.

    Returns:
        Dicts with the keys ``url``, ``titel``, ``summary``, ``datum``,
        ``source``, ``ressort``, ``tags`` and ``similarity`` (rounded to three
        decimals), ordered by descending similarity. The list is empty when
        the database file does not exist, the Antrag has no stored embedding,
        its ``embedding_model`` differs from ``emb.EMBEDDING_MODEL_READ``, or
        the stored embedding is not valid JSON.
    """
    path = _resolve_db_path(db_path)
    if not path.exists():
        return []

    from . import embeddings as emb

    # 1. Load the Antrag vector.
    conn = sqlite3.connect(str(path))
    try:
        row = conn.execute(
            """SELECT summary_embedding, embedding_model
               FROM assessments WHERE drucksache=?""",
            (drucksache,),
        ).fetchone()
    finally:
        conn.close()
    if not row or not row[0] or row[1] != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        antrag_vec = json.loads(row[0])
    except (ValueError, TypeError):
        return []

    # 2. Score the news rows that fall inside the time window.
    cutoff = (
        datetime.now(timezone.utc).timestamp()
        - days_window * _SECONDS_PER_DAY
    )
    news = _load_embeddings(
        path,
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )
    scored = []
    for n in news:
        # Cheap date check first. Rows whose date cannot be parsed are
        # deliberately let through rather than dropped.
        published = _parse_news_datum(n["datum"])
        if published is not None and published.timestamp() < cutoff:
            continue
        sim = emb.cosine_similarity(antrag_vec, n["_vec"])
        if sim < min_similarity:
            continue
        scored.append({
            "url": n["url"],
            "titel": n["titel"],
            "summary": n["summary"],
            "datum": n["datum"],
            "source": n["source"],
            "ressort": n["ressort"],
            "tags": _json_or_empty(n["tags"]),
            "similarity": round(sim, 3),
        })
    scored.sort(key=lambda x: x["similarity"], reverse=True)
    # A negative top_k would otherwise slice from the end of the list.
    return scored[:max(top_k, 0)]


def aggregate_top_themen(
    days_window: int = 7,
    top_k: int = 10,
    min_similarity: float = 0.4,
    matches_per_news: int = 3,
    db_path: Optional[Path] = None,
) -> dict:
    """Return the newest news articles with their best-matching Antraege.

    This is the primary dashboard aggregation: the ``top_k`` most recent news
    rows of the last ``days_window`` days, each paired with up to
    ``matches_per_news`` assessments whose cosine similarity is at least
    ``min_similarity``.

    Args:
        days_window: Size of the look-back window in days.
        top_k: Maximum number of news buckets; values below zero count as 0.
        min_similarity: Assessments scoring below this value are discarded.
        matches_per_news: Maximum matches per bucket; values below zero
            count as 0.
        db_path: SQLite file to read; defaults to ``settings.db_path``.

    Returns:
        ``{
            "buckets": [{
                "news": {url, titel, summary, datum, source, ressort, tags},
                "matches": [{drucksache, title, bundesland, fraktionen,
                             gwoe_score, empfehlung, themen, datum,
                             similarity}, ...]
            }, ...],
            "n_total_news": int,
            "filter": {days_window, top_k, min_similarity, matches_per_news}
        }``

        ``n_total_news`` is the number of news rows returned by
        ``_load_embeddings`` *before* the date filter is applied. When the
        database file does not exist, ``buckets`` is empty and
        ``n_total_news`` is 0; ``filter`` always has the same four keys.
    """
    filter_info = {
        "days_window": days_window,
        "top_k": top_k,
        "min_similarity": min_similarity,
        "matches_per_news": matches_per_news,
    }
    path = _resolve_db_path(db_path)
    if not path.exists():
        return {"buckets": [], "n_total_news": 0, "filter": filter_info}

    from . import embeddings as emb

    cutoff = (
        datetime.now(timezone.utc).timestamp()
        - days_window * _SECONDS_PER_DAY
    )

    news_rows = _load_embeddings(
        path,
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )
    # Keep only rows with a parseable date inside the window. The timestamp
    # travels alongside the row instead of being written into the row dict.
    dated = []
    for n in news_rows:
        published = _parse_news_datum(n["datum"])
        if published is None:
            continue
        news_ts = published.timestamp()
        if news_ts < cutoff:
            continue
        dated.append((news_ts, n))
    # Newest first, then cut to top_k (clamped so a negative value yields
    # no buckets instead of slicing from the end).
    dated.sort(key=lambda pair: pair[0], reverse=True)
    fresh = [n for _, n in dated[:max(top_k, 0)]]

    # Per news row: score every assessment and keep the best matches.
    assessments = _load_embeddings(
        path,
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "themen", "datum"],
    )

    match_limit = max(matches_per_news, 0)
    buckets = []
    for n in fresh:
        scored = []
        for a in assessments:
            sim = emb.cosine_similarity(n["_vec"], a["_vec"])
            if sim < min_similarity:
                continue
            scored.append({
                "drucksache": a["drucksache"],
                "title": a["title"],
                "bundesland": a["bundesland"],
                "fraktionen": _json_or_empty(a["fraktionen"]),
                "gwoe_score": a["gwoe_score"],
                "empfehlung": a["empfehlung"],
                "themen": _json_or_empty(a["themen"]),
                "datum": a["datum"],
                "similarity": round(sim, 3),
            })
        scored.sort(key=lambda x: x["similarity"], reverse=True)
        buckets.append({
            "news": {
                "url": n["url"],
                "titel": n["titel"],
                "summary": n["summary"],
                "datum": n["datum"],
                "source": n["source"],
                "ressort": n["ressort"],
                "tags": _json_or_empty(n["tags"]),
            },
            "matches": scored[:match_limit],
        })

    return {
        "buckets": buckets,
        "n_total_news": len(news_rows),
        "filter": filter_info,
    }


def aggregate_themen_zeitreihe(
    days_window: int = 30,
    db_path: Optional[Path] = None,
) -> dict:
    """Count news articles per (day, source) over the last ``days_window`` days.

    Feeds the stacked-area chart. No Antrag matching happens here; the result
    only shows how active each source was.

    Args:
        days_window: Size of the look-back window in days.
        db_path: SQLite file to read; defaults to ``settings.db_path``.

    Returns:
        ``{"buckets": [day, ...], "sources": [source, ...],
        "series": {source: [count per day, ...]}}``. ``buckets`` holds the
        sorted ``YYYY-MM-DD`` days that have at least one article, and every
        list in ``series`` has the same length and order as ``buckets``. The
        day is the calendar date of the article in its own UTC offset. All
        three values are empty when the database file does not exist.
    """
    path = _resolve_db_path(db_path)
    if not path.exists():
        return {"buckets": [], "sources": [], "series": {}}

    cutoff_ts = (
        datetime.now(timezone.utc).timestamp()
        - days_window * _SECONDS_PER_DAY
    )
    conn = sqlite3.connect(str(path))
    try:
        rows = conn.execute(
            "SELECT datum, source FROM news_articles"
        ).fetchall()
    finally:
        conn.close()

    counts: defaultdict[tuple[str, str], int] = defaultdict(int)
    for datum, source in rows:
        published = _parse_news_datum(datum)
        # Rows without a usable date cannot be placed on the time axis.
        if published is None or published.timestamp() < cutoff_ts:
            continue
        counts[(published.date().isoformat(), source)] += 1

    days_sorted = sorted({day for day, _ in counts})
    sources_sorted = sorted({source for _, source in counts})
    series = {
        # .get() keeps the defaultdict from growing while we read zeros.
        s: [counts.get((d, s), 0) for d in days_sorted]
        for s in sources_sorted
    }
    return {
        "buckets": days_sorted,
        "sources": sources_sorted,
        "series": series,
    }
+# +# Holt taeglich Headlines von Tagesschau-API + Bundestag-RSS, persistiert +# sie in news_articles und embeddet die neuen via Qwen-Embeddings-API. +# Idempotent (URL-PK), wiederhol-bar bei Fehlern. +# +# Wird via Cron taeglich morgens aufgerufen, vor auto-ingest-protocols.sh. +# +# Usage: +# auto-fetch-news.sh [CONTAINER] +set -euo pipefail + +CONTAINER="${1:-gwoe-antragspruefer}" + +echo "=== auto-fetch-news $(date -Iseconds) ===" + +docker exec -i "$CONTAINER" python <<'EOF' +from app.news_aggregator import run_aggregator +stats = run_aggregator() +print(f"News-Aggregator: inserted={stats['inserted']} updated={stats['updated']} embedded={stats['embedded']}") +EOF + +echo "=== auto-fetch-news done $(date -Iseconds) ===" diff --git a/tests/test_news_aggregator.py b/tests/test_news_aggregator.py new file mode 100644 index 0000000..84c41bc --- /dev/null +++ b/tests/test_news_aggregator.py @@ -0,0 +1,262 @@ +"""Tests fuer app.news_aggregator (#170 Phase 1). + +Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne +Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt). +""" +from __future__ import annotations + +import json +import sqlite3 +from pathlib import Path +from unittest.mock import patch + +import pytest + +from app.news_aggregator import ( + _parse_rss_date, + _strip_html, + fetch_rss, + fetch_tagesschau, + upsert_articles, +) + + +# ───────────────────────────────────────────────────────────────────────────── +# Helper +# ───────────────────────────────────────────────────────────────────────────── + + +class TestStripHtml: + def test_removes_tags(self): + assert _strip_html("

Hello world

") == "Hello world" + + def test_decodes_cdata(self): + assert "Test" in _strip_html("") + + def test_decodes_entities(self): + assert _strip_html("a & b") == "a & b" + + def test_collapses_whitespace(self): + assert _strip_html("

a b\n c

") == "a b c" + + def test_empty(self): + assert _strip_html("") == "" + + +class TestParseRssDate: + def test_rfc822_to_iso(self): + result = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT") + assert result.startswith("2026-04-28") + + def test_invalid_returns_empty(self): + assert _parse_rss_date("garbage") == "" + assert _parse_rss_date("") == "" + + +# ───────────────────────────────────────────────────────────────────────────── +# fetch_tagesschau (mocked HTTP) +# ───────────────────────────────────────────────────────────────────────────── + + +SAMPLE_TAGESSCHAU_JSON = json.dumps({ + "news": [ + { + "title": "Bundestag berät über Wohnungsbau", + "firstSentence": "Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.", + "shareURL": "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html", + "date": "2026-04-28T10:00:00.000+02:00", + "ressort": "inland", + "tags": [{"tag": "Wohnungsbau"}, {"tag": "Bundestag"}], + }, + { + "title": "EU-Kommission stellt Klimapaket vor", + "firstSentence": "Die EU plant ehrgeizige Klimaziele.", + "shareURL": "https://www.tagesschau.de/ausland/eu-klima-100.html", + "date": "2026-04-28T11:00:00.000+02:00", + "ressort": "ausland", + "tags": [{"tag": "Klima"}, {"tag": "EU"}], + }, + { + # Dieser hat keinen shareURL — sollte uebersprungen werden + "title": "Kein Link", + "firstSentence": "Skip mich", + }, + ], +}).encode("utf-8") + + +class TestFetchTagesschau: + def test_parses_news_array(self): + with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON): + articles = fetch_tagesschau(ressorts=["inland"]) + # Deduplication ueber URL → 2 unique + assert len(articles) == 2 + first = articles[0] + assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html" + assert first["titel"] == "Bundestag berät über Wohnungsbau" + assert "Wohnungsbau" in first["summary"] + assert first["source"] == "tagesschau" + assert first["ressort"] == "inland" + assert "Wohnungsbau" in 
first["tags"] + + def test_skips_items_without_link(self): + with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON): + articles = fetch_tagesschau(ressorts=["inland"]) + assert all(a["url"] for a in articles) + + def test_returns_empty_on_http_error(self): + with patch("app.news_aggregator._http_get", return_value=None): + articles = fetch_tagesschau(ressorts=["inland"]) + assert articles == [] + + def test_dedup_across_ressorts(self): + """Wenn dasselbe Item in zwei Ressorts erscheint, wird es nur 1× geliefert.""" + with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON): + articles = fetch_tagesschau(ressorts=["inland", "ausland"]) + urls = [a["url"] for a in articles] + assert len(urls) == len(set(urls)) + + +# ───────────────────────────────────────────────────────────────────────────── +# fetch_rss (mocked HTTP) +# ───────────────────────────────────────────────────────────────────────────── + + +SAMPLE_RSS = """ +BT Aktuell + +<![CDATA[Bundestag berät Antrag zum Wohnungsbau]]> +https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388 + +Tue, 28 Apr 2026 10:45:12 GMT + + +Antrag zur Klimapolitik +https://www.bundestag.de/klima +Klimaschutz im Bundestag +Mon, 27 Apr 2026 10:00:00 GMT + +""".encode("utf-8") + + +class TestFetchRss: + def test_parses_rss_items(self): + with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS): + articles = fetch_rss("bundestag-aktuell", "https://example.com/rss") + assert len(articles) == 2 + first = articles[0] + assert "Wohnungsbau" in first["titel"] + assert first["url"].startswith("https://www.bundestag.de") + assert first["source"] == "bundestag-aktuell" + assert first["datum"].startswith("2026-04-28") + assert "Bundestag" in first["summary"] + + def test_strips_cdata_and_html(self): + with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS): + articles = fetch_rss("bundestag-aktuell", "https://example.com/rss") + for a in articles: 
+ assert " + Nur Titel + nur-link + """ + with patch("app.news_aggregator._http_get", return_value=bad): + articles = fetch_rss("x", "https://example.com/rss") + assert articles == [] + + +# ───────────────────────────────────────────────────────────────────────────── +# upsert_articles +# ───────────────────────────────────────────────────────────────────────────── + + +@pytest.fixture +def empty_db(tmp_path: Path) -> Path: + db = tmp_path / "test_news.db" + conn = sqlite3.connect(str(db)) + conn.execute(""" + CREATE TABLE news_articles ( + url TEXT PRIMARY KEY, + titel TEXT NOT NULL, + summary TEXT, + datum TEXT NOT NULL, + source TEXT NOT NULL, + ressort TEXT, + tags TEXT, + summary_embedding BLOB, + embedding_model TEXT, + fetched_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + """) + conn.commit() + conn.close() + return db + + +SAMPLE_ARTICLES = [ + { + "url": "https://example.com/a", + "titel": "Wohnungsbau", + "summary": "Heute im Bundestag", + "datum": "2026-04-28", + "source": "tagesschau", + "ressort": "inland", + "tags": ["Wohnungsbau"], + }, + { + "url": "https://example.com/b", + "titel": "Klima", + "summary": "EU plant Klimaziele", + "datum": "2026-04-28", + "source": "tagesschau", + "ressort": "ausland", + "tags": ["Klima", "EU"], + }, +] + + +class TestUpsertArticles: + def test_inserts_new_articles(self, empty_db): + stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False) + assert stats["inserted"] == 2 + assert stats["updated"] == 0 + + def test_updates_existing_articles(self, empty_db): + upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False) + # Re-run with same URLs but different titel + modified = [{**a, "titel": a["titel"] + " (neu)"} for a in SAMPLE_ARTICLES] + stats = upsert_articles(modified, db_path=empty_db, embed=False) + assert stats["updated"] == 2 + assert stats["inserted"] == 0 + # Verify the title was updated + conn = sqlite3.connect(str(empty_db)) + row = conn.execute( + "SELECT titel FROM news_articles 
WHERE url=?", + (SAMPLE_ARTICLES[0]["url"],), + ).fetchone() + conn.close() + assert row[0].endswith("(neu)") + + def test_persists_tags_as_json(self, empty_db): + upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False) + conn = sqlite3.connect(str(empty_db)) + row = conn.execute( + "SELECT tags FROM news_articles WHERE url=?", + (SAMPLE_ARTICLES[0]["url"],), + ).fetchone() + conn.close() + tags = json.loads(row[0]) + assert tags == ["Wohnungsbau"] + + def test_missing_db_returns_zeros(self, tmp_path): + stats = upsert_articles(SAMPLE_ARTICLES, + db_path=tmp_path / "missing.db", embed=False) + assert stats == {"inserted": 0, "updated": 0, "embedded": 0} diff --git a/tests/test_presse_generator.py b/tests/test_presse_generator.py new file mode 100644 index 0000000..4c20af9 --- /dev/null +++ b/tests/test_presse_generator.py @@ -0,0 +1,224 @@ +"""Tests fuer app.presse_generator (#170 Phase 4).""" +from __future__ import annotations + +import json +import sqlite3 +from pathlib import Path +from unittest.mock import patch + +import pytest + +from app.presse_generator import ( + _build_user_prompt, + generate_draft, + get_draft, + list_drafts, +) + + +# ───────────────────────────────────────────────────────────────────────────── +# Fixture: DB mit Antrag + News +# ───────────────────────────────────────────────────────────────────────────── + + +@pytest.fixture +def db_with_antrag_and_news(tmp_path: Path) -> Path: + db = tmp_path / "test_presse.db" + conn = sqlite3.connect(str(db)) + conn.execute(""" + CREATE TABLE assessments ( + drucksache TEXT PRIMARY KEY, + title TEXT, + bundesland TEXT, + antrag_zusammenfassung TEXT, + gwoe_score REAL, + gwoe_begruendung TEXT, + empfehlung TEXT + ) + """) + conn.execute(""" + CREATE TABLE news_articles ( + url TEXT PRIMARY KEY, + titel TEXT NOT NULL, + summary TEXT + ) + """) + conn.execute(""" + CREATE TABLE presse_drafts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + drucksache TEXT NOT NULL, + bundesland TEXT NOT NULL, + 
news_url TEXT NOT NULL, + news_titel TEXT NOT NULL, + titel TEXT NOT NULL, + body TEXT NOT NULL, + model TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + """) + conn.execute( + """INSERT INTO assessments + (drucksache, title, bundesland, antrag_zusammenfassung, + gwoe_score, gwoe_begruendung, empfehlung) + VALUES (?, ?, ?, ?, ?, ?, ?)""", + ( + "18/A", "Wohnungsbau-Reform-Antrag", "NRW", + "Antrag fuer mehr sozialen Wohnungsbau", + 8.5, "Stark gemeinwohlorientiert", + "Uneingeschränkt unterstützen", + ), + ) + conn.execute( + "INSERT INTO news_articles (url, titel, summary) VALUES (?, ?, ?)", + ( + "https://example.com/wohnen", + "Wohnungsmarkt im Umbruch", + "Die Mietpreise steigen weiter, der Bundestag berät heute", + ), + ) + conn.commit() + conn.close() + return db + + +# ───────────────────────────────────────────────────────────────────────────── +# _build_user_prompt +# ───────────────────────────────────────────────────────────────────────────── + + +class TestBuildUserPrompt: + def test_includes_drucksache(self): + prompt = _build_user_prompt( + drucksache="18/A", bundesland="NRW", + antrag_titel="Test", antrag_zusammenfassung="Summary", + gwoe_score=7.5, gwoe_begruendung="ok", + empfehlung="Unterstützen", + news_titel="News", news_summary="Lead", + news_url="https://example.com", + ) + assert "18/A" in prompt + assert "NRW" in prompt + assert "7.5" in prompt + assert "News" in prompt + + def test_handles_missing_zusammenfassung(self): + prompt = _build_user_prompt( + drucksache="x", bundesland="x", antrag_titel="x", + antrag_zusammenfassung="", gwoe_score=5.0, + gwoe_begruendung="", empfehlung="", + news_titel="x", news_summary="", news_url="", + ) + assert "(keine vorhanden)" in prompt + + +# ───────────────────────────────────────────────────────────────────────────── +# generate_draft (mocked QwenBewerter) +# ───────────────────────────────────────────────────────────────────────────── + + +class FakeBewerter: + """Mock fuer 
QwenBewerter, gibt fixe LLM-Response zurueck.""" + + def __init__(self, response: dict): + self._response = response + self.last_request = None + + async def bewerte(self, request): + self.last_request = request + return self._response + + +@pytest.mark.asyncio +async def test_generate_draft_persists_record(db_with_antrag_and_news, monkeypatch): + bewerter = FakeBewerter({ + "titel": "Wohnungsbau jetzt", + "body": "Der vorliegende Antrag der Drucksache 18/A ..." + * 10, # langer Body + }) + # Patch settings.dashscope_model fuer den INSERT + from app.config import settings as real_settings + monkeypatch.setattr(real_settings, "llm_model_default", "qwen-test") + result = await generate_draft( + drucksache="18/A", + news_url="https://example.com/wohnen", + db_path=db_with_antrag_and_news, + bewerter=bewerter, + ) + + assert result["id"] == 1 + assert result["drucksache"] == "18/A" + assert result["bundesland"] == "NRW" + assert result["news_titel"] == "Wohnungsmarkt im Umbruch" + assert result["titel"] == "Wohnungsbau jetzt" + assert "18/A" in result["body"] + + +@pytest.mark.asyncio +async def test_generate_draft_unknown_drucksache(db_with_antrag_and_news): + bewerter = FakeBewerter({"titel": "x", "body": "y"}) + with pytest.raises(ValueError, match="Drucksache"): + await generate_draft( + drucksache="99/MISSING", + news_url="https://example.com/wohnen", + db_path=db_with_antrag_and_news, + bewerter=bewerter, + ) + + +@pytest.mark.asyncio +async def test_generate_draft_unknown_news(db_with_antrag_and_news): + bewerter = FakeBewerter({"titel": "x", "body": "y"}) + with pytest.raises(ValueError, match="News-URL"): + await generate_draft( + drucksache="18/A", + news_url="https://example.com/missing", + db_path=db_with_antrag_and_news, + bewerter=bewerter, + ) + + +@pytest.mark.asyncio +async def test_generate_draft_empty_response_raises(db_with_antrag_and_news, monkeypatch): + bewerter = FakeBewerter({"titel": "", "body": ""}) + from app.config import settings as 
real_settings + monkeypatch.setattr(real_settings, "llm_model_default", "qwen-test") + with pytest.raises(ValueError, match="unvollständig"): + await generate_draft( + drucksache="18/A", + news_url="https://example.com/wohnen", + db_path=db_with_antrag_and_news, + bewerter=bewerter, + ) + + +# ───────────────────────────────────────────────────────────────────────────── +# list_drafts + get_draft +# ───────────────────────────────────────────────────────────────────────────── + + +class TestListAndGetDrafts: + def test_empty(self, db_with_antrag_and_news): + assert list_drafts(db_path=db_with_antrag_and_news) == [] + assert get_draft(99, db_path=db_with_antrag_and_news) is None + + def test_after_insert(self, db_with_antrag_and_news): + # Direct DB-Insert (test setup) + conn = sqlite3.connect(str(db_with_antrag_and_news)) + conn.execute( + """INSERT INTO presse_drafts + (drucksache, bundesland, news_url, news_titel, titel, body, model) + VALUES (?, ?, ?, ?, ?, ?, ?)""", + ("18/A", "NRW", "https://x.de/n", "News-Titel", + "PM-Titel", "PM-Body", "test-model"), + ) + conn.commit() + conn.close() + + drafts = list_drafts(db_path=db_with_antrag_and_news) + assert len(drafts) == 1 + assert drafts[0]["drucksache"] == "18/A" + assert drafts[0]["titel"] == "PM-Titel" + + d = get_draft(drafts[0]["id"], db_path=db_with_antrag_and_news) + assert d is not None + assert d["body"] == "PM-Body" diff --git a/tests/test_themen_matching.py b/tests/test_themen_matching.py new file mode 100644 index 0000000..6a64c41 --- /dev/null +++ b/tests/test_themen_matching.py @@ -0,0 +1,297 @@ +"""Tests fuer app.themen_matching (#170 Phase 2).""" +from __future__ import annotations + +import json +import sqlite3 +from datetime import datetime, timezone, timedelta +from pathlib import Path +from unittest.mock import patch + +import pytest + +from app.themen_matching import ( + aggregate_themen_zeitreihe, + aggregate_top_themen, + find_anträge_for_news, + find_news_for_antrag, +) + + +# 
# ─────────────────────────────────────────────────────────────────────────────
# Fixture: database with news, assessments and embeddings
# ─────────────────────────────────────────────────────────────────────────────


def _vec(dim: int = 8, val: float = 0.1) -> bytes:
    """Return a constant vector of length ``dim`` serialized as JSON bytes."""
    return json.dumps([val for _ in range(dim)]).encode()


def _vec_from(values: list[float]) -> bytes:
    """Serialize ``values`` as JSON bytes for the ``summary_embedding`` column."""
    serialized = json.dumps(values)
    return serialized.encode()


@pytest.fixture
def populated_db(tmp_path: Path) -> Path:
    """Create a SQLite file with three news rows and three assessments.

    The embeddings are hand-made 8-dimensional vectors: n1 and 18/A point
    along the first axis, n2 and 18/B along the second, 18/C along the fifth.
    n3 is dated 200 days in the past.
    """
    db_file = tmp_path / "test_match.db"
    now_utc = datetime.now(timezone.utc)
    today = now_utc.isoformat()
    old = (now_utc - timedelta(days=200)).isoformat()
    now_iso = datetime.now().isoformat()

    news_rows = [
        # Housing news (vector along the first axis)
        ("https://example.com/n1", "Wohnungsbau-Reform",
         "Bundestag berät Wohnungsbau", today, "tagesschau", "inland",
         '["Wohnungsbau"]',
         _vec_from([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        # Climate news (vector along the second axis)
        ("https://example.com/n2", "Klimaschutzgesetz",
         "EU plant Klimaziele", today, "tagesschau", "ausland",
         '["Klima"]',
         _vec_from([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        # Old news, expected to fall outside the time window
        ("https://example.com/n3", "Alte News", "", old, "tagesschau",
         "inland", '[]',
         _vec_from([0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
    ]

    # 18/A lines up with the housing news, 18/B with the climate news,
    # 18/C is orthogonal to both and should match nothing.
    assessment_specs = [
        ("18/A", "Wohnungsbau-Antrag", '["GRÜNE"]', "2026-04-15", "NRW",
         8.0, "Uneingeschränkt unterstützen",
         _vec_from([0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        ("18/B", "Klima-Antrag", '["SPD"]', "2026-04-16", "NRW",
         7.0, "Unterstützen mit Änderungen",
         _vec_from([0.0, 0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0])),
        ("18/C", "Sonstiges", '["CDU"]', "2026-04-17", "NRW",
         5.0, "Überarbeiten",
         _vec_from([0.0, 0.0, 0.0, 0.0, 0.95, 0.0, 0.0, 0.0])),
    ]
    assessment_rows = [
        (ds, title, fraktionen, datum, land, score, empfehlung,
         now_iso, now_iso, vec)
        for ds, title, fraktionen, datum, land, score, empfehlung, vec
        in assessment_specs
    ]

    conn = sqlite3.connect(str(db_file))
    conn.execute("""
        CREATE TABLE news_articles (
            url TEXT PRIMARY KEY,
            titel TEXT NOT NULL,
            summary TEXT,
            datum TEXT NOT NULL,
            source TEXT NOT NULL,
            ressort TEXT,
            tags TEXT,
            summary_embedding BLOB,
            embedding_model TEXT,
            fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
    """)
    conn.execute("""
        CREATE TABLE assessments (
            drucksache TEXT PRIMARY KEY,
            title TEXT,
            fraktionen TEXT,
            datum TEXT,
            link TEXT,
            bundesland TEXT,
            gwoe_score REAL,
            gwoe_begruendung TEXT,
            gwoe_matrix TEXT,
            gwoe_schwerpunkt TEXT,
            wahlprogramm_scores TEXT,
            verbesserungen TEXT,
            staerken TEXT,
            schwaechen TEXT,
            empfehlung TEXT,
            empfehlung_symbol TEXT,
            verbesserungspotenzial TEXT,
            themen TEXT,
            antrag_zusammenfassung TEXT,
            antrag_kernpunkte TEXT,
            source TEXT,
            model TEXT,
            created_at TEXT,
            updated_at TEXT,
            summary_embedding BLOB,
            embedding_model TEXT
        )
    """)
    conn.executemany(
        """INSERT INTO news_articles
           (url, titel, summary, datum, source, ressort, tags,
            summary_embedding, embedding_model)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'qwen-embedding-v4')""",
        news_rows,
    )
    conn.executemany(
        """INSERT INTO assessments
           (drucksache, title, fraktionen, datum, bundesland, gwoe_score,
            empfehlung, themen, source, model, created_at, updated_at,
            summary_embedding, embedding_model)
           VALUES (?, ?, ?, ?, ?, ?, ?, '[]', 'test', 'test', ?, ?,
                   ?, 'qwen-embedding-v4')""",
        assessment_rows,
    )
    conn.commit()
    conn.close()
    return db_file


@pytest.fixture(autouse=True)
def mock_embedding_model():
    """Pin ``EMBEDDING_MODEL_READ`` to ``qwen-embedding-v4`` for every test."""
    patcher = patch("app.embeddings.EMBEDDING_MODEL_READ", "qwen-embedding-v4")
    patcher.start()
    try:
        yield
    finally:
        patcher.stop()


# ─────────────────────────────────────────────────────────────────────────────
# find_anträge_for_news
# ─────────────────────────────────────────────────────────────────────────────


class TestFindAnträgeForNews:
    def test_wohnungsbau_news_matches_wohnungsbau_antrag(self, populated_db):
        matches = find_anträge_for_news(
            "https://example.com/n1",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(matches) >= 1
        # The best match should be 18/A
        best = matches[0]
        assert best["drucksache"] == "18/A"
        assert best["similarity"] > 0.9

    def test_klima_news_matches_klima_antrag(self, populated_db):
        matches = find_anträge_for_news(
            "https://example.com/n2",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(matches) >= 1
        assert matches[0]["drucksache"] == "18/B"

    def test_min_similarity_filters_orthogonal(self, populated_db):
        """With a high min_similarity cutoff no orthogonal Antrag may appear."""
        matches = find_anträge_for_news(
            "https://example.com/n1",
            db_path=populated_db,
            min_similarity=0.9,
        )
        # 18/C is orthogonal to everything
        assert all(m["drucksache"] != "18/C" for m in matches)

    def test_unknown_news_returns_empty(self, populated_db):
        matches = find_anträge_for_news(
            "https://example.com/missing", db_path=populated_db,
        )
        assert matches == []

    def test_empty_db(self, tmp_path):
        matches = find_anträge_for_news(
            "x", db_path=tmp_path / "missing.db",
        )
        assert matches == []


# ─────────────────────────────────────────────────────────────────────────────
# find_news_for_antrag
# ─────────────────────────────────────────────────────────────────────────────


class TestFindNewsForAntrag:
    def test_wohnungsbau_antrag_matches_wohnungsbau_news(self, populated_db):
        hits = find_news_for_antrag(
            "18/A", db_path=populated_db, min_similarity=0.5,
        )
        assert len(hits) >= 1
        assert hits[0]["url"] == "https://example.com/n1"

    def test_old_news_filtered_out(self, populated_db):
        """The 200-day-old article must not show up in a 90-day window."""
        hits = find_news_for_antrag(
            "18/A",
            db_path=populated_db,
            min_similarity=0.0,
            days_window=90,
        )
        assert all(h["url"] != "https://example.com/n3" for h in hits)

    def test_top_k_limits(self, populated_db):
        """top_k=1 returns at most the single best match."""
        hits = find_news_for_antrag(
            "18/A",
            db_path=populated_db,
            min_similarity=0.0,
            top_k=1,
        )
        assert len(hits) <= 1

    def test_unknown_antrag(self, populated_db):
        hits = find_news_for_antrag("99/Missing", db_path=populated_db)
        assert hits == []


# ─────────────────────────────────────────────────────────────────────────────
# aggregate_top_themen
# ─────────────────────────────────────────────────────────────────────────────


class TestAggregateTopThemen:
    def test_returns_buckets(self, populated_db):
        payload = aggregate_top_themen(
            db_path=populated_db, min_similarity=0.5,
        )
        # Two news articles are dated today, both with a match
        assert len(payload["buckets"]) == 2
        assert "n_total_news" in payload

    def test_each_bucket_has_news_and_matches(self, populated_db):
        payload = aggregate_top_themen(
            db_path=populated_db, min_similarity=0.5,
        )
        for bucket in payload["buckets"]:
            assert "news" in bucket
            assert "matches" in bucket
            article = bucket["news"]
            assert "url" in article
            assert "titel" in article

    def test_days_window_filter(self, populated_db):
        """A small window keeps only fresh news; the old article is dropped."""
        payload = aggregate_top_themen(
            db_path=populated_db, days_window=7, min_similarity=0.5,
        )
        for bucket in payload["buckets"]:
            assert bucket["news"]["url"] != "https://example.com/n3"

    def test_min_similarity_filter(self, populated_db):
        """With a high min_similarity the cross matches disappear."""
        payload = aggregate_top_themen(
            db_path=populated_db, min_similarity=0.99,
        )
        # Only exact matches should survive
        surviving = [
            match
            for bucket in payload["buckets"]
            for match in bucket["matches"]
        ]
        for match in surviving:
            assert match["similarity"] > 0.99


# ─────────────────────────────────────────────────────────────────────────────
# aggregate_themen_zeitreihe
# ─────────────────────────────────────────────────────────────────────────────


class TestAggregateZeitreihe:
    def test_structure(self, populated_db):
        payload = aggregate_themen_zeitreihe(
            db_path=populated_db, days_window=7,
        )
        for key in ("buckets", "sources", "series"):
            assert key in payload

    def test_only_recent(self, populated_db):
        """With days_window=7 the old article must not be counted."""
        payload = aggregate_themen_zeitreihe(
            db_path=populated_db, days_window=7,
        )
        # Only today's news (n1, n2); n3 is 200 days old
        total = sum(
            count
            for per_source in payload["series"].values()
            for count in per_source
        )
        assert total == 2

    def test_series_aligned(self, populated_db):
        """Each source's series must be exactly as long as buckets."""
        payload = aggregate_themen_zeitreihe(
            db_path=populated_db, days_window=7,
        )
        expected_len = len(payload["buckets"])
        for source in payload["sources"]:
            assert len(payload["series"][source]) == expected_len