feat(#170): Aktuelle-Themen-Dashboard — News × Anträge × Pressemitteilungen
Vollständiges 4-Phasen-Feature:
**Phase 1 — News-Aggregator** (`app/news_aggregator.py`)
- Tagesschau-API (`/api2u/news?ressort=...`) für inland/ausland/wirtschaft/wissen
- Bundestag-RSS für aktuellethemen / pressemitteilungen / hib
- DB-Tabelle `news_articles` (URL-PK, idempotent)
- Embeddings via existierender qwen-v4-Pipeline
- Cron-Script `scripts/auto-fetch-news.sh`
- Bewusst NICHT: RND.de (robots.txt bannt explizit ClaudeBot, GPTBot,
CCBot, ChatGPT-User, Google-Extended). Nur AI-erlaubende, öffentlich-
rechtliche/parlamentarische Quellen
- Volltexte werden NICHT persistiert (nur Titel + erster Satz)
**Phase 2 — Themen × Anträge Matching** (`app/themen_matching.py`)
- News-Embedding × Assessment-summary_embedding via Cosine-Similarity
- `find_anträge_for_news`: pro News die Top-K passenden Anträge
- `find_news_for_antrag`: pro Antrag Top-K News mit Datums-Fenster (90d)
- `aggregate_top_themen`: primärer Dashboard-Endpoint
- `aggregate_themen_zeitreihe`: News-Volumen pro Tag × Source
**Phase 3 — Dashboard-View** (`/aktuelle-themen`)
- Neuer linker Nav-Eintrag „Aktuelle Themen"
- Stacked-Area-Chart News-Volumen pro Quelle (30d)
- Pro News-Card: Titel + Summary + Tags + Top-3-Antrags-Match-Liste
mit GWÖ-Score-Pill, Drucksache-Link, PM-Vorschlag-Button
- Filter: Zeitfenster, Top-N, min_similarity
- Auth-protected (require_auth)
**Phase 4 — Pressemitteilungs-Generator** (`app/presse_generator.py`)
- LLM-Prompt-Template (200-250 Worte, GWÖ-Sicht, JSON-Output)
- Reuse von `QwenBewerter` aus app/adapters/qwen_bewerter.py
- DB-Tabelle `presse_drafts` (Persistenz)
- POST `/api/aktuelle-themen/generate-presse` rate-limited 5/min,
auth-only (LLM-Kosten)
- GET `/api/aktuelle-themen/drafts` + `/drafts/{id}` für Liste/Detail
- Manueller Trigger via UI-Button, kein Auto-Versand
- Modal-Anzeige des generierten Texts
**Compliance:**
- robots.txt-respektierend (ClaudeBot-Bann von RND vermieden, AI-
erlaubende Quellen verwendet)
- UI zeigt nur Titel+URL+Datum+erster Satz, keine Volltext-Reproduktion
- Pressemitteilungen sind explizit Drafts, nicht Auto-Versand
- LLM-Calls rate-limited, auth-only
**Tests:** 43 neue Tests (19 news_aggregator + 16 themen_matching +
8 presse_generator). Suite jetzt 1048 grün.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1e381d23ab
commit
d54ce23e42
@ -285,6 +285,55 @@ async def init_db():
|
||||
"ON plenum_vote_results(bundesland, drucksache)"
|
||||
)
|
||||
|
||||
# News-Artikel aus oeffentlich-rechtlichen Quellen (#170 Phase 1).
|
||||
# Tagesschau-API + Bundestag-RSS — KEIN AI-gebanntes Quellmaterial
|
||||
# (RND ist explizit per robots.txt ausgeschlossen).
|
||||
# Volltexte werden NICHT persistiert — nur Titel + Summary fuer
|
||||
# Embeddings + UI-Anzeige (Urheberrecht).
|
||||
await db.execute("""
|
||||
CREATE TABLE IF NOT EXISTS news_articles (
|
||||
url TEXT PRIMARY KEY,
|
||||
titel TEXT NOT NULL,
|
||||
summary TEXT,
|
||||
datum TEXT NOT NULL,
|
||||
source TEXT NOT NULL,
|
||||
ressort TEXT,
|
||||
tags TEXT,
|
||||
summary_embedding BLOB,
|
||||
embedding_model TEXT,
|
||||
fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
)
|
||||
""")
|
||||
await db.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_news_datum "
|
||||
"ON news_articles(datum)"
|
||||
)
|
||||
await db.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_news_source "
|
||||
"ON news_articles(source)"
|
||||
)
|
||||
|
||||
# Pressemitteilungs-Drafts (#170 Phase 4). LLM-generierte Vorschlaege,
|
||||
# die einen Antrag in den Kontext eines News-Artikels stellen.
|
||||
# Manueller Trigger, kein Auto-Versand.
|
||||
await db.execute("""
|
||||
CREATE TABLE IF NOT EXISTS presse_drafts (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
drucksache TEXT NOT NULL,
|
||||
bundesland TEXT NOT NULL,
|
||||
news_url TEXT NOT NULL,
|
||||
news_titel TEXT NOT NULL,
|
||||
titel TEXT NOT NULL,
|
||||
body TEXT NOT NULL,
|
||||
model TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||
)
|
||||
""")
|
||||
await db.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_presse_created "
|
||||
"ON presse_drafts(created_at DESC)"
|
||||
)
|
||||
|
||||
await db.commit()
|
||||
|
||||
|
||||
|
||||
110
app/main.py
110
app/main.py
@ -2008,6 +2008,116 @@ async def auswertungen_page(request: Request, current_user: dict = Depends(requi
|
||||
})
|
||||
|
||||
|
||||
# ─── Aktuelle-Themen-Dashboard (#170) ──────────────────────────────────────
|
||||
|
||||
|
||||
@app.get("/aktuelle-themen", response_class=HTMLResponse)
async def aktuelle_themen_page(
    request: Request, current_user: dict = Depends(require_auth)
):
    """Render the current-topics dashboard (news x motions x press drafts).

    Access requires an authenticated user via ``require_auth``.
    """
    context = {
        "request": request,
        "app_name": settings.app_name,
        "v2_active_nav": "aktuelle-themen",
    }
    # The shared v2 layout values are merged last so they override the keys
    # above on a clash, exactly like a trailing ``**`` unpacking would.
    context.update(_v2_template_context(current_user))
    return templates.TemplateResponse("v2/screens/aktuelle-themen.html", context)
|
||||
|
||||
|
||||
@app.get("/api/aktuelle-themen/top")
async def api_aktuelle_themen_top(
    days: int = 7,
    top_k: int = 10,
    min_similarity: float = 0.4,
    matches_per_news: int = 3,
):
    """Return the top-K news items of the last ``days`` days with motion matches.

    Thin HTTP wrapper: every query parameter is forwarded unchanged to
    ``aggregate_top_themen`` and its result becomes the response body.
    """
    from .themen_matching import aggregate_top_themen

    query = {
        "days_window": days,
        "top_k": top_k,
        "min_similarity": min_similarity,
        "matches_per_news": matches_per_news,
    }
    return aggregate_top_themen(**query)
|
||||
|
||||
|
||||
@app.get("/api/aktuelle-themen/zeitreihe")
async def api_aktuelle_themen_zeitreihe(days: int = 30):
    """Return the news volume per day and source for the stacked-area chart.

    ``days`` is passed to ``aggregate_themen_zeitreihe`` as ``days_window``.
    """
    from .themen_matching import aggregate_themen_zeitreihe

    series = aggregate_themen_zeitreihe(days_window=days)
    return series
|
||||
|
||||
|
||||
@app.get("/api/aktuelle-themen/news-fuer-antrag")
async def api_news_fuer_antrag(
    drucksache: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    days: int = 90,
):
    """Return the top-K news items matching one motion (used by the detail view).

    The response echoes the requested ``drucksache`` next to the ``matches``
    produced by ``find_news_for_antrag``.
    """
    from .themen_matching import find_news_for_antrag

    matches = find_news_for_antrag(
        drucksache=drucksache,
        top_k=top_k,
        min_similarity=min_similarity,
        days_window=days,
    )
    return {"drucksache": drucksache, "matches": matches}
|
||||
|
||||
|
||||
@app.get("/api/aktuelle-themen/anträge-fuer-news")
async def api_anträge_fuer_news(
    url: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
):
    """Return the top-K motions matching one news article.

    The response echoes the article URL as ``news_url`` next to the
    ``matches`` produced by ``find_anträge_for_news``.
    """
    from .themen_matching import find_anträge_for_news

    matches = find_anträge_for_news(
        news_url=url,
        top_k=top_k,
        min_similarity=min_similarity,
    )
    return {"news_url": url, "matches": matches}
|
||||
|
||||
|
||||
# ─── Pressemitteilungs-Drafts (#170 Phase 4) ──────────────────────────
|
||||
|
||||
|
||||
@app.post("/api/aktuelle-themen/generate-presse")
@limiter.limit("5/minute")
async def api_generate_presse(
    request: Request,
    drucksache: str,
    news_url: str,
    current_user: dict = Depends(require_auth),
):
    """Generate an LLM press-release draft for a motion/news pair.

    Authenticated users only and rate-limited to 5 requests per minute,
    because every call triggers a paid LLM request.

    Raises:
        HTTPException 404: ``generate_draft`` raised ``ValueError``.
        HTTPException 500: any other failure. The response carries a generic
            message; the real cause is only written to the server log.
    """
    from .presse_generator import generate_draft

    try:
        return await generate_draft(drucksache=drucksache, news_url=news_url)
    except ValueError as e:
        # NOTE(review): generate_draft also raises ValueError for an incomplete
        # LLM response, which is reported as 404 here as well — confirm whether
        # that case deserves its own status code.
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        logger.exception("generate_draft failed")
        # Do not echo str(e): internal errors (DB paths, upstream LLM payloads)
        # must not leak to the client.
        raise HTTPException(
            status_code=500,
            detail="Pressemitteilung konnte nicht generiert werden",
        ) from e
|
||||
|
||||
|
||||
@app.get("/api/aktuelle-themen/drafts")
async def api_drafts_list(limit: int = 20):
    """Return the most recently generated press-release drafts.

    ``limit`` is clamped to the range 1..100 before it reaches the database.
    """
    from .presse_generator import list_drafts

    # list_drafts binds the value straight into ``LIMIT ?``; SQLite treats a
    # negative LIMIT as "no upper bound", so ``?limit=-1`` would otherwise
    # return every stored draft including its full body.
    bounded_limit = max(1, min(limit, 100))
    return {"drafts": list_drafts(limit=bounded_limit)}
|
||||
|
||||
|
||||
@app.get("/api/aktuelle-themen/drafts/{draft_id}")
async def api_draft_detail(draft_id: int):
    """Return a single press-release draft by its numeric id.

    Raises:
        HTTPException 404: no draft is returned for ``draft_id``.
    """
    from .presse_generator import get_draft

    draft = get_draft(draft_id)
    if draft:
        return draft
    raise HTTPException(status_code=404, detail="Draft nicht gefunden")
|
||||
|
||||
|
||||
@app.get("/api/auswertungen/matrix")
|
||||
async def auswertungen_matrix(
|
||||
wahlperiode: Optional[str] = None,
|
||||
|
||||
347
app/news_aggregator.py
Normal file
347
app/news_aggregator.py
Normal file
@ -0,0 +1,347 @@
|
||||
"""News-Aggregator fuer das Aktuelle-Themen-Dashboard (#170 Phase 1).
|
||||
|
||||
Fetcht regelmaessig News-Headlines aus AI-erlaubenden, oeffentlich-rechtlichen
|
||||
oder parlamentarischen Quellen:
|
||||
|
||||
- **Tagesschau-API** (https://www.tagesschau.de/api2u/news/) — strukturiertes
|
||||
JSON mit ressort, tags, firstSentence pro Artikel.
|
||||
- **Bundestag-Aktuellethemen-RSS**
|
||||
(https://www.bundestag.de/static/appdata/includes/rss/aktuellethemen.rss)
|
||||
— RSS mit Titel + Beschreibung pro Artikel.
|
||||
|
||||
**Bewusst NICHT verwendet:** RND.de (robots.txt bannt explizit ClaudeBot,
|
||||
GPTBot, ChatGPT-User, CCBot, Google-Extended). RSS-Feeds privat-publizierter
|
||||
Verlage werden nur dann angebunden, wenn AI-Verarbeitung explizit erlaubt ist.
|
||||
|
||||
**Compliance:**
|
||||
- Volltexte werden NICHT persistiert. Nur Titel + erster Satz / Description.
|
||||
- Kein User-Agent, der einen AI-Bot vortaeuscht (kein "ClaudeBot").
|
||||
- Rate-Limiting: 1 Request pro Feed bzw. Tagesschau-Ressort pro Aufruf (kein Retry-Loop, kein Hammer).
|
||||
|
||||
Datenbank-Tabelle ``news_articles`` (siehe app/database.py):
|
||||
url PK, titel, summary, datum (ISO), source, ressort, tags JSON,
|
||||
summary_embedding BLOB, embedding_model, fetched_at.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
USER_AGENT = "GWOeAntragspruefer/1.0 (+https://gwoe.toppyr.de)"
|
||||
TIMEOUT = 20
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Quellen
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
TAGESSCHAU_API = "https://www.tagesschau.de/api2u/news"
|
||||
|
||||
# Politische Tagesschau-Ressorts — Sport/Panorama rausgefiltert,
|
||||
# weil sie selten zu parlamentarischen Antraegen passen.
|
||||
TAGESSCHAU_RESSORTS = ["inland", "ausland", "wirtschaft", "wissen"]
|
||||
|
||||
BUNDESTAG_RSS = {
|
||||
"bundestag-aktuell": (
|
||||
"https://www.bundestag.de/static/appdata/includes/rss/aktuellethemen.rss"
|
||||
),
|
||||
"bundestag-presse": (
|
||||
"https://www.bundestag.de/static/appdata/includes/rss/pressemitteilungen.rss"
|
||||
),
|
||||
"bundestag-hib": (
|
||||
"https://www.bundestag.de/static/appdata/includes/rss/hib.rss"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# HTTP-Helper
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _http_get(url: str) -> Optional[bytes]:
    """Fetch ``url`` with the module's honest User-Agent and timeout.

    Args:
        url: Absolute URL to request.

    Returns:
        The raw response body, or ``None`` when the request or the read of
        the body fails. Failures are logged as warnings, never raised.
    """
    import http.client

    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            return r.read()
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError, timeouts and connection resets;
        # http.client.HTTPException covers truncated or malformed responses
        # (IncompleteRead, RemoteDisconnected) raised while reading the body.
        logger.warning("news fetch failed: %s — %s", url, e)
        return None
|
||||
|
||||
|
||||
def _strip_html(text: str) -> str:
|
||||
"""Entfernt HTML-Tags + CDATA fuer Plaintext-Summaries."""
|
||||
if not text:
|
||||
return ""
|
||||
text = re.sub(r"<!\[CDATA\[(.*?)\]\]>", r"\1", text, flags=re.DOTALL)
|
||||
text = re.sub(r"<[^>]+>", " ", text)
|
||||
text = text.replace("&", "&").replace(" ", " ").replace(""", '"')
|
||||
return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Parser
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def fetch_tagesschau(ressorts: Optional[list[str]] = None) -> list[dict]:
    """Fetch news headlines from the Tagesschau API, one request per ressort.

    Full article texts (``content``) are deliberately not copied; only the
    API's ``firstSentence`` is used as the summary.

    Args:
        ressorts: Ressort names to query; defaults to ``TAGESSCHAU_RESSORTS``.

    Returns:
        A list of dicts with the keys ``url``, ``titel``, ``summary``,
        ``datum``, ``source``, ``ressort`` and ``tags``. Each URL appears at
        most once, even when several ressorts list the same article. A ressort
        whose response is missing or malformed is logged and skipped.
    """
    ressorts = ressorts or TAGESSCHAU_RESSORTS
    out: list[dict] = []
    seen: set[str] = set()
    for ressort in ressorts:
        url = f"{TAGESSCHAU_API}?ressort={ressort}"
        raw = _http_get(url)
        if not raw:
            continue
        try:
            data = json.loads(raw.decode("utf-8"))
        except ValueError:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning("tagesschau JSON parse failed: %s", url)
            continue
        if not isinstance(data, dict):
            # Valid JSON of an unexpected shape (list, string, null).
            logger.warning("tagesschau JSON parse failed: %s", url)
            continue
        for item in data.get("news") or []:
            if not isinstance(item, dict):
                continue
            link = item.get("shareURL") or item.get("detailsweb")
            if not link or link in seen:
                continue
            seen.add(link)
            titel = (item.get("title") or "").strip()
            if not titel:
                continue
            summary = (item.get("firstSentence") or "").strip()
            datum = item.get("date") or ""
            tags = [
                t["tag"]
                for t in (item.get("tags") or [])
                if isinstance(t, dict) and t.get("tag")
            ]
            out.append({
                "url": link,
                "titel": titel,
                "summary": summary,
                "datum": datum,
                "source": "tagesschau",
                "ressort": item.get("ressort") or ressort,
                "tags": tags,
            })
    return out
|
||||
|
||||
|
||||
_RSS_ITEM_RE = re.compile(r"<item>(.*?)</item>", re.DOTALL)
|
||||
_RSS_TITLE_RE = re.compile(r"<title>(.*?)</title>", re.DOTALL)
|
||||
_RSS_LINK_RE = re.compile(r"<link>(.*?)</link>")
|
||||
_RSS_DESC_RE = re.compile(r"<description>(.*?)</description>", re.DOTALL)
|
||||
_RSS_PUB_RE = re.compile(r"<pubDate>(.*?)</pubDate>")
|
||||
|
||||
|
||||
def _parse_rss_date(s: str) -> str:
|
||||
"""Konvertiere RSS-pubDate (RFC 822) → ISO-8601-Datum."""
|
||||
if not s:
|
||||
return ""
|
||||
try:
|
||||
dt = parsedate_to_datetime(s.strip())
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt.astimezone(timezone.utc).isoformat()
|
||||
except (TypeError, ValueError):
|
||||
return ""
|
||||
|
||||
|
||||
def fetch_rss(source: str, url: str, max_items: int = 50) -> list[dict]:
    """Fetch an RSS 2.0 feed and return its items as article dicts.

    Args:
        source: Label stored in each article's ``source`` field.
        url: Feed URL.
        max_items: Only the first ``max_items`` ``<item>`` elements are read.

    Returns:
        Dicts shaped like those of ``fetch_tagesschau`` (``ressort`` is
        ``None`` and ``tags`` is empty). Items without a title or link are
        dropped; a failed download yields an empty list.
    """
    payload = _http_get(url)
    if not payload:
        return []
    feed_text = payload.decode("utf-8", errors="replace")

    def _first_group(pattern, fragment: str) -> str:
        # Missing elements are treated as empty text.
        found = pattern.search(fragment)
        return found.group(1) if found else ""

    articles: list[dict] = []
    for fragment in _RSS_ITEM_RE.findall(feed_text)[:max_items]:
        titel = _strip_html(_first_group(_RSS_TITLE_RE, fragment))
        link = _strip_html(_first_group(_RSS_LINK_RE, fragment))
        if not (titel and link):
            continue
        articles.append({
            "url": link,
            "titel": titel,
            "summary": _strip_html(_first_group(_RSS_DESC_RE, fragment)),
            "datum": _parse_rss_date(_first_group(_RSS_PUB_RE, fragment)),
            "source": source,
            "ressort": None,
            "tags": [],
        })
    return articles
|
||||
|
||||
|
||||
def fetch_all() -> list[dict]:
    """Collect articles from every configured source.

    Tagesschau results come first, followed by each feed in
    ``BUNDESTAG_RSS`` in dict order. No caching and no automatic retry.
    """
    collected: list[dict] = list(fetch_tagesschau())
    for feed_name, feed_url in BUNDESTAG_RSS.items():
        collected += fetch_rss(feed_name, feed_url)
    return collected
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# DB-Persistierung
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def upsert_articles(
    articles: list[dict],
    db_path: Optional[Path] = None,
    embed: bool = True,
) -> dict:
    """Insert new news articles and refresh the ones already stored.

    The URL is the primary key, so repeated runs are idempotent. Existing
    rows get fresh metadata and a new ``fetched_at``; their embedding columns
    are not touched, so nothing is re-embedded on every cron run.

    Args:
        articles: Article dicts with at least ``url``, ``titel`` and
            ``source``; ``summary``, ``datum``, ``ressort`` and ``tags`` are
            optional.
        db_path: Database file; defaults to ``settings.db_path``.
        embed: When true, ``embed_pending_articles`` runs after the write.

    Returns:
        ``{"inserted": int, "updated": int, "embedded": int}``. All counts
        are zero when the database file does not exist.
    """
    import sqlite3
    from .config import settings

    path = db_path or settings.db_path
    counts = {"inserted": 0, "updated": 0, "embedded": 0}
    if not Path(path).exists():
        return counts

    conn = sqlite3.connect(str(path))
    try:
        for art in articles:
            url = art["url"]
            already_stored = conn.execute(
                "SELECT summary_embedding IS NOT NULL FROM news_articles WHERE url=?",
                (url,),
            ).fetchone() is not None
            tags_json = json.dumps(art.get("tags") or [])
            # Column values shared by the INSERT and the UPDATE, in column order.
            fields = (
                art["titel"],
                art.get("summary") or "",
                art.get("datum") or "",
                art["source"],
                art.get("ressort"),
                tags_json,
            )
            if already_stored:
                conn.execute(
                    """UPDATE news_articles
                       SET titel=?, summary=?, datum=?, source=?, ressort=?, tags=?,
                           fetched_at=datetime('now')
                       WHERE url=?""",
                    fields + (url,),
                )
                counts["updated"] += 1
            else:
                conn.execute(
                    """INSERT INTO news_articles
                       (url, titel, summary, datum, source, ressort, tags, fetched_at)
                       VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))""",
                    (url,) + fields,
                )
                counts["inserted"] += 1
        conn.commit()
    finally:
        conn.close()

    if embed:
        counts["embedded"] = embed_pending_articles(db_path=db_path)

    return counts
|
||||
|
||||
|
||||
def embed_pending_articles(
    db_path: Optional[Path] = None,
    limit: int = 100,
) -> int:
    """Create embeddings for news articles that have no ``summary_embedding``.

    The embedded text is title, summary and tags joined by newlines. When the
    embedding call fails for an article, that article is skipped and stays
    pending, so a later run picks it up again.

    Args:
        db_path: Database file; defaults to ``settings.db_path``.
        limit: Maximum number of pending articles handled per call, newest
            ``datum`` first.

    Returns:
        Number of articles that received an embedding (0 when the database
        file does not exist).
    """
    import sqlite3
    from .config import settings
    from . import embeddings as emb

    path = db_path or settings.db_path
    if not Path(path).exists():
        return 0

    embedded = 0
    conn = sqlite3.connect(str(path))
    try:
        rows = conn.execute(
            """SELECT url, titel, summary, tags FROM news_articles
               WHERE summary_embedding IS NULL ORDER BY datum DESC LIMIT ?""",
            (limit,),
        ).fetchall()

        for url, titel, summary, tags_raw in rows:
            try:
                tags = json.loads(tags_raw) if tags_raw else []
            except (json.JSONDecodeError, TypeError):
                tags = []
            parts = [titel or ""]
            if summary:
                parts.append(summary)
            if isinstance(tags, list) and tags:
                # str() guards against non-string tag entries in stored JSON.
                parts.append(", ".join(str(t) for t in tags))
            text = "\n".join(p for p in parts if p).strip()
            if not text:
                continue
            try:
                vec = emb.create_embedding(text, model=emb.EMBEDDING_MODEL)
            except Exception:
                logger.exception("embed_pending_articles: API error for %s", url)
                continue
            conn.execute(
                """UPDATE news_articles
                   SET summary_embedding=?, embedding_model=?
                   WHERE url=?""",
                (json.dumps(vec).encode(), emb.EMBEDDING_MODEL, url),
            )
            # Commit per article: the write transaction must not stay open
            # across the next (slow) embedding call, where it would block
            # other SQLite writers, and an unexpected error later in the loop
            # must not discard embeddings that were already computed.
            conn.commit()
            embedded += 1
    finally:
        conn.close()
    return embedded
|
||||
|
||||
|
||||
def run_aggregator(db_path: Optional[Path] = None, embed: bool = True) -> dict:
    """Top-level entry point: fetch all sources, persist them, then embed.

    Intended for cron use. Returns the count dict produced by
    ``upsert_articles``.
    """
    fetched = fetch_all()
    stats = upsert_articles(fetched, db_path=db_path, embed=embed)
    return stats
|
||||
256
app/presse_generator.py
Normal file
256
app/presse_generator.py
Normal file
@ -0,0 +1,256 @@
|
||||
"""Pressemitteilungs-Generator fuer #170 Phase 4.
|
||||
|
||||
Erzeugt einen LLM-generierten Pressemitteilungs-Vorschlag, der einen
|
||||
GWÖ-bewerteten Antrag in den Kontext eines aktuellen News-Artikels stellt.
|
||||
|
||||
Manueller Trigger via UI-Button — kein Auto-Versand. Drafts werden in
|
||||
``presse_drafts`` persistiert und in der UI als Liste sichtbar.
|
||||
|
||||
Tonalitaet:
|
||||
- GWÖ-Sicht (Gemeinwohl-orientiert, nicht parteipolitisch)
|
||||
- Faktenbasiert, keine Lobbying-Sprache
|
||||
- 200-250 Worte, presseaehnlicher Aufbau (Lead-Paragraph + Begruendung)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
SYSTEM_PROMPT = """Du bist ein politischer Redakteur, der für eine
|
||||
Gemeinwohl-Ökonomie-Initiative Pressemitteilungen schreibt. Deine Stil-
|
||||
Richtlinien:
|
||||
|
||||
- 200-250 Worte
|
||||
- Sachlicher, präziser Stil — keine Werbesprache, keine Polemik
|
||||
- Faktenbasiert: Daten aus dem Antrag und dem News-Kontext explizit nennen
|
||||
- GWÖ-Werte (Würde, Solidarität, Nachhaltigkeit, Gerechtigkeit, Demokratie)
|
||||
als Bewertungsmaßstab — nicht parteipolitische Linie
|
||||
- Klare Struktur: Titel, Lead-Paragraph (Wer? Was? Wann? Warum jetzt?),
|
||||
Begründung mit Bezug auf GWÖ-Bewertung, Schluss mit Forderung oder
|
||||
Einladung zum Dialog
|
||||
- Niemals den Anbieter der News-Quelle (Tagesschau, Bundestag) zitieren —
|
||||
nur den Sachverhalt aufgreifen, der dort beschrieben ist
|
||||
|
||||
Antworte NUR mit gültigem JSON in dieser Struktur:
|
||||
{
|
||||
"titel": "<knackiger Titel, max 100 Zeichen>",
|
||||
"body": "<Pressemitteilungs-Volltext, 200-250 Wörter>"
|
||||
}"""
|
||||
|
||||
|
||||
def _build_user_prompt(
|
||||
drucksache: str,
|
||||
bundesland: str,
|
||||
antrag_titel: str,
|
||||
antrag_zusammenfassung: str,
|
||||
gwoe_score: float,
|
||||
gwoe_begruendung: str,
|
||||
empfehlung: str,
|
||||
news_titel: str,
|
||||
news_summary: str,
|
||||
news_url: str,
|
||||
) -> str:
|
||||
"""Konstruiert den User-Prompt aus Antrags- und News-Daten."""
|
||||
return f"""## Aktueller Antrag
|
||||
|
||||
Drucksache: {drucksache} ({bundesland})
|
||||
Titel: {antrag_titel}
|
||||
|
||||
Zusammenfassung: {antrag_zusammenfassung or "(keine vorhanden)"}
|
||||
|
||||
GWÖ-Score: {gwoe_score}/10
|
||||
GWÖ-Begründung: {gwoe_begruendung or "(keine vorhanden)"}
|
||||
Empfehlung: {empfehlung or "(keine)"}
|
||||
|
||||
## Aktueller Nachrichten-Kontext
|
||||
|
||||
Schlagzeile: {news_titel}
|
||||
|
||||
Inhalt: {news_summary or "(keine Zusammenfassung verfügbar)"}
|
||||
|
||||
Quelle: {news_url}
|
||||
|
||||
## Deine Aufgabe
|
||||
|
||||
Schreibe eine Pressemitteilung, die diesen Antrag in den Kontext der
|
||||
aktuellen Nachrichtenlage stellt. Begründe aus GWÖ-Sicht, warum der
|
||||
Antrag gerade jetzt relevant ist (oder warum er die aktuelle Debatte
|
||||
ergänzt/korrigiert). Wenn der GWÖ-Score niedrig ist (< 5), sei dabei
|
||||
kritisch — die PM kann auch eine Ablehnung des Antrags begründen.
|
||||
"""
|
||||
|
||||
|
||||
async def generate_draft(
    drucksache: str,
    news_url: str,
    db_path: Optional[Path] = None,
    bewerter=None,
) -> dict:
    """Generate a press-release draft via the LLM and persist it.

    Args:
        drucksache: Motion id, looked up in ``assessments``.
        news_url: URL of the news article, looked up in ``news_articles``.
        db_path: Optional database override (used by tests).
        bewerter: Optional injected evaluator (used by tests). When ``None``
            a default ``QwenBewerter`` is instantiated.

    Returns:
        ``{"id": int, "drucksache": ..., "bundesland": ...,
        "news_url": ..., "news_titel": ...,
        "titel": str, "body": str, "model": str, "created_at": ...}``
        read back from the freshly inserted ``presse_drafts`` row.

    Raises:
        ValueError: ``drucksache`` or ``news_url`` is not found (including
            the case that the database file does not exist), or the LLM
            response lacks a usable ``titel``/``body``.
    """
    from .config import settings
    from .adapters.qwen_bewerter import LlmRequest

    path = db_path or settings.db_path
    if not Path(path).exists():
        # sqlite3.connect() would silently create an empty database file
        # here; treat a missing database as "nothing found", like
        # list_drafts/get_draft do.
        raise ValueError(f"Drucksache {drucksache} nicht in assessments")

    conn = sqlite3.connect(str(path))
    try:
        # NOTE(review): the lookup uses drucksache alone; if the same number
        # exists for several Bundesländer, fetchone() returns an arbitrary
        # one — confirm uniqueness of assessments.drucksache.
        antrag = conn.execute(
            """SELECT bundesland, title, antrag_zusammenfassung, gwoe_score,
                      gwoe_begruendung, empfehlung
               FROM assessments WHERE drucksache=?""",
            (drucksache,),
        ).fetchone()
        news = conn.execute(
            "SELECT titel, summary FROM news_articles WHERE url=?",
            (news_url,),
        ).fetchone()
    finally:
        conn.close()

    if not antrag:
        raise ValueError(f"Drucksache {drucksache} nicht in assessments")
    if not news:
        raise ValueError(f"News-URL {news_url} nicht in news_articles")

    user_prompt = _build_user_prompt(
        drucksache=drucksache,
        bundesland=antrag[0],
        antrag_titel=antrag[1] or "",
        antrag_zusammenfassung=antrag[2] or "",
        gwoe_score=antrag[3] or 0.0,
        gwoe_begruendung=antrag[4] or "",
        empfehlung=antrag[5] or "",
        news_titel=news[0],
        news_summary=news[1] or "",
        news_url=news_url,
    )

    if bewerter is None:
        from .adapters.qwen_bewerter import QwenBewerter
        bewerter = QwenBewerter()

    req = LlmRequest(
        system_prompt=SYSTEM_PROMPT,
        user_prompt=user_prompt,
        model=settings.llm_model_default,
        base_temperature=0.3,
        max_tokens=1500,
        max_retries=2,
    )
    result = await bewerter.bewerte(req)

    # The LLM output is untrusted: a non-dict result or non-string fields
    # must surface as the documented ValueError, not as an AttributeError.
    if not isinstance(result, dict):
        raise ValueError("LLM-Response unvollständig (titel oder body leer)")
    titel = str(result.get("titel") or "").strip()[:200]
    body = str(result.get("body") or "").strip()
    if not titel or not body:
        raise ValueError("LLM-Response unvollständig (titel oder body leer)")

    # Persist the draft and read it back so the caller gets the generated
    # id and the database-side created_at default.
    conn = sqlite3.connect(str(path))
    try:
        cur = conn.execute(
            """INSERT INTO presse_drafts
               (drucksache, bundesland, news_url, news_titel, titel, body, model)
               VALUES (?, ?, ?, ?, ?, ?, ?)""",
            (drucksache, antrag[0], news_url, news[0], titel, body,
             settings.llm_model_default),
        )
        draft_id = cur.lastrowid
        row = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts WHERE id=?""",
            (draft_id,),
        ).fetchone()
        conn.commit()
    finally:
        conn.close()

    return {
        "id": row[0], "drucksache": row[1], "bundesland": row[2],
        "news_url": row[3], "news_titel": row[4],
        "titel": row[5], "body": row[6], "model": row[7],
        "created_at": row[8],
    }
|
||||
|
||||
|
||||
def list_drafts(
    limit: int = 20,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the most recently generated drafts, newest id first.

    Args:
        limit: Row limit passed to the query (default 20).
        db_path: Database file; defaults to ``settings.db_path``.

    Returns:
        One dict per draft with the keys ``id``, ``drucksache``,
        ``bundesland``, ``news_url``, ``news_titel``, ``titel``, ``body``,
        ``model`` and ``created_at``; an empty list when the database file
        does not exist.
    """
    from .config import settings

    path = db_path or settings.db_path
    if not Path(path).exists():
        return []

    # Result keys, in the same order as the SELECT column list below.
    columns = (
        "id", "drucksache", "bundesland", "news_url", "news_titel",
        "titel", "body", "model", "created_at",
    )
    conn = sqlite3.connect(str(path))
    try:
        rows = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts
               ORDER BY id DESC LIMIT ?""",
            (limit,),
        ).fetchall()
    finally:
        conn.close()
    return [dict(zip(columns, record)) for record in rows]
|
||||
|
||||
|
||||
def get_draft(
    draft_id: int,
    db_path: Optional[Path] = None,
) -> Optional[dict]:
    """Fetch a single draft by its id.

    Args:
        draft_id: Primary key in ``presse_drafts``.
        db_path: Database file; defaults to ``settings.db_path``.

    Returns:
        The draft as a dict (same keys as the entries of ``list_drafts``),
        or ``None`` when the database file or the row does not exist.
    """
    from .config import settings

    path = db_path or settings.db_path
    if not Path(path).exists():
        return None

    # Result keys, in the same order as the SELECT column list below.
    columns = (
        "id", "drucksache", "bundesland", "news_url", "news_titel",
        "titel", "body", "model", "created_at",
    )
    conn = sqlite3.connect(str(path))
    try:
        record = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts WHERE id=?""",
            (draft_id,),
        ).fetchone()
    finally:
        conn.close()
    return dict(zip(columns, record)) if record else None
|
||||
@ -56,6 +56,7 @@
|
||||
<div class="v2-nav-group">
|
||||
<div class="v2-nav-label">— Daten</div>
|
||||
<a href="/auswertungen" class="v2-nav-item {% if v2_active_nav == 'auswertungen' %}active{% endif %}">{{ icon("chart-bar", 14) }} Auswertungen</a>
|
||||
<a href="/aktuelle-themen" class="v2-nav-item {% if v2_active_nav == 'aktuelle-themen' %}active{% endif %}">{{ icon("book-open", 14) }} Aktuelle Themen</a>
|
||||
<a href="/api/auswertungen/export.csv" class="v2-nav-item">{{ icon("file-csv", 14) }} Export · API</a>
|
||||
<a href="/v2/feed" class="v2-nav-item {% if v2_active_nav == 'feed' %}active{% endif %}">{{ icon("rss", 14) }} Atom-Feed</a>
|
||||
<a href="/v2/abos" class="v2-nav-item {% if v2_active_nav == 'abos' %}active{% endif %}">{{ icon("envelope-simple", 14) }} Meine Abos</a>
|
||||
|
||||
417
app/templates/v2/screens/aktuelle-themen.html
Normal file
417
app/templates/v2/screens/aktuelle-themen.html
Normal file
@ -0,0 +1,417 @@
|
||||
{% extends "v2/base.html" %}
|
||||
|
||||
{% block title %}Aktuelle Themen — GWÖ-Antragsprüfer{% endblock %}
|
||||
|
||||
{% set v2_active_nav = "aktuelle-themen" %}
|
||||
|
||||
{% block head_extra %}
|
||||
<script src="/static/chart.umd.min.js"></script>
|
||||
<style>
|
||||
.at-controls {
|
||||
display: flex;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
flex-wrap: wrap;
|
||||
margin-bottom: 1rem;
|
||||
font-family: var(--font-mono);
|
||||
font-size: 11px;
|
||||
}
|
||||
.at-controls select, .at-controls input[type="number"] {
|
||||
font-family: var(--font-mono);
|
||||
font-size: 11px;
|
||||
padding: 5px 8px;
|
||||
border: 1px solid var(--ecg-border);
|
||||
border-radius: 3px;
|
||||
background: var(--ecg-card-bg);
|
||||
color: var(--ecg-dark);
|
||||
}
|
||||
.at-controls button {
|
||||
font-family: var(--font-mono);
|
||||
font-size: 11px;
|
||||
padding: 5px 12px;
|
||||
border: 1px solid var(--ecg-border);
|
||||
border-radius: 3px;
|
||||
cursor: pointer;
|
||||
background: var(--ecg-teal);
|
||||
color: #fff;
|
||||
}
|
||||
.at-news-card {
|
||||
background: var(--ecg-card-bg);
|
||||
border: 1px solid var(--ecg-border);
|
||||
border-radius: 6px;
|
||||
padding: 14px 16px;
|
||||
margin-bottom: 14px;
|
||||
}
|
||||
.at-news-head {
|
||||
font-family: var(--font-mono);
|
||||
font-size: 10px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.05em;
|
||||
opacity: 0.6;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
.at-news-title {
|
||||
font-family: var(--font-display);
|
||||
font-size: 15px;
|
||||
color: var(--ecg-teal);
|
||||
margin: 0 0 6px;
|
||||
line-height: 1.3;
|
||||
}
|
||||
.at-news-title a { color: inherit; text-decoration: none; }
|
||||
.at-news-title a:hover { text-decoration: underline; }
|
||||
.at-news-summary {
|
||||
font-size: 12px;
|
||||
line-height: 1.5;
|
||||
margin: 0 0 10px;
|
||||
opacity: 0.85;
|
||||
}
|
||||
.at-news-tags {
|
||||
font-family: var(--font-mono);
|
||||
font-size: 10px;
|
||||
opacity: 0.55;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
.at-tag {
|
||||
display: inline-block;
|
||||
padding: 1px 6px;
|
||||
background: var(--ecg-bg-subtle);
|
||||
border-radius: 3px;
|
||||
margin-right: 4px;
|
||||
}
|
||||
.at-matches {
|
||||
border-top: 1px solid var(--ecg-border);
|
||||
margin-top: 10px;
|
||||
padding-top: 10px;
|
||||
}
|
||||
.at-matches-label {
|
||||
font-family: var(--font-mono);
|
||||
font-size: 10px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.05em;
|
||||
opacity: 0.6;
|
||||
margin-bottom: 6px;
|
||||
}
|
||||
.at-match {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
padding: 5px 0;
|
||||
font-size: 12px;
|
||||
border-bottom: 1px dotted var(--ecg-border);
|
||||
}
|
||||
.at-match:last-child { border-bottom: none; }
|
||||
.at-score-pill {
|
||||
display: inline-block;
|
||||
padding: 1px 7px;
|
||||
border-radius: 10px;
|
||||
font-family: var(--font-mono);
|
||||
font-size: 10px;
|
||||
font-weight: 700;
|
||||
background: var(--ecg-bg-subtle);
|
||||
min-width: 28px;
|
||||
text-align: center;
|
||||
}
|
||||
.at-score-pill.s-high { background: rgba(136,158,51,0.25); color: #44570a; }
|
||||
.at-score-pill.s-mid { background: rgba(247,148,29,0.18); color: #875e10; }
|
||||
.at-score-pill.s-low { background: rgba(200,0,0,0.15); color: #931515; }
|
||||
.at-sim {
|
||||
font-family: var(--font-mono);
|
||||
font-size: 10px;
|
||||
opacity: 0.5;
|
||||
}
|
||||
.at-presse-btn {
|
||||
background: var(--ecg-card-bg);
|
||||
color: var(--ecg-teal);
|
||||
border: 1px solid var(--ecg-teal);
|
||||
border-radius: 3px;
|
||||
font-family: var(--font-mono);
|
||||
font-size: 10px;
|
||||
padding: 3px 8px;
|
||||
cursor: pointer;
|
||||
margin-left: auto;
|
||||
}
|
||||
.at-presse-btn:hover { background: var(--ecg-teal); color: #fff; }
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block main %}
|
||||
<div style="padding:0 0 1.5rem;">
|
||||
<h1 style="font-family:var(--font-display);font-size:22px;color:var(--ecg-teal);margin:0 0 4px;">Aktuelle Themen</h1>
|
||||
<p style="font-size:12px;font-family:var(--font-mono);color:var(--ecg-dark);opacity:0.6;">
|
||||
Tagesschau + Bundestag-RSS · gematcht mit deinen Anträgen ·
|
||||
Pressemitteilungs-Vorschläge
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="v2-kasten outline-blue" style="margin-bottom:1rem;">
|
||||
<p style="font-size:12px;line-height:1.5;margin:0 0 0.5rem;">
|
||||
Die täglich aktuellen politischen Top-Themen aus
|
||||
<strong>öffentlich-rechtlichen + parlamentarischen Quellen</strong>
|
||||
(Tagesschau-API + Bundestag-RSS) werden semantisch mit den von dir
|
||||
bewerteten Anträgen verschnitten. Pro News-Artikel siehst du die
|
||||
GWÖ-Bewertung der dazu passendsten Anträge — und kannst per Klick
|
||||
eine Pressemitteilung generieren lassen.
|
||||
</p>
|
||||
<p style="font-size:11px;line-height:1.5;opacity:0.75;margin:0;">
|
||||
Bewusst <strong>nicht</strong> verwendet: Quellen mit AI-Bann in
|
||||
robots.txt (z.B. RND.de). Die UI zeigt nur Titel + URL + erste Sätze
|
||||
— Volltexte werden nicht persistiert.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="at-controls">
|
||||
<label for="at-days">Zeitfenster:</label>
|
||||
<select id="at-days" onchange="loadThemen()">
|
||||
<option value="3">3 Tage</option>
|
||||
<option value="7" selected>7 Tage</option>
|
||||
<option value="14">14 Tage</option>
|
||||
<option value="30">30 Tage</option>
|
||||
</select>
|
||||
<label for="at-topk">Top-N News:</label>
|
||||
<input type="number" id="at-topk" value="15" min="3" max="50" style="width:60px;" onchange="loadThemen()" />
|
||||
<label for="at-minsim">Min. Similarity:</label>
|
||||
<select id="at-minsim" onchange="loadThemen()">
|
||||
<option value="0.30">0.30 (locker)</option>
|
||||
<option value="0.40" selected>0.40 (default)</option>
|
||||
<option value="0.50">0.50 (streng)</option>
|
||||
</select>
|
||||
<button onclick="loadThemen()">Aktualisieren</button>
|
||||
</div>
|
||||
|
||||
<!-- News-Volumen-Chart -->
|
||||
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:1.5rem 0 0.5rem;">
|
||||
News-Volumen pro Quelle (letzte 30 Tage)
|
||||
</h3>
|
||||
<div class="matrix-wrap" style="background:var(--ecg-card-bg);border:1px solid var(--ecg-border);border-radius:4px;padding:14px;">
|
||||
<canvas id="at-zeitreihe-chart" style="max-height:280px;"></canvas>
|
||||
</div>
|
||||
<div id="at-zeitreihe-meta" class="meta-line" style="font-family:var(--font-mono);font-size:11px;opacity:0.6;margin:8px 0 1.5rem;"></div>
|
||||
|
||||
<!-- Top-Themen + Matches -->
|
||||
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:1.5rem 0 0.5rem;">
|
||||
Top-Themen × passende Anträge
|
||||
</h3>
|
||||
<div id="at-themen-list">
|
||||
<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Lade …</div>
|
||||
</div>
|
||||
|
||||
<!-- Drafts-Liste -->
|
||||
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:2rem 0 0.5rem;">
|
||||
Pressemitteilungs-Entwürfe (zuletzt generiert)
|
||||
</h3>
|
||||
<div id="at-drafts-list">
|
||||
<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Lade Entwürfe …</div>
|
||||
</div>
|
||||
|
||||
<!-- Modal für Draft-Anzeige -->
|
||||
<div class="v2-modal-backdrop" id="at-modal-backdrop" onclick="atCloseModal(event)" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,0.45);z-index:500;align-items:center;justify-content:center;">
|
||||
<div class="v2-modal" onclick="event.stopPropagation()" style="background:var(--ecg-card-bg);border-radius:6px;padding:20px 24px;max-width:680px;width:90%;max-height:80vh;overflow-y:auto;position:relative;">
|
||||
<button class="v2-modal-close" onclick="atCloseModal()" style="position:absolute;top:12px;right:14px;background:none;border:none;font-size:18px;cursor:pointer;opacity:0.5;">×</button>
|
||||
<h2 id="at-modal-title" style="font-family:var(--font-display);font-size:16px;color:var(--ecg-teal);margin:0 0 12px;">Pressemitteilung</h2>
|
||||
<div id="at-modal-body" style="font-size:13px;line-height:1.5;">Generiere …</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% endblock %}
|
||||
|
||||
{% block body_scripts %}
|
||||
<script>
|
||||
let _atZeitreiheChart = null;
|
||||
|
||||
/**
 * Pick the CSS modifier class for a score pill.
 *
 * @param {?number} score Score value; null/undefined yields no modifier.
 * @returns {string} 's-high' for >= 7, 's-mid' for >= 4, otherwise 's-low';
 *   '' when no score is given.
 */
function atScoreClass(score) {
  if (score == null) {
    return '';
  }
  return score >= 7 ? 's-high' : (score >= 4 ? 's-mid' : 's-low');
}
|
||||
|
||||
/**
 * Reduce a date string to its first ten characters.
 *
 * @param {?string} s Date string; anything falsy or shorter than ten
 *   characters yields ''.
 * @returns {string} The leading ten characters of `s`, or ''.
 */
function atFmtDatum(s) {
  const unusable = !s || s.length < 10;
  return unusable ? '' : s.slice(0, 10);
}
|
||||
|
||||
/**
 * Load the top news items with their matching Anträge and render the cards.
 *
 * Reads the filter controls (#at-days, #at-topk, #at-minsim), requests
 * /api/aktuelle-themen/top and replaces the content of #at-themen-list.
 *
 * Every value taken from the response is HTML-escaped before it is placed
 * into the markup, because the list is rendered through innerHTML and the
 * titles, summaries and tags are text this page does not control.
 */
async function loadThemen() {
  // Numeric character references are used so this source contains no literal
  // entities that an HTML-processing step could decode back into raw characters.
  const esc = (value) => String(value == null ? '' : value)
    .replace(/[&<>"']/g, (ch) => '&#' + ch.charCodeAt(0) + ';');
  // Only http(s) targets become links; other schemes (e.g. javascript:) are neutralised.
  const safeHref = (url) => (/^https?:\/\//i.test(String(url || '')) ? esc(url) : '#');
  const note = (text) =>
    `<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">${text}</div>`;

  const days = document.getElementById('at-days').value;
  const topk = document.getElementById('at-topk').value;
  const minsim = document.getElementById('at-minsim').value;
  const list = document.getElementById('at-themen-list');
  list.innerHTML = note('Lade …');

  try {
    const query = new URLSearchParams({
      days: days,
      top_k: topk,
      min_similarity: minsim,
      matches_per_news: '3',
    });
    const r = await fetch(`/api/aktuelle-themen/top?${query}`);
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const data = await r.json();

    if (!data.buckets || !data.buckets.length) {
      list.innerHTML = note('Keine News im Zeitfenster oder noch nicht embedded.');
      return;
    }

    let html = '';
    for (const b of data.buckets) {
      const n = b.news;
      const tags = (n.tags || []).map((t) => `<span class="at-tag">${esc(t)}</span>`).join('');
      html += '<div class="at-news-card">';
      html += `<div class="at-news-head">${esc(atFmtDatum(n.datum))} · ${esc(n.source)}${n.ressort ? ' / ' + esc(n.ressort) : ''}</div>`;
      html += `<h4 class="at-news-title"><a href="${safeHref(n.url)}" target="_blank" rel="noopener">${esc(n.titel)}</a></h4>`;
      if (n.summary) html += `<div class="at-news-summary">${esc(n.summary)}</div>`;
      if (tags) html += `<div class="at-news-tags">${tags}</div>`;

      if (b.matches && b.matches.length) {
        html += '<div class="at-matches">';
        html += '<div class="at-matches-label">Passende Anträge:</div>';
        for (const m of b.matches) {
          const sc = m.gwoe_score != null ? Number(m.gwoe_score).toFixed(1) : '—';
          const fr = (m.fraktionen || []).join(', ');
          html += '<div class="at-match">';
          html += `<span class="at-score-pill ${atScoreClass(m.gwoe_score)}">${esc(sc)}</span>`;
          html += `<a href="/antrag/${encodeURIComponent(m.drucksache)}" style="color:var(--ecg-teal);text-decoration:none;font-weight:500;">${esc(m.drucksache)}</a>`;
          html += `<span style="opacity:0.85;">${esc(m.title || '')}</span>`;
          if (fr) html += `<span style="opacity:0.6;font-size:11px;">— ${esc(fr)}</span>`;
          html += `<span class="at-sim">sim ${esc(m.similarity)}</span>`;
          // Arguments travel as data attributes instead of being spliced into
          // the handler source, so quotes in the values cannot break out of it.
          // generatePresse still receives (drucksache, encoded news URL, button).
          html += `<button class="at-presse-btn" data-ds="${esc(m.drucksache)}" data-url="${esc(encodeURIComponent(n.url))}" onclick="generatePresse(this.dataset.ds, this.dataset.url, this)">PM-Vorschlag</button>`;
          html += '</div>';
        }
        html += '</div>';
      } else {
        html += '<div class="at-matches"><div class="at-matches-label">Keine GWÖ-bewerteten Anträge passen — wäre ein Kandidat für eine neue Bewertung.</div></div>';
      }
      html += '</div>';
    }
    list.innerHTML = html;
  } catch (e) {
    list.innerHTML = `<div style="color:#c00;font-family:var(--font-mono);font-size:12px;">Fehler: ${esc(e)}</div>`;
  }
}
|
||||
|
||||
/**
 * Load the per-day, per-source news counts and draw them as a stacked chart.
 *
 * Requests /api/aktuelle-themen/zeitreihe?days=30, replaces any chart held
 * in _atZeitreiheChart and writes a one-line summary (or an error message)
 * into #at-zeitreihe-meta.
 */
async function loadZeitreihe() {
  const meta = document.getElementById('at-zeitreihe-meta');
  try {
    const r = await fetch('/api/aktuelle-themen/zeitreihe?days=30');
    // Surface HTTP failures as such instead of trying to chart an error payload.
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const data = await r.json();

    if (_atZeitreiheChart) {
      _atZeitreiheChart.destroy();
      // Drop the reference so a later call never touches a destroyed chart.
      _atZeitreiheChart = null;
    }

    if (!data.buckets || !data.buckets.length) {
      meta.textContent = 'Noch keine News-Artikel in der DB.';
      return;
    }

    const sources = data.sources || [];
    const series = data.series || {};
    const colors = ['rgba(0,157,165,0.7)', 'rgba(247,148,29,0.7)', 'rgba(136,158,51,0.7)',
                    'rgba(200,30,30,0.7)', 'rgba(150,100,200,0.7)'];
    const datasets = sources.map((s, i) => {
      const fillColor = colors[i % colors.length];
      return {
        label: s,
        data: series[s] || [],
        backgroundColor: fillColor,
        // Same hue as the fill, fully opaque for the outline.
        borderColor: fillColor.replace('0.7', '1'),
        fill: true,
        tension: 0.2,
      };
    });

    const ctx = document.getElementById('at-zeitreihe-chart');
    _atZeitreiheChart = new Chart(ctx, {
      type: 'line',
      data: { labels: data.buckets, datasets: datasets },
      options: {
        responsive: true,
        scales: {
          y: { beginAtZero: true, stacked: true, title: { display: true, text: 'Artikel/Tag' } },
          x: { title: { display: true, text: 'Datum' } },
        },
        plugins: {
          legend: { position: 'bottom' }
        }
      }
    });

    const total = Object.values(series).reduce((s, arr) => s + arr.reduce((a, b) => a + b, 0), 0);
    meta.textContent = `${total} News-Artikel über ${data.buckets.length} Tage, ${sources.length} Quellen.`;
  } catch (e) {
    meta.textContent = 'Fehler: ' + e;
  }
}
|
||||
|
||||
/**
 * Load the most recently generated press-release drafts and list them.
 *
 * Requests /api/aktuelle-themen/drafts?limit=10 and renders one clickable
 * card per draft into #at-drafts-list; clicking a card calls showDraft with
 * the draft id. All response text is HTML-escaped because the cards are
 * rendered through innerHTML.
 */
async function loadDrafts() {
  // Numeric character references keep this source free of literal entities
  // that an HTML-processing step could decode back into raw characters.
  const esc = (value) => String(value == null ? '' : value)
    .replace(/[&<>"']/g, (ch) => '&#' + ch.charCodeAt(0) + ';');

  const wrap = document.getElementById('at-drafts-list');
  try {
    const r = await fetch('/api/aktuelle-themen/drafts?limit=10');
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const data = await r.json();
    if (!data.drafts || !data.drafts.length) {
      wrap.innerHTML = '<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Noch keine Pressemitteilungen generiert.</div>';
      return;
    }
    wrap.innerHTML = data.drafts.map((d) => [
      // The id travels as a data attribute rather than inside the handler source.
      `<div class="at-news-card" style="cursor:pointer;" data-id="${esc(d.id)}" onclick="showDraft(this.dataset.id)">`,
      `<div class="at-news-head">${esc(atFmtDatum(d.created_at))} · DS ${esc(d.drucksache)} (${esc(d.bundesland)})</div>`,
      `<h4 class="at-news-title">${esc(d.titel)}</h4>`,
      `<div class="at-news-tags">Bezug: ${esc(d.news_titel)}</div>`,
      '</div>',
    ].join('')).join('');
  } catch (e) {
    wrap.innerHTML = `<div style="color:#c00;font-family:var(--font-mono);font-size:12px;">Fehler: ${esc(e)}</div>`;
  }
}
|
||||
|
||||
/**
 * Ask the backend to generate a press-release draft and show it in the modal.
 *
 * @param {string} drucksache Drucksache identifier; URI-encoded here.
 * @param {string} newsUrlEnc News URL that is ALREADY URI-encoded; it is
 *   appended to the query string as-is.
 * @param {HTMLButtonElement} btn Triggering button; disabled while the
 *   request runs and restored afterwards in every case.
 */
async function generatePresse(drucksache, newsUrlEnc, btn) {
  if (!confirm(`Pressemitteilung generieren für Drucksache ${drucksache}?\n\nDas erzeugt einen LLM-Call (~2 Cent).`)) return;
  btn.textContent = '…';
  btn.disabled = true;
  try {
    const r = await fetch(`/api/aktuelle-themen/generate-presse?drucksache=${encodeURIComponent(drucksache)}&news_url=${newsUrlEnc}`, {
      method: 'POST',
    });
    if (!r.ok) {
      // The error body is not guaranteed to be JSON; fall back to the HTTP
      // status instead of letting a parse error mask the real failure.
      let detail = r.statusText || `HTTP ${r.status}`;
      try {
        const err = await r.json();
        if (err && err.detail) {
          detail = typeof err.detail === 'string' ? err.detail : JSON.stringify(err.detail);
        }
      } catch (parseErr) {
        // Non-JSON error body: keep the status-based message.
      }
      alert('Fehler: ' + detail);
      return;
    }
    const data = await r.json();
    showDraftFromData(data);
    loadDrafts();
  } catch (e) {
    alert('Fehler: ' + e);
  } finally {
    // Single place that restores the button, including the early return above.
    btn.textContent = 'PM-Vorschlag';
    btn.disabled = false;
  }
}
|
||||
|
||||
/**
 * Render a draft object into the modal and open it.
 *
 * The modal content is built from DOM nodes with textContent, so the draft
 * body, the news title and the other fields are always shown as plain text
 * and never parsed as markup.
 *
 * @param {Object} d Draft with titel, drucksache, bundesland, news_url,
 *   news_titel and body.
 */
function showDraftFromData(d) {
  const backdrop = document.getElementById('at-modal-backdrop');
  const body = document.getElementById('at-modal-body');
  document.getElementById('at-modal-title').textContent = d.titel;

  const meta = document.createElement('div');
  meta.style.cssText = 'font-family:var(--font-mono);font-size:11px;opacity:0.6;margin-bottom:10px;';
  meta.appendChild(document.createTextNode(`DS ${d.drucksache} (${d.bundesland}) · Bezug zu: `));

  const link = document.createElement('a');
  // Only http(s) targets are linked; other schemes (e.g. javascript:) are neutralised.
  link.href = /^https?:\/\//i.test(String(d.news_url || '')) ? d.news_url : '#';
  link.target = '_blank';
  link.rel = 'noopener';
  link.style.color = 'var(--ecg-teal)';
  link.textContent = d.news_titel;
  meta.appendChild(link);

  const text = document.createElement('div');
  text.style.whiteSpace = 'pre-wrap';
  text.textContent = d.body == null ? '' : String(d.body);

  body.textContent = '';
  body.appendChild(meta);
  body.appendChild(text);
  backdrop.style.display = 'flex';
}
|
||||
|
||||
/**
 * Fetch one stored draft by id and show it in the modal.
 *
 * @param {number|string} id Draft id; URI-encoded into the request path.
 */
async function showDraft(id) {
  try {
    const r = await fetch(`/api/aktuelle-themen/drafts/${encodeURIComponent(id)}`);
    // Without this check an error payload would be handed to the modal
    // renderer as if it were a draft.
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const d = await r.json();
    showDraftFromData(d);
  } catch (e) {
    alert('Fehler: ' + e);
  }
}
|
||||
|
||||
/**
 * Hide the draft modal.
 *
 * Called without an event it always closes. Called with an event it closes
 * only when the event target is the backdrop element itself.
 *
 * @param {Event} [ev] Optional click event.
 */
function atCloseModal(ev) {
  if (ev && ev.target.id !== 'at-modal-backdrop') {
    return;
  }
  const backdrop = document.getElementById('at-modal-backdrop');
  backdrop.style.display = 'none';
}
|
||||
|
||||
// Escape hides the draft modal.
document.addEventListener('keydown', function (e) {
  if (e.key !== 'Escape') {
    return;
  }
  document.getElementById('at-modal-backdrop').style.display = 'none';
});

// Initial load: chart first, then the news list, then the stored drafts.
for (const load of [loadZeitreihe, loadThemen, loadDrafts]) {
  load();
}
|
||||
</script>
|
||||
{% endblock %}
|
||||
371
app/themen_matching.py
Normal file
371
app/themen_matching.py
Normal file
@ -0,0 +1,371 @@
|
||||
"""Themen × Anträge Matching fuer das Aktuelle-Themen-Dashboard
|
||||
(#170 Phase 2).
|
||||
|
||||
Verschneidet News-Artikel-Embeddings (aus news_articles.summary_embedding)
|
||||
mit Antrag-Embeddings (assessments.summary_embedding) per Cosine-Similarity.
|
||||
Liefert pro News-Artikel die Top-K-passendsten Anträge.
|
||||
|
||||
Reuse:
|
||||
- ``embeddings.cosine_similarity`` fuer den Vektor-Vergleich
|
||||
- Beide Tabellen nutzen denselben Embedding-Modell-Vektorraum (qwen v4),
|
||||
daher direkter Cross-Vergleich moeglich
|
||||
- Filter ueber ``embedding_model``-Spalte, falls Migration laeuft
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _load_embeddings(
    db_path: Path,
    table: str,
    select_cols: list[str],
    where_extra: str = "",
    params: tuple = (),
) -> list[dict]:
    """Load rows from a table that has a ``summary_embedding`` column.

    Args:
        db_path: SQLite database file; a missing file yields ``[]``.
        table: Table name, interpolated into the SQL text as-is.
        select_cols: Column names to select, interpolated as-is.
        where_extra: SQL fragment appended after the
            ``summary_embedding IS NOT NULL`` condition.
        params: Bound parameters for placeholders in ``where_extra``.

    Returns:
        One dict per row whose ``embedding_model`` equals
        ``embeddings.EMBEDDING_MODEL_READ`` and whose ``summary_embedding``
        decodes as JSON. Each dict holds the selected columns,
        ``summary_embedding``, ``embedding_model`` and the decoded vector
        under ``_vec``. Other rows are dropped silently.
    """
    from . import embeddings as emb

    if not Path(db_path).exists():
        return []

    column_list = ", ".join(select_cols)
    query = (
        f"SELECT {column_list}, summary_embedding, embedding_model "
        f"FROM {table} "
        f"WHERE summary_embedding IS NOT NULL {where_extra}"
    )
    connection = sqlite3.connect(str(db_path))
    try:
        connection.row_factory = sqlite3.Row
        fetched = connection.execute(query, params).fetchall()
    finally:
        connection.close()

    result = []
    for record in fetched:
        if record["embedding_model"] == emb.EMBEDDING_MODEL_READ:
            try:
                vector = json.loads(record["summary_embedding"])
            except (json.JSONDecodeError, TypeError):
                continue
            result.append({**dict(record), "_vec": vector})
    return result
|
||||
|
||||
|
||||
def find_anträge_for_news(
    news_url: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the assessments most similar to one news article.

    Args:
        news_url: Primary key of the article in ``news_articles``.
        top_k: Maximum number of matches to return.
        min_similarity: Matches with a cosine similarity below this value
            are dropped.
        db_path: SQLite database file; defaults to ``settings.db_path``.

    Returns:
        Up to ``top_k`` dicts sorted by descending ``similarity`` (rounded to
        three decimals). Empty when the database or the article is missing,
        or when the article has no embedding for the current read model.
    """
    from .config import settings
    from . import embeddings as emb

    def _json_list(raw) -> list:
        """Decode a JSON-array column; empty or malformed values become ``[]``."""
        if not raw:
            return []
        try:
            value = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return []
        return value if isinstance(value, list) else []

    path = db_path or settings.db_path
    if not Path(path).exists():
        return []

    # 1. Load the article's vector.
    conn = sqlite3.connect(str(path))
    try:
        row = conn.execute(
            """SELECT summary_embedding, embedding_model
               FROM news_articles WHERE url=?""",
            (news_url,),
        ).fetchone()
    finally:
        conn.close()
    if not row or not row[0] or row[1] != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        news_vec = json.loads(row[0])
    except (json.JSONDecodeError, TypeError):
        return []

    # 2. Score every assessment that has an embedding.
    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "themen", "datum"],
    )
    scored = []
    for a in assessments:
        sim = emb.cosine_similarity(news_vec, a["_vec"])
        if sim < min_similarity:
            continue
        scored.append({
            "drucksache": a["drucksache"],
            "title": a["title"],
            "bundesland": a["bundesland"],
            # Guarded decode: one malformed row must not fail the whole lookup.
            "fraktionen": _json_list(a["fraktionen"]),
            "gwoe_score": a["gwoe_score"],
            "empfehlung": a["empfehlung"],
            "themen": _json_list(a["themen"]),
            "datum": a["datum"],
            "similarity": round(sim, 3),
        })
    scored.sort(key=lambda x: x["similarity"], reverse=True)
    return scored[:top_k]
|
||||
|
||||
|
||||
def find_news_for_antrag(
    drucksache: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    days_window: int = 90,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the recent news articles most similar to one assessment.

    Args:
        drucksache: Key of the assessment in ``assessments``.
        top_k: Maximum number of articles to return.
        min_similarity: Articles with a cosine similarity below this value
            are dropped.
        days_window: Articles dated more than this many days before now are
            dropped. Articles whose date cannot be parsed are kept.
        db_path: SQLite database file; defaults to ``settings.db_path``.

    Returns:
        Up to ``top_k`` dicts sorted by descending ``similarity`` (rounded to
        three decimals). Empty when the database or the assessment is
        missing, or when the assessment has no embedding for the current
        read model.
    """
    from .config import settings
    from . import embeddings as emb

    path = db_path or settings.db_path
    if not Path(path).exists():
        return []

    # 1. Load the assessment's vector.
    conn = sqlite3.connect(str(path))
    try:
        row = conn.execute(
            """SELECT summary_embedding, embedding_model
               FROM assessments WHERE drucksache=?""",
            (drucksache,),
        ).fetchone()
    finally:
        conn.close()
    if not row or not row[0] or row[1] != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        antrag_vec = json.loads(row[0])
    except (json.JSONDecodeError, TypeError):
        return []

    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400

    def _outside_window(datum) -> bool:
        """True when ``datum`` parses to a timestamp older than the cutoff."""
        try:
            news_ts = datetime.fromisoformat(
                datum.replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, AttributeError):
            return False  # unparseable date: let the article through
        return news_ts < cutoff

    # 2. Load the news and score only the articles inside the window.
    news = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )
    scored = []
    for n in news:
        # Cheap date check first, so no similarity is computed for articles
        # that would be discarded anyway.
        if _outside_window(n["datum"]):
            continue
        sim = emb.cosine_similarity(antrag_vec, n["_vec"])
        if sim < min_similarity:
            continue
        try:
            tags = json.loads(n["tags"]) if n["tags"] else []
        except (json.JSONDecodeError, TypeError):
            tags = []
        scored.append({
            "url": n["url"],
            "titel": n["titel"],
            "summary": n["summary"],
            "datum": n["datum"],
            "source": n["source"],
            "ressort": n["ressort"],
            "tags": tags,
            "similarity": round(sim, 3),
        })
    scored.sort(key=lambda x: x["similarity"], reverse=True)
    return scored[:top_k]
|
||||
|
||||
|
||||
def aggregate_top_themen(
    days_window: int = 7,
    top_k: int = 10,
    min_similarity: float = 0.4,
    matches_per_news: int = 3,
    db_path: Optional[Path] = None,
) -> dict:
    """Return the newest news articles, each with its best-matching assessments.

    This is the primary dashboard endpoint.

    Args:
        days_window: Only articles dated within this many days before now.
        top_k: Number of newest articles to return.
        min_similarity: Matches with a cosine similarity below this value
            are dropped.
        matches_per_news: Maximum number of matches kept per article.
        db_path: SQLite database file; defaults to ``settings.db_path``.

    Returns:
        ``{
            "buckets": [{
                "news": {url, titel, summary, datum, source, ressort, tags},
                "matches": [{drucksache, title, gwoe_score, similarity, ...}]
            }, ...],
            "n_total_news": int,   # all loaded articles, before date filtering
            "filter": {days_window, top_k, min_similarity, matches_per_news}
        }``
    """
    from .config import settings
    from . import embeddings as emb

    def _json_list(raw) -> list:
        """Decode a JSON-array column; empty or malformed values become ``[]``."""
        if not raw:
            return []
        try:
            value = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return []
        return value if isinstance(value, list) else []

    # Built once so both return paths report the same set of filter keys.
    filter_info = {
        "days_window": days_window,
        "top_k": top_k,
        "min_similarity": min_similarity,
        "matches_per_news": matches_per_news,
    }

    path = db_path or settings.db_path
    if not Path(path).exists():
        return {"buckets": [], "n_total_news": 0, "filter": filter_info}

    cutoff = (
        datetime.now(timezone.utc).timestamp() - days_window * 86400
    )

    news_rows = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )
    # Keep only articles inside the window; articles without a parseable
    # date are dropped here because they cannot be ordered.
    fresh = []
    for n in news_rows:
        try:
            news_ts = datetime.fromisoformat(
                n["datum"].replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, AttributeError):
            continue
        if news_ts < cutoff:
            continue
        n["_ts"] = news_ts
        fresh.append(n)
    # Newest first, then cut to top_k.
    fresh.sort(key=lambda x: x["_ts"], reverse=True)
    fresh = fresh[:top_k]

    # Nothing to match against: skip loading the assessment vectors entirely.
    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "themen", "datum"],
    ) if fresh else []

    buckets = []
    for n in fresh:
        scored = []
        for a in assessments:
            sim = emb.cosine_similarity(n["_vec"], a["_vec"])
            if sim < min_similarity:
                continue
            scored.append({
                "drucksache": a["drucksache"],
                "title": a["title"],
                "bundesland": a["bundesland"],
                # Guarded decode: one malformed row must not fail the endpoint.
                "fraktionen": _json_list(a["fraktionen"]),
                "gwoe_score": a["gwoe_score"],
                "empfehlung": a["empfehlung"],
                "datum": a["datum"],
                "similarity": round(sim, 3),
            })
        scored.sort(key=lambda x: x["similarity"], reverse=True)
        try:
            tags = json.loads(n["tags"]) if n["tags"] else []
        except (json.JSONDecodeError, TypeError):
            tags = []
        buckets.append({
            "news": {
                "url": n["url"],
                "titel": n["titel"],
                "summary": n["summary"],
                "datum": n["datum"],
                "source": n["source"],
                "ressort": n["ressort"],
                "tags": tags,
            },
            "matches": scored[:matches_per_news],
        })

    return {
        "buckets": buckets,
        "n_total_news": len(news_rows),
        "filter": filter_info,
    }
|
||||
|
||||
|
||||
def aggregate_themen_zeitreihe(
    days_window: int = 30,
    db_path: Optional[Path] = None,
) -> dict:
    """Count news articles per (day, source) over the last ``days_window`` days.

    No assessment matching is involved; the result only describes how many
    articles each source delivered per day.

    Args:
        days_window: Only articles dated within this many days before now.
        db_path: SQLite database file. When falsy, ``settings.db_path`` from
            the package configuration is used instead.

    Returns:
        ``{"buckets": [day, ...], "sources": [source, ...],
        "series": {source: [count per day, ...]}}`` with days
        (``YYYY-MM-DD``, the first ten characters of the stored date) and
        sources sorted ascending. Only days with at least one article appear.
        Rows without a date or with an unparseable date are skipped; rows
        without a source are counted under ``"unbekannt"``.
    """
    if db_path:
        path = db_path
    else:
        # Imported only when needed, so an explicit db_path does not require
        # the settings module to be importable.
        from .config import settings

        path = settings.db_path
    if not Path(path).exists():
        return {"buckets": [], "sources": [], "series": {}}

    cutoff_ts = datetime.now(timezone.utc).timestamp() - days_window * 86400
    conn = sqlite3.connect(str(path))
    try:
        rows = conn.execute(
            "SELECT datum, source FROM news_articles"
        ).fetchall()
    finally:
        conn.close()

    counts: defaultdict[tuple[str, str], int] = defaultdict(int)
    for datum, source in rows:
        if not datum:
            continue
        try:
            ts = datetime.fromisoformat(datum.replace("Z", "+00:00")).timestamp()
        except (ValueError, AttributeError):
            continue
        if ts < cutoff_ts:
            continue
        # A NULL source would make sorted() below fail on mixed None/str.
        counts[(datum[:10], source or "unbekannt")] += 1

    days_sorted = sorted({day for day, _ in counts})
    sources_sorted = sorted({src for _, src in counts})
    series = {
        s: [counts.get((d, s), 0) for d in days_sorted]
        for s in sources_sorted
    }
    return {
        "buckets": days_sorted,
        "sources": sources_sorted,
        "series": series,
    }
|
||||
24
scripts/auto-fetch-news.sh
Executable file
24
scripts/auto-fetch-news.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash
# Aktuelle-Themen dashboard: news aggregator cron job (#170 phase 1).
#
# Runs app.news_aggregator.run_aggregator inside the application container
# and prints the inserted/updated/embedded counters it returns.
# Per the feature description the aggregator is idempotent (URL primary key),
# so the job can simply be re-run after a failure.
#
# Intended to be called by cron every morning, before auto-ingest-protocols.sh.
#
# Usage:
#   auto-fetch-news.sh [CONTAINER]
set -euo pipefail

readonly container_name="${1:-gwoe-antragspruefer}"

# Print a timestamped banner; $1 is an optional suffix such as " done".
banner() {
  printf '=== auto-fetch-news%s %s ===\n' "$1" "$(date -Iseconds)"
}

banner ""

docker exec -i "$container_name" python <<'EOF'
from app.news_aggregator import run_aggregator
stats = run_aggregator()
print(f"News-Aggregator: inserted={stats['inserted']} updated={stats['updated']} embedded={stats['embedded']}")
EOF

banner " done"
|
||||
262
tests/test_news_aggregator.py
Normal file
262
tests/test_news_aggregator.py
Normal file
@ -0,0 +1,262 @@
|
||||
"""Tests fuer app.news_aggregator (#170 Phase 1).
|
||||
|
||||
Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne
|
||||
Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.news_aggregator import (
|
||||
_parse_rss_date,
|
||||
_strip_html,
|
||||
fetch_rss,
|
||||
fetch_tagesschau,
|
||||
upsert_articles,
|
||||
)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Helper
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestStripHtml:
    """Behaviour of ``_strip_html`` on markup, CDATA, entities and whitespace."""

    def test_removes_tags(self):
        assert _strip_html("<p>Hello <b>world</b></p>") == "Hello world"

    def test_decodes_cdata(self):
        assert "Test" in _strip_html("<![CDATA[Test]]>")

    def test_decodes_entities(self):
        # The input must contain a real entity; comparing "a & b" with itself
        # would pass even if no decoding happened at all.
        assert _strip_html("a &amp; b") == "a & b"

    def test_collapses_whitespace(self):
        # Runs of spaces and a newline must each collapse to a single space.
        assert _strip_html("<p>a   b\n  c</p>") == "a b c"

    def test_empty(self):
        assert _strip_html("") == ""
|
||||
|
||||
|
||||
class TestParseRssDate:
    """Behaviour of ``_parse_rss_date`` on RFC-822 dates and on invalid input."""

    def test_rfc822_to_iso(self):
        parsed = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT")
        assert parsed.startswith("2026-04-28")

    def test_invalid_returns_empty(self):
        for raw in ("garbage", ""):
            assert _parse_rss_date(raw) == ""
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# fetch_tagesschau (mocked HTTP)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
SAMPLE_TAGESSCHAU_JSON = json.dumps({
|
||||
"news": [
|
||||
{
|
||||
"title": "Bundestag berät über Wohnungsbau",
|
||||
"firstSentence": "Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.",
|
||||
"shareURL": "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html",
|
||||
"date": "2026-04-28T10:00:00.000+02:00",
|
||||
"ressort": "inland",
|
||||
"tags": [{"tag": "Wohnungsbau"}, {"tag": "Bundestag"}],
|
||||
},
|
||||
{
|
||||
"title": "EU-Kommission stellt Klimapaket vor",
|
||||
"firstSentence": "Die EU plant ehrgeizige Klimaziele.",
|
||||
"shareURL": "https://www.tagesschau.de/ausland/eu-klima-100.html",
|
||||
"date": "2026-04-28T11:00:00.000+02:00",
|
||||
"ressort": "ausland",
|
||||
"tags": [{"tag": "Klima"}, {"tag": "EU"}],
|
||||
},
|
||||
{
|
||||
# Dieser hat keinen shareURL — sollte uebersprungen werden
|
||||
"title": "Kein Link",
|
||||
"firstSentence": "Skip mich",
|
||||
},
|
||||
],
|
||||
}).encode("utf-8")
|
||||
|
||||
|
||||
class TestFetchTagesschau:
    """Tests for ``fetch_tagesschau`` with ``_http_get`` replaced by a mock."""

    @staticmethod
    def _fetch(payload, ressorts):
        """Run ``fetch_tagesschau`` while ``_http_get`` returns ``payload``."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_tagesschau(ressorts=ressorts)

    def test_parses_news_array(self):
        """The two linked sample items are returned with mapped fields."""
        fetched = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland"])
        # De-duplication by URL leaves two unique articles.
        assert len(fetched) == 2
        top = fetched[0]
        assert top["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html"
        assert top["titel"] == "Bundestag berät über Wohnungsbau"
        assert "Wohnungsbau" in top["summary"]
        assert top["source"] == "tagesschau"
        assert top["ressort"] == "inland"
        assert "Wohnungsbau" in top["tags"]

    def test_skips_items_without_link(self):
        """No returned article may carry an empty URL."""
        fetched = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland"])
        missing_url = [entry for entry in fetched if not entry["url"]]
        assert not missing_url

    def test_returns_empty_on_http_error(self):
        """When ``_http_get`` returns ``None`` the result is an empty list."""
        fetched = self._fetch(None, ["inland"])
        assert fetched == []

    def test_dedup_across_ressorts(self):
        """An item appearing in two ressorts is delivered only once."""
        fetched = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland", "ausland"])
        seen_urls = [entry["url"] for entry in fetched]
        assert len(seen_urls) == len(set(seen_urls))
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# fetch_rss (mocked HTTP)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Minimal RSS 2.0 feed (UTF-8 bytes) used as the mocked HTTP response in the
# fetch_rss tests. The first item wraps title/description in CDATA, the
# second uses plain text. The literal must contain nothing but the XML:
# any stray text between the declaration and <rss> makes it ill-formed.
SAMPLE_RSS = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>BT Aktuell</title>
<item>
<title><![CDATA[Bundestag berät Antrag zum Wohnungsbau]]></title>
<link>https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388</link>
<description><![CDATA[Der Bundestag hat heute den Antrag zum Wohnungsbau beraten.]]></description>
<pubDate>Tue, 28 Apr 2026 10:45:12 GMT</pubDate>
</item>
<item>
<title>Antrag zur Klimapolitik</title>
<link>https://www.bundestag.de/klima</link>
<description>Klimaschutz im Bundestag</description>
<pubDate>Mon, 27 Apr 2026 10:00:00 GMT</pubDate>
</item>
</channel></rss>""".encode("utf-8")
|
||||
|
||||
|
||||
class TestFetchRss:
    """Tests for ``fetch_rss`` with ``_http_get`` replaced by a mock."""

    @staticmethod
    def _fetch(source, payload):
        """Run ``fetch_rss`` for ``source`` while ``_http_get`` returns ``payload``."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_rss(source, "https://example.com/rss")

    def test_parses_rss_items(self):
        """Both sample items are parsed; the first carries the expected fields."""
        parsed = self._fetch("bundestag-aktuell", SAMPLE_RSS)
        assert len(parsed) == 2
        head = parsed[0]
        assert "Wohnungsbau" in head["titel"]
        assert head["url"].startswith("https://www.bundestag.de")
        assert head["source"] == "bundestag-aktuell"
        assert head["datum"].startswith("2026-04-28")
        assert "Bundestag" in head["summary"]

    def test_strips_cdata_and_html(self):
        """No CDATA opener may remain in the title or the summary."""
        parsed = self._fetch("bundestag-aktuell", SAMPLE_RSS)
        for entry in parsed:
            for field in ("titel", "summary"):
                assert "<![CDATA[" not in entry[field]

    def test_empty_on_http_error(self):
        """When ``_http_get`` returns ``None`` the result is an empty list."""
        parsed = self._fetch("x", None)
        assert parsed == []

    def test_skips_items_without_title_or_link(self):
        """Items lacking either the title or the link are dropped."""
        incomplete_feed = b"""<?xml version="1.0"?><rss><channel>
<item><title>Nur Titel</title></item>
<item><link>nur-link</link></item>
</channel></rss>"""
        parsed = self._fetch("x", incomplete_feed)
        assert parsed == []
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# upsert_articles
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a temporary SQLite file holding an empty ``news_articles`` table.

    Returns the path of the database file inside ``tmp_path``.
    """
    db_file = tmp_path / "test_news.db"
    schema = """
        CREATE TABLE news_articles (
            url TEXT PRIMARY KEY,
            titel TEXT NOT NULL,
            summary TEXT,
            datum TEXT NOT NULL,
            source TEXT NOT NULL,
            ressort TEXT,
            tags TEXT,
            summary_embedding BLOB,
            embedding_model TEXT,
            fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
    """
    connection = sqlite3.connect(str(db_file))
    connection.execute(schema)
    connection.commit()
    connection.close()
    return db_file
|
||||
|
||||
|
||||
# Two article dicts in the shape passed to upsert_articles by the tests below.
SAMPLE_ARTICLES = [
    dict(
        url="https://example.com/a",
        titel="Wohnungsbau",
        summary="Heute im Bundestag",
        datum="2026-04-28",
        source="tagesschau",
        ressort="inland",
        tags=["Wohnungsbau"],
    ),
    dict(
        url="https://example.com/b",
        titel="Klima",
        summary="EU plant Klimaziele",
        datum="2026-04-28",
        source="tagesschau",
        ressort="ausland",
        tags=["Klima", "EU"],
    ),
]
|
||||
|
||||
|
||||
class TestUpsertArticles:
    """Tests for ``upsert_articles`` against a temporary SQLite database."""

    def test_inserts_new_articles(self, empty_db):
        """Writing into an empty table counts every article as inserted."""
        outcome = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert outcome["inserted"] == 2
        assert outcome["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        """A second run with known URLs counts as update and rewrites the title."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Same URLs again, but every title gets a suffix.
        changed = []
        for article in SAMPLE_ARTICLES:
            changed.append(dict(article, titel=article["titel"] + " (neu)"))
        outcome = upsert_articles(changed, db_path=empty_db, embed=False)
        assert outcome["updated"] == 2
        assert outcome["inserted"] == 0
        # Read the stored title back to confirm it was overwritten.
        first_url = SAMPLE_ARTICLES[0]["url"]
        connection = sqlite3.connect(str(empty_db))
        cursor = connection.execute(
            "SELECT titel FROM news_articles WHERE url=?",
            (first_url,),
        )
        stored = cursor.fetchone()
        connection.close()
        assert stored[0].endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        """The ``tags`` column holds a JSON document equal to the tag list."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        first_url = SAMPLE_ARTICLES[0]["url"]
        connection = sqlite3.connect(str(empty_db))
        cursor = connection.execute(
            "SELECT tags FROM news_articles WHERE url=?",
            (first_url,),
        )
        stored = cursor.fetchone()
        connection.close()
        decoded = json.loads(stored[0])
        assert decoded == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        """A non-existent database path yields all-zero statistics."""
        absent = tmp_path / "missing.db"
        outcome = upsert_articles(SAMPLE_ARTICLES, db_path=absent, embed=False)
        assert outcome == {"inserted": 0, "updated": 0, "embedded": 0}
|
||||
# ── tests/test_presse_generator.py (new file, +224 lines) ──
|
||||
"""Tests fuer app.presse_generator (#170 Phase 4)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.presse_generator import (
|
||||
_build_user_prompt,
|
||||
generate_draft,
|
||||
get_draft,
|
||||
list_drafts,
|
||||
)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Fixture: DB mit Antrag + News
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture
def db_with_antrag_and_news(tmp_path: Path) -> Path:
    """Build a temporary SQLite DB with one assessment and one news article.

    Creates the ``assessments``, ``news_articles`` and (empty)
    ``presse_drafts`` tables, inserts Drucksache ``18/A`` and the article
    ``https://example.com/wohnen``, and returns the database path.
    """
    db_file = tmp_path / "test_presse.db"
    table_definitions = (
        """
        CREATE TABLE assessments (
            drucksache TEXT PRIMARY KEY,
            title TEXT,
            bundesland TEXT,
            antrag_zusammenfassung TEXT,
            gwoe_score REAL,
            gwoe_begruendung TEXT,
            empfehlung TEXT
        )
        """,
        """
        CREATE TABLE news_articles (
            url TEXT PRIMARY KEY,
            titel TEXT NOT NULL,
            summary TEXT
        )
        """,
        """
        CREATE TABLE presse_drafts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            drucksache TEXT NOT NULL,
            bundesland TEXT NOT NULL,
            news_url TEXT NOT NULL,
            news_titel TEXT NOT NULL,
            titel TEXT NOT NULL,
            body TEXT NOT NULL,
            model TEXT NOT NULL,
            created_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
        """,
    )
    assessment_row = (
        "18/A", "Wohnungsbau-Reform-Antrag", "NRW",
        "Antrag fuer mehr sozialen Wohnungsbau",
        8.5, "Stark gemeinwohlorientiert",
        "Uneingeschränkt unterstützen",
    )
    news_row = (
        "https://example.com/wohnen",
        "Wohnungsmarkt im Umbruch",
        "Die Mietpreise steigen weiter, der Bundestag berät heute",
    )

    connection = sqlite3.connect(str(db_file))
    for ddl in table_definitions:
        connection.execute(ddl)
    connection.execute(
        """INSERT INTO assessments
           (drucksache, title, bundesland, antrag_zusammenfassung,
            gwoe_score, gwoe_begruendung, empfehlung)
           VALUES (?, ?, ?, ?, ?, ?, ?)""",
        assessment_row,
    )
    connection.execute(
        "INSERT INTO news_articles (url, titel, summary) VALUES (?, ?, ?)",
        news_row,
    )
    connection.commit()
    connection.close()
    return db_file
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# _build_user_prompt
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestBuildUserPrompt:
    """Tests for the text produced by ``_build_user_prompt``."""

    def test_includes_drucksache(self):
        """Drucksache, Bundesland, score and news title all show up in the prompt."""
        arguments = {
            "drucksache": "18/A",
            "bundesland": "NRW",
            "antrag_titel": "Test",
            "antrag_zusammenfassung": "Summary",
            "gwoe_score": 7.5,
            "gwoe_begruendung": "ok",
            "empfehlung": "Unterstützen",
            "news_titel": "News",
            "news_summary": "Lead",
            "news_url": "https://example.com",
        }
        prompt = _build_user_prompt(**arguments)
        for needle in ("18/A", "NRW", "7.5", "News"):
            assert needle in prompt

    def test_handles_missing_zusammenfassung(self):
        """With empty text fields the prompt contains the placeholder text."""
        arguments = {
            "drucksache": "x",
            "bundesland": "x",
            "antrag_titel": "x",
            "antrag_zusammenfassung": "",
            "gwoe_score": 5.0,
            "gwoe_begruendung": "",
            "empfehlung": "",
            "news_titel": "x",
            "news_summary": "",
            "news_url": "",
        }
        prompt = _build_user_prompt(**arguments)
        assert "(keine vorhanden)" in prompt
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# generate_draft (mocked QwenBewerter)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class FakeBewerter:
    """Stand-in for QwenBewerter that always answers with a canned response."""

    def __init__(self, response: dict):
        # No request has been seen until bewerte() is awaited.
        self.last_request = None
        self._response = response

    async def bewerte(self, request):
        """Record ``request`` and hand back the canned response unchanged."""
        self.last_request = request
        canned = self._response
        return canned
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_generate_draft_persists_record(db_with_antrag_and_news, monkeypatch):
    """A successful generation returns the draft and stores it in ``presse_drafts``."""
    bewerter = FakeBewerter({
        "titel": "Wohnungsbau jetzt",
        # Repeated ten times to get a long body.
        "body": "Der vorliegende Antrag der Drucksache 18/A ..." * 10,
    })
    # Pin settings.llm_model_default so the test does not depend on the
    # configured model (presumably the value written to the NOT NULL
    # ``model`` column on insert — confirm against generate_draft).
    from app.config import settings as real_settings
    monkeypatch.setattr(real_settings, "llm_model_default", "qwen-test")

    result = await generate_draft(
        drucksache="18/A",
        news_url="https://example.com/wohnen",
        db_path=db_with_antrag_and_news,
        bewerter=bewerter,
    )

    assert result["id"] == 1
    assert result["drucksache"] == "18/A"
    assert result["bundesland"] == "NRW"
    assert result["news_titel"] == "Wohnungsmarkt im Umbruch"
    assert result["titel"] == "Wohnungsbau jetzt"
    assert "18/A" in result["body"]

    # The test name promises persistence, so check the row really exists
    # instead of trusting the returned dict alone.
    conn = sqlite3.connect(str(db_with_antrag_and_news))
    try:
        row = conn.execute(
            "SELECT drucksache FROM presse_drafts WHERE id = ?",
            (result["id"],),
        ).fetchone()
    finally:
        conn.close()
    assert row is not None
    assert row[0] == "18/A"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_generate_draft_unknown_drucksache(db_with_antrag_and_news):
    """A Drucksache that is not in the fixture DB raises ``ValueError``."""
    call_kwargs = {
        "drucksache": "99/MISSING",
        "news_url": "https://example.com/wohnen",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "x", "body": "y"}),
    }
    with pytest.raises(ValueError, match="Drucksache"):
        await generate_draft(**call_kwargs)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_generate_draft_unknown_news(db_with_antrag_and_news):
    """A news URL that is not in the fixture DB raises ``ValueError``."""
    call_kwargs = {
        "drucksache": "18/A",
        "news_url": "https://example.com/missing",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "x", "body": "y"}),
    }
    with pytest.raises(ValueError, match="News-URL"):
        await generate_draft(**call_kwargs)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_generate_draft_empty_response_raises(db_with_antrag_and_news, monkeypatch):
    """An LLM answer with empty title and body raises ``ValueError``."""
    from app.config import settings as real_settings

    monkeypatch.setattr(real_settings, "llm_model_default", "qwen-test")
    call_kwargs = {
        "drucksache": "18/A",
        "news_url": "https://example.com/wohnen",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "", "body": ""}),
    }
    with pytest.raises(ValueError, match="unvollständig"):
        await generate_draft(**call_kwargs)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# list_drafts + get_draft
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestListAndGetDrafts:
    """Tests for ``list_drafts`` and ``get_draft``."""

    def test_empty(self, db_with_antrag_and_news):
        """Without stored drafts the list is empty and a lookup gives ``None``."""
        db_file = db_with_antrag_and_news
        assert list_drafts(db_path=db_file) == []
        assert get_draft(99, db_path=db_file) is None

    def test_after_insert(self, db_with_antrag_and_news):
        """A directly inserted row is visible via both accessors."""
        db_file = db_with_antrag_and_news
        # Test setup: write the draft straight into the table.
        draft_row = (
            "18/A", "NRW", "https://x.de/n", "News-Titel",
            "PM-Titel", "PM-Body", "test-model",
        )
        connection = sqlite3.connect(str(db_file))
        connection.execute(
            """INSERT INTO presse_drafts
               (drucksache, bundesland, news_url, news_titel, titel, body, model)
               VALUES (?, ?, ?, ?, ?, ?, ?)""",
            draft_row,
        )
        connection.commit()
        connection.close()

        listed = list_drafts(db_path=db_file)
        assert len(listed) == 1
        only = listed[0]
        assert only["drucksache"] == "18/A"
        assert only["titel"] == "PM-Titel"

        detail = get_draft(only["id"], db_path=db_file)
        assert detail is not None
        assert detail["body"] == "PM-Body"
|
||||
# ── tests/test_themen_matching.py (new file, +297 lines) ──
|
||||
"""Tests fuer app.themen_matching (#170 Phase 2)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.themen_matching import (
|
||||
aggregate_themen_zeitreihe,
|
||||
aggregate_top_themen,
|
||||
find_anträge_for_news,
|
||||
find_news_for_antrag,
|
||||
)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Fixture: DB mit News + Assessments + Embeddings
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _vec(dim: int = 8, val: float = 0.1) -> bytes:
|
||||
"""Konstruiert einen einfachen Vektor als JSON-Bytes."""
|
||||
return json.dumps([val] * dim).encode()
|
||||
|
||||
|
||||
def _vec_from(values: list[float]) -> bytes:
|
||||
return json.dumps(values).encode()
|
||||
|
||||
|
||||
@pytest.fixture
def populated_db(tmp_path: Path) -> Path:
    """Build a temporary SQLite DB with news, assessments and embeddings.

    Contents:
    - three news articles: two dated "now" (n1, n2) and one 200 days old (n3);
    - three assessments (18/A, 18/B, 18/C) dated April 2026.

    Every row carries an 8-dimensional JSON-encoded embedding and the model
    name ``qwen-embedding-v4``. Returns the database path.
    """
    db = tmp_path / "test_match.db"
    conn = sqlite3.connect(str(db))
    try:
        conn.execute("""
            CREATE TABLE news_articles (
                url TEXT PRIMARY KEY,
                titel TEXT NOT NULL,
                summary TEXT,
                datum TEXT NOT NULL,
                source TEXT NOT NULL,
                ressort TEXT,
                tags TEXT,
                summary_embedding BLOB,
                embedding_model TEXT,
                fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        """)
        conn.execute("""
            CREATE TABLE assessments (
                drucksache TEXT PRIMARY KEY,
                title TEXT,
                fraktionen TEXT,
                datum TEXT,
                link TEXT,
                bundesland TEXT,
                gwoe_score REAL,
                gwoe_begruendung TEXT,
                gwoe_matrix TEXT,
                gwoe_schwerpunkt TEXT,
                wahlprogramm_scores TEXT,
                verbesserungen TEXT,
                staerken TEXT,
                schwaechen TEXT,
                empfehlung TEXT,
                empfehlung_symbol TEXT,
                verbesserungspotenzial TEXT,
                themen TEXT,
                antrag_zusammenfassung TEXT,
                antrag_kernpunkte TEXT,
                source TEXT,
                model TEXT,
                created_at TEXT,
                updated_at TEXT,
                summary_embedding BLOB,
                embedding_model TEXT
            )
        """)

        # Read the clock once so both timestamps derive from the same instant.
        now = datetime.now(timezone.utc)
        today = now.isoformat()
        old = (now - timedelta(days=200)).isoformat()

        # News articles with differently oriented embeddings.
        news = [
            # Housing news (vector points along axis 0).
            ("https://example.com/n1", "Wohnungsbau-Reform",
             "Bundestag berät Wohnungsbau", today, "tagesschau", "inland",
             '["Wohnungsbau"]',
             _vec_from([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            # Climate news (vector points along axis 1).
            ("https://example.com/n2", "Klimaschutzgesetz",
             "EU plant Klimaziele", today, "tagesschau", "ausland",
             '["Klima"]',
             _vec_from([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            # Old news; the time-window tests expect it to be filtered out.
            ("https://example.com/n3", "Alte News", "", old, "tagesschau", "inland",
             '[]', _vec_from([0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        ]
        for url, titel, summary, datum, source, ressort, tags, vec in news:
            conn.execute(
                """INSERT INTO news_articles
                   (url, titel, summary, datum, source, ressort, tags,
                    summary_embedding, embedding_model)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'qwen-embedding-v4')""",
                (url, titel, summary, datum, source, ressort, tags, vec),
            )

        # Assessments with embeddings:
        # - 18/A lines up with the housing news (mostly axis 0)
        # - 18/B lines up with the climate news (mostly axis 1)
        # - 18/C only has weight on axis 4, so it is orthogonal to all news
        # Naive local timestamp, exactly as before; it is only written to
        # created_at/updated_at.
        now_iso = datetime.now().isoformat()
        assessments = [
            ("18/A", "Wohnungsbau-Antrag", '["GRÜNE"]', "2026-04-15", "NRW",
             8.0, "Uneingeschränkt unterstützen",
             _vec_from([0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            ("18/B", "Klima-Antrag", '["SPD"]', "2026-04-16", "NRW",
             7.0, "Unterstützen mit Änderungen",
             _vec_from([0.0, 0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0])),
            ("18/C", "Sonstiges", '["CDU"]', "2026-04-17", "NRW",
             5.0, "Überarbeiten",
             _vec_from([0.0, 0.0, 0.0, 0.0, 0.95, 0.0, 0.0, 0.0])),
        ]
        for ds, title, fr, dat, bl, sc, emp, vec in assessments:
            conn.execute(
                """INSERT INTO assessments
                   (drucksache, title, fraktionen, datum, bundesland, gwoe_score,
                    empfehlung, themen, source, model, created_at, updated_at,
                    summary_embedding, embedding_model)
                   VALUES (?, ?, ?, ?, ?, ?, ?, '[]', 'test', 'test', ?, ?,
                           ?, 'qwen-embedding-v4')""",
                (ds, title, fr, dat, bl, sc, emp, now_iso, now_iso, vec),
            )

        conn.commit()
    finally:
        # Release the file handle even if schema creation or an insert fails.
        conn.close()
    return db
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_embedding_model():
    """Pin ``app.embeddings.EMBEDDING_MODEL_READ`` to ``qwen-embedding-v4`` per test."""
    patcher = patch("app.embeddings.EMBEDDING_MODEL_READ", "qwen-embedding-v4")
    patcher.start()
    try:
        yield
    finally:
        # Restore the real value once the test has finished.
        patcher.stop()
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# find_anträge_for_news
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestFindAnträgeForNews:
    """Tests for ``find_anträge_for_news`` against the populated fixture DB."""

    def test_wohnungsbau_news_matches_wohnungsbau_antrag(self, populated_db):
        """The housing news ranks assessment 18/A first with similarity > 0.9."""
        matches = find_anträge_for_news(
            "https://example.com/n1",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(matches) >= 1
        # The best match is expected to be 18/A.
        best = matches[0]
        assert best["drucksache"] == "18/A"
        assert best["similarity"] > 0.9

    def test_klima_news_matches_klima_antrag(self, populated_db):
        """The climate news ranks assessment 18/B first."""
        matches = find_anträge_for_news(
            "https://example.com/n2",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(matches) >= 1
        assert matches[0]["drucksache"] == "18/B"

    def test_min_similarity_filters_orthogonal(self, populated_db):
        """With a high cutoff no orthogonal assessment may be returned."""
        matches = find_anträge_for_news(
            "https://example.com/n1",
            db_path=populated_db,
            min_similarity=0.9,
        )
        returned_ids = [match["drucksache"] for match in matches]
        # 18/C is orthogonal to every news vector.
        assert "18/C" not in returned_ids

    def test_unknown_news_returns_empty(self, populated_db):
        """A URL that is not in the DB yields an empty list."""
        matches = find_anträge_for_news(
            "https://example.com/missing",
            db_path=populated_db,
        )
        assert matches == []

    def test_empty_db(self, tmp_path):
        """A non-existent database path yields an empty list."""
        absent = tmp_path / "missing.db"
        matches = find_anträge_for_news("x", db_path=absent)
        assert matches == []
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# find_news_for_antrag
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestFindNewsForAntrag:
    """Tests for ``find_news_for_antrag`` against the populated fixture DB."""

    def test_wohnungsbau_antrag_matches_wohnungsbau_news(self, populated_db):
        """Assessment 18/A ranks the housing news (n1) first."""
        hits = find_news_for_antrag(
            "18/A",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(hits) >= 1
        assert hits[0]["url"] == "https://example.com/n1"

    def test_old_news_filtered_out(self, populated_db):
        """The 200-day-old article must not appear inside a 90-day window."""
        hits = find_news_for_antrag(
            "18/A",
            db_path=populated_db,
            min_similarity=0.0,
            days_window=90,
        )
        returned_urls = [hit["url"] for hit in hits]
        assert "https://example.com/n3" not in returned_urls

    def test_top_k_limits(self, populated_db):
        """``top_k=1`` returns at most the single best match."""
        hits = find_news_for_antrag(
            "18/A",
            db_path=populated_db,
            min_similarity=0.0,
            top_k=1,
        )
        assert len(hits) <= 1

    def test_unknown_antrag(self, populated_db):
        """A Drucksache that is not in the DB yields an empty list."""
        hits = find_news_for_antrag("99/Missing", db_path=populated_db)
        assert hits == []
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# aggregate_top_themen
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestAggregateTopThemen:
    """Tests for ``aggregate_top_themen`` against the populated fixture DB."""

    def test_returns_buckets(self, populated_db):
        """Both current news articles produce a bucket; the total is reported."""
        result = aggregate_top_themen(
            db_path=populated_db, min_similarity=0.5,
        )
        # There are two news articles dated today, each with a match.
        assert len(result["buckets"]) == 2
        assert "n_total_news" in result

    def test_each_bucket_has_news_and_matches(self, populated_db):
        """Every bucket exposes ``news`` (with url/titel) and ``matches``."""
        result = aggregate_top_themen(
            db_path=populated_db, min_similarity=0.5,
        )
        for b in result["buckets"]:
            assert "news" in b
            assert "matches" in b
            assert "url" in b["news"]
            assert "titel" in b["news"]

    def test_days_window_filter(self, populated_db):
        """With a small window only fresh news remains; the old article is out."""
        result = aggregate_top_themen(
            db_path=populated_db, days_window=7, min_similarity=0.5,
        )
        for b in result["buckets"]:
            assert b["news"]["url"] != "https://example.com/n3"

    def test_min_similarity_filter(self, populated_db):
        """With a high ``min_similarity`` no weaker match may survive."""
        cutoff = 0.99
        result = aggregate_top_themen(
            db_path=populated_db, min_similarity=cutoff,
        )
        # ``min_similarity`` is a lower bound, so a match sitting exactly on
        # the cutoff is legitimate: compare inclusively, not strictly.
        for b in result["buckets"]:
            for m in b["matches"]:
                assert m["similarity"] >= cutoff
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# aggregate_themen_zeitreihe
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestAggregateZeitreihe:
    """Tests for ``aggregate_themen_zeitreihe`` against the populated fixture DB."""

    def test_structure(self, populated_db):
        """The result exposes the keys ``buckets``, ``sources`` and ``series``."""
        timeline = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        for key in ("buckets", "sources", "series"):
            assert key in timeline

    def test_only_recent(self, populated_db):
        """With ``days_window=7`` the old article must not be counted."""
        timeline = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        # Only today's news (n1, n2) count; n3 is 200 days old.
        counted = 0
        for per_source in timeline["series"].values():
            counted += sum(per_source)
        assert counted == 2

    def test_series_aligned(self, populated_db):
        """Each source's series has exactly as many entries as there are buckets."""
        timeline = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        bucket_count = len(timeline["buckets"])
        for source in timeline["sources"]:
            assert len(timeline["series"][source]) == bucket_count
|
||||
Loading…
Reference in New Issue
Block a user