feat(#170): Aktuelle-Themen-Dashboard — News × Anträge × Pressemitteilungen
Vollständiges 4-Phasen-Feature:
**Phase 1 — News-Aggregator** (`app/news_aggregator.py`)
- Tagesschau-API (`/api2u/news?ressort=...`) für inland/ausland/wirtschaft/wissen
- Bundestag-RSS für aktuellethemen / pressemitteilungen / hib
- DB-Tabelle `news_articles` (URL-PK, idempotent)
- Embeddings via existierender qwen-v4-Pipeline
- Cron-Script `scripts/auto-fetch-news.sh`
- Bewusst NICHT: RND.de (robots.txt bannt explizit ClaudeBot, GPTBot,
CCBot, ChatGPT-User, Google-Extended). Nur AI-erlaubende, öffentlich-
rechtliche/parlamentarische Quellen
- Volltexte werden NICHT persistiert (nur Titel + erster Satz)
**Phase 2 — Themen × Anträge Matching** (`app/themen_matching.py`)
- News-Embedding × Assessment-summary_embedding via Cosine-Similarity
- `find_anträge_for_news`: pro News die Top-K passenden Anträge
- `find_news_for_antrag`: pro Antrag Top-K News mit Datums-Fenster (90d)
- `aggregate_top_themen`: primärer Dashboard-Endpoint
- `aggregate_themen_zeitreihe`: News-Volumen pro Tag × Source
**Phase 3 — Dashboard-View** (`/aktuelle-themen`)
- Neuer linker Nav-Eintrag „Aktuelle Themen"
- Stacked-Area-Chart News-Volumen pro Quelle (30d)
- Pro News-Card: Titel + Summary + Tags + Top-3-Antrags-Match-Liste
mit GWÖ-Score-Pill, Drucksache-Link, PM-Vorschlag-Button
- Filter: Zeitfenster, Top-N, min_similarity
- Auth-protected (require_auth)
**Phase 4 — Pressemitteilungs-Generator** (`app/presse_generator.py`)
- LLM-Prompt-Template (200-250 Worte, GWÖ-Sicht, JSON-Output)
- Reuse von `QwenBewerter` aus app/adapters/qwen_bewerter.py
- DB-Tabelle `presse_drafts` (Persistenz)
- POST `/api/aktuelle-themen/generate-presse` rate-limited 5/min,
auth-only (LLM-Kosten)
- GET `/api/aktuelle-themen/drafts` + `/drafts/{id}` für Liste/Detail
- Manueller Trigger via UI-Button, kein Auto-Versand
- Modal-Anzeige des generierten Texts
**Compliance:**
- robots.txt-respektierend (ClaudeBot-Bann von RND vermieden, AI-
erlaubende Quellen verwendet)
- UI zeigt nur Titel+URL+Datum+erster Satz, keine Volltext-Reproduktion
- Pressemitteilungen sind explizit Drafts, nicht Auto-Versand
- LLM-Calls rate-limited, auth-only
**Tests:** 43 neue Tests (19 news_aggregator + 16 themen_matching +
8 presse_generator). Suite jetzt 1048 grün.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1e381d23ab
commit
d54ce23e42
@ -285,6 +285,55 @@ async def init_db():
|
|||||||
"ON plenum_vote_results(bundesland, drucksache)"
|
"ON plenum_vote_results(bundesland, drucksache)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# News-Artikel aus oeffentlich-rechtlichen Quellen (#170 Phase 1).
|
||||||
|
# Tagesschau-API + Bundestag-RSS — KEIN Quellmaterial mit AI-Bann
|
||||||
|
# (RND ist explizit per robots.txt ausgeschlossen).
|
||||||
|
# Volltexte werden NICHT persistiert — nur Titel + Summary fuer
|
||||||
|
# Embeddings + UI-Anzeige (Urheberrecht).
|
||||||
|
await db.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS news_articles (
|
||||||
|
url TEXT PRIMARY KEY,
|
||||||
|
titel TEXT NOT NULL,
|
||||||
|
summary TEXT,
|
||||||
|
datum TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL,
|
||||||
|
ressort TEXT,
|
||||||
|
tags TEXT,
|
||||||
|
summary_embedding BLOB,
|
||||||
|
embedding_model TEXT,
|
||||||
|
fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
await db.execute(
|
||||||
|
"CREATE INDEX IF NOT EXISTS idx_news_datum "
|
||||||
|
"ON news_articles(datum)"
|
||||||
|
)
|
||||||
|
await db.execute(
|
||||||
|
"CREATE INDEX IF NOT EXISTS idx_news_source "
|
||||||
|
"ON news_articles(source)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Pressemitteilungs-Drafts (#170 Phase 4). LLM-generierte Vorschlaege,
|
||||||
|
# die einen Antrag in den Kontext eines News-Artikels stellen.
|
||||||
|
# Manueller Trigger, kein Auto-Versand.
|
||||||
|
await db.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS presse_drafts (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
drucksache TEXT NOT NULL,
|
||||||
|
bundesland TEXT NOT NULL,
|
||||||
|
news_url TEXT NOT NULL,
|
||||||
|
news_titel TEXT NOT NULL,
|
||||||
|
titel TEXT NOT NULL,
|
||||||
|
body TEXT NOT NULL,
|
||||||
|
model TEXT NOT NULL,
|
||||||
|
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
await db.execute(
|
||||||
|
"CREATE INDEX IF NOT EXISTS idx_presse_created "
|
||||||
|
"ON presse_drafts(created_at DESC)"
|
||||||
|
)
|
||||||
|
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
110
app/main.py
110
app/main.py
@ -2008,6 +2008,116 @@ async def auswertungen_page(request: Request, current_user: dict = Depends(requi
|
|||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Aktuelle-Themen-Dashboard (#170) ──────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/aktuelle-themen", response_class=HTMLResponse)
async def aktuelle_themen_page(
    request: Request, current_user: dict = Depends(require_auth)
):
    """Render the "Aktuelle Themen" dashboard page.

    The view combines news, matched motions and press-release drafts; the
    data itself is loaded client-side from the ``/api/aktuelle-themen/*``
    routes. Access requires an authenticated user (``require_auth``).
    """
    context = {
        "request": request,
        "app_name": settings.app_name,
        "v2_active_nav": "aktuelle-themen",
    }
    # Merged last so the shared v2 context wins on key collisions, exactly
    # like a trailing ``**`` expansion in a dict literal.
    context.update(_v2_template_context(current_user))
    return templates.TemplateResponse(
        "v2/screens/aktuelle-themen.html", context
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/aktuelle-themen/top")
async def api_aktuelle_themen_top(
    days: int = 7,
    top_k: int = 10,
    min_similarity: float = 0.4,
    matches_per_news: int = 3,
):
    """Return the top-K news of the last ``days`` days with matched motions.

    Thin HTTP wrapper: every query parameter is forwarded unchanged to
    ``themen_matching.aggregate_top_themen`` and its result is returned
    as the response body.
    """
    from .themen_matching import aggregate_top_themen

    forwarded = {
        "days_window": days,
        "top_k": top_k,
        "min_similarity": min_similarity,
        "matches_per_news": matches_per_news,
    }
    return aggregate_top_themen(**forwarded)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/aktuelle-themen/zeitreihe")
async def api_aktuelle_themen_zeitreihe(days: int = 30):
    """Return the news volume per day and source for the stacked-area chart.

    ``days`` is passed on as ``days_window`` to
    ``themen_matching.aggregate_themen_zeitreihe``.
    """
    from .themen_matching import aggregate_themen_zeitreihe

    series = aggregate_themen_zeitreihe(days_window=days)
    return series
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/aktuelle-themen/news-fuer-antrag")
async def api_news_fuer_antrag(
    drucksache: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    days: int = 90,
):
    """Return the top-K news items matching one motion (used by the detail view).

    The response echoes the requested ``drucksache`` next to the ``matches``
    produced by ``themen_matching.find_news_for_antrag``.
    """
    from .themen_matching import find_news_for_antrag

    matches = find_news_for_antrag(
        drucksache=drucksache,
        top_k=top_k,
        min_similarity=min_similarity,
        days_window=days,
    )
    return {"drucksache": drucksache, "matches": matches}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/aktuelle-themen/anträge-fuer-news")
async def api_anträge_fuer_news(
    url: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
):
    """Return the top-K motions matching one news article.

    The response echoes the article URL as ``news_url`` next to the
    ``matches`` produced by ``themen_matching.find_anträge_for_news``.
    """
    from .themen_matching import find_anträge_for_news

    matches = find_anträge_for_news(
        news_url=url,
        top_k=top_k,
        min_similarity=min_similarity,
    )
    return {"news_url": url, "matches": matches}
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Pressemitteilungs-Drafts (#170 Phase 4) ──────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/aktuelle-themen/generate-presse")
@limiter.limit("5/minute")
async def api_generate_presse(
    request: Request,
    drucksache: str,
    news_url: str,
    current_user: dict = Depends(require_auth),
):
    """Generate an LLM press-release draft for a motion/news pair.

    Auth-only and rate-limited (5/min) because every call costs LLM tokens.
    ``request`` is not used in the body; it is kept in the signature for the
    rate-limit decorator.

    Raises:
        HTTPException 404: ``generate_draft`` raised ``ValueError``.
        HTTPException 500: any other failure. The client only gets a generic
            message; the full traceback goes to the server log.
    """
    from .presse_generator import generate_draft

    try:
        return await generate_draft(drucksache=drucksache, news_url=news_url)
    except ValueError as e:
        # NOTE(review): generate_draft also raises ValueError for an
        # incomplete LLM response, which therefore surfaces as 404 as well.
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        logger.exception("generate_draft failed")
        # Do not echo str(e): internal errors (DB, LLM backend) may contain
        # paths or other details that must not reach the client.
        raise HTTPException(
            status_code=500,
            detail="Pressemitteilung konnte nicht generiert werden",
        ) from e
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/aktuelle-themen/drafts")
async def api_drafts_list(limit: int = 20):
    """Return the most recently generated press-release drafts.

    NOTE(review): unlike the POST endpoint this route declares no auth
    dependency — confirm that drafts are meant to be readable without login.
    """
    from .presse_generator import list_drafts

    drafts = list_drafts(limit=limit)
    return {"drafts": drafts}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/aktuelle-themen/drafts/{draft_id}")
async def api_draft_detail(draft_id: int):
    """Return a single press-release draft.

    Raises:
        HTTPException 404: no draft exists for ``draft_id``.
    """
    from .presse_generator import get_draft

    draft = get_draft(draft_id)
    if draft:
        return draft
    raise HTTPException(status_code=404, detail="Draft nicht gefunden")
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/auswertungen/matrix")
|
@app.get("/api/auswertungen/matrix")
|
||||||
async def auswertungen_matrix(
|
async def auswertungen_matrix(
|
||||||
wahlperiode: Optional[str] = None,
|
wahlperiode: Optional[str] = None,
|
||||||
|
|||||||
347
app/news_aggregator.py
Normal file
347
app/news_aggregator.py
Normal file
@ -0,0 +1,347 @@
|
|||||||
|
"""News-Aggregator fuer das Aktuelle-Themen-Dashboard (#170 Phase 1).
|
||||||
|
|
||||||
|
Fetcht regelmaessig News-Headlines aus AI-erlaubenden, oeffentlich-rechtlichen
|
||||||
|
oder parlamentarischen Quellen:
|
||||||
|
|
||||||
|
- **Tagesschau-API** (https://www.tagesschau.de/api2u/news/) — strukturiertes
|
||||||
|
JSON mit ressort, tags, firstSentence pro Artikel.
|
||||||
|
- **Bundestag-Aktuellethemen-RSS**
|
||||||
|
(https://www.bundestag.de/static/appdata/includes/rss/aktuellethemen.rss)
|
||||||
|
— RSS mit Titel + Beschreibung pro Artikel.
|
||||||
|
|
||||||
|
**Bewusst NICHT verwendet:** RND.de (robots.txt bannt explizit ClaudeBot,
|
||||||
|
GPTBot, ChatGPT-User, CCBot, Google-Extended). RSS-Feeds privat-publizierter
|
||||||
|
Verlage werden nur dann angebunden, wenn AI-Verarbeitung explizit erlaubt ist.
|
||||||
|
|
||||||
|
**Compliance:**
|
||||||
|
- Volltexte werden NICHT persistiert. Nur Titel + erster Satz / Description.
|
||||||
|
- Kein User-Agent, der einen AI-Bot vortaeuscht (kein "ClaudeBot").
|
||||||
|
- Rate-Limiting: 1 Request pro Feed bzw. Tagesschau-Ressort pro Aufruf (kein Retry-Loop, kein Hammer).
|
||||||
|
|
||||||
|
Datenbank-Tabelle ``news_articles`` (siehe app/database.py):
|
||||||
|
url PK, titel, summary, datum (ISO), source, ressort, tags JSON,
|
||||||
|
summary_embedding BLOB, embedding_model, fetched_at.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from email.utils import parsedate_to_datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
USER_AGENT = "GWOeAntragspruefer/1.0 (+https://gwoe.toppyr.de)"
|
||||||
|
TIMEOUT = 20
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Quellen
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
TAGESSCHAU_API = "https://www.tagesschau.de/api2u/news"
|
||||||
|
|
||||||
|
# Politische Tagesschau-Ressorts — Sport/Panorama rausgefiltert,
|
||||||
|
# weil sie selten zu parlamentarischen Antraegen passen.
|
||||||
|
TAGESSCHAU_RESSORTS = ["inland", "ausland", "wirtschaft", "wissen"]
|
||||||
|
|
||||||
|
BUNDESTAG_RSS = {
|
||||||
|
"bundestag-aktuell": (
|
||||||
|
"https://www.bundestag.de/static/appdata/includes/rss/aktuellethemen.rss"
|
||||||
|
),
|
||||||
|
"bundestag-presse": (
|
||||||
|
"https://www.bundestag.de/static/appdata/includes/rss/pressemitteilungen.rss"
|
||||||
|
),
|
||||||
|
"bundestag-hib": (
|
||||||
|
"https://www.bundestag.de/static/appdata/includes/rss/hib.rss"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# HTTP-Helper
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _http_get(url: str) -> Optional[bytes]:
    """Fetch ``url`` via GET with the module's honest User-Agent and timeout.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The raw response body, or ``None`` if the request failed. Failures
        are logged as warnings and never raised, so a single broken source
        cannot abort a whole aggregator run.
    """
    # Local import: only needed for the exception types below.
    import http.client

    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            return r.read()
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError, timeouts, connection resets and
        # TLS errors; HTTPException covers protocol-level failures such as a
        # truncated body (IncompleteRead) raised while reading.
        logger.warning("news fetch failed: %s — %s", url, e)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_html(text: str) -> str:
|
||||||
|
"""Entfernt HTML-Tags + CDATA fuer Plaintext-Summaries."""
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
text = re.sub(r"<!\[CDATA\[(.*?)\]\]>", r"\1", text, flags=re.DOTALL)
|
||||||
|
text = re.sub(r"<[^>]+>", " ", text)
|
||||||
|
text = text.replace("&", "&").replace(" ", " ").replace(""", '"')
|
||||||
|
return re.sub(r"\s+", " ", text).strip()
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Parser
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tagesschau(ressorts: Optional[list[str]] = None) -> list[dict]:
    """Fetch news headlines from the Tagesschau API.

    One request is made per ressort. Articles are de-duplicated by link
    across ressorts. Full texts are deliberately not taken over; only the
    API's ``firstSentence`` is used as summary.

    Args:
        ressorts: Ressort names to query; defaults to ``TAGESSCHAU_RESSORTS``.

    Returns:
        A list of dicts with the keys ``url``, ``titel``, ``summary``,
        ``datum``, ``source``, ``ressort`` and ``tags``. A ressort whose
        response is missing, undecodable or not shaped as expected is logged
        and skipped.
    """
    # Local import so the block does not depend on the file's import header.
    from urllib.parse import urlencode

    ressorts = ressorts or TAGESSCHAU_RESSORTS
    out: list[dict] = []
    seen: set[str] = set()
    for ressort in ressorts:
        url = f"{TAGESSCHAU_API}?{urlencode({'ressort': ressort})}"
        raw = _http_get(url)
        if not raw:
            continue
        try:
            data = json.loads(raw.decode("utf-8"))
        except ValueError:
            # ValueError is the common base of JSONDecodeError and
            # UnicodeDecodeError; both mean "unusable payload".
            logger.warning("tagesschau JSON parse failed: %s", url)
            continue
        if not isinstance(data, dict):
            logger.warning("tagesschau JSON has unexpected shape: %s", url)
            continue
        for item in data.get("news") or []:
            if not isinstance(item, dict):
                continue
            link = item.get("shareURL") or item.get("detailsweb")
            if not link or link in seen:
                continue
            # Marked as seen before the title check, so a title-less entry
            # still blocks later duplicates of the same link.
            seen.add(link)
            titel = (item.get("title") or "").strip()
            if not titel:
                continue
            tags = [
                t.get("tag")
                for t in (item.get("tags") or [])
                if isinstance(t, dict) and t.get("tag")
            ]
            out.append({
                "url": link,
                "titel": titel,
                "summary": (item.get("firstSentence") or "").strip(),
                "datum": item.get("date") or "",
                "source": "tagesschau",
                "ressort": item.get("ressort") or ressort,
                "tags": tags,
            })
    return out
|
||||||
|
|
||||||
|
|
||||||
|
_RSS_ITEM_RE = re.compile(r"<item>(.*?)</item>", re.DOTALL)
|
||||||
|
_RSS_TITLE_RE = re.compile(r"<title>(.*?)</title>", re.DOTALL)
|
||||||
|
_RSS_LINK_RE = re.compile(r"<link>(.*?)</link>")
|
||||||
|
_RSS_DESC_RE = re.compile(r"<description>(.*?)</description>", re.DOTALL)
|
||||||
|
_RSS_PUB_RE = re.compile(r"<pubDate>(.*?)</pubDate>")
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_rss_date(s: str) -> str:
|
||||||
|
"""Konvertiere RSS-pubDate (RFC 822) → ISO-8601-Datum."""
|
||||||
|
if not s:
|
||||||
|
return ""
|
||||||
|
try:
|
||||||
|
dt = parsedate_to_datetime(s.strip())
|
||||||
|
if dt.tzinfo is None:
|
||||||
|
dt = dt.replace(tzinfo=timezone.utc)
|
||||||
|
return dt.astimezone(timezone.utc).isoformat()
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_rss(source: str, url: str, max_items: int = 50) -> list[dict]:
    """Fetch an RSS 2.0 feed and return its items as article dicts.

    Parsing is regex based. At most ``max_items`` ``<item>`` elements are
    inspected, and items without title or link are dropped. The result dicts
    have the same keys as those of ``fetch_tagesschau``; ``ressort`` is
    always ``None`` and ``tags`` always empty.

    Args:
        source: Source label stored with every article.
        url: Feed URL.
        max_items: Upper bound on the number of feed items considered.
    """
    raw = _http_get(url)
    if not raw:
        return []
    feed = raw.decode("utf-8", errors="replace")

    articles: list[dict] = []
    for chunk in _RSS_ITEM_RE.findall(feed)[:max_items]:
        m_title = _RSS_TITLE_RE.search(chunk)
        m_link = _RSS_LINK_RE.search(chunk)
        titel = _strip_html(m_title.group(1)) if m_title else ""
        link = _strip_html(m_link.group(1)) if m_link else ""
        if not (titel and link):
            continue

        m_desc = _RSS_DESC_RE.search(chunk)
        m_pub = _RSS_PUB_RE.search(chunk)
        articles.append({
            "url": link,
            "titel": titel,
            "summary": _strip_html(m_desc.group(1)) if m_desc else "",
            "datum": _parse_rss_date(m_pub.group(1)) if m_pub else "",
            "source": source,
            "ressort": None,
            "tags": [],
        })
    return articles
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_all() -> list[dict]:
    """Collect the articles of all configured sources.

    Tagesschau comes first, followed by every feed in ``BUNDESTAG_RSS``.
    There is no caching and no automatic retry.
    """
    collected: list[dict] = list(fetch_tagesschau())
    for name, feed_url in BUNDESTAG_RSS.items():
        collected += fetch_rss(name, feed_url)
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# DB-Persistierung
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_articles(
    articles: list[dict],
    db_path: Optional[Path] = None,
    embed: bool = True,
) -> dict:
    """Insert or refresh news articles in the ``news_articles`` table.

    The URL is the primary key, which makes the operation idempotent. An
    existing row gets its metadata and ``fetched_at`` refreshed, while its
    stored embedding is left untouched so repeated cron runs do not pay for
    the same embedding again.

    Args:
        articles: Article dicts as produced by the fetch functions.
        db_path: Optional database path; defaults to ``settings.db_path``.
        embed: If true, ``embed_pending_articles`` runs afterwards.

    Returns:
        ``{"inserted": int, "updated": int, "embedded": int}``. All counters
        are zero if the database file does not exist.
    """
    import sqlite3
    from .config import settings

    path = db_path or settings.db_path
    counts = {"inserted": 0, "updated": 0, "embedded": 0}
    if not Path(path).exists():
        return counts

    conn = sqlite3.connect(str(path))
    try:
        for art in articles:
            url = art["url"]
            known = conn.execute(
                "SELECT summary_embedding IS NOT NULL FROM news_articles WHERE url=?",
                (url,),
            ).fetchone() is not None
            tags_json = json.dumps(art.get("tags") or [])
            # Column values shared by INSERT and UPDATE, in column order.
            fields = (
                art["titel"],
                art.get("summary") or "",
                art.get("datum") or "",
                art["source"],
                art.get("ressort"),
                tags_json,
            )
            if known:
                conn.execute(
                    """UPDATE news_articles
                       SET titel=?, summary=?, datum=?, source=?, ressort=?, tags=?,
                           fetched_at=datetime('now')
                       WHERE url=?""",
                    fields + (url,),
                )
                counts["updated"] += 1
            else:
                conn.execute(
                    """INSERT INTO news_articles
                       (url, titel, summary, datum, source, ressort, tags, fetched_at)
                       VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))""",
                    (url,) + fields,
                )
                counts["inserted"] += 1
        conn.commit()
    finally:
        conn.close()

    if embed:
        counts["embedded"] = embed_pending_articles(db_path=db_path)
    return counts
|
||||||
|
|
||||||
|
|
||||||
|
def embed_pending_articles(
    db_path: Optional[Path] = None,
    limit: int = 100,
) -> int:
    """Create embeddings for news articles that have none yet.

    The embedded text is title + summary + tags, joined by newlines. An
    article whose embedding call fails or yields nothing is skipped and
    stays pending, so the next run picks it up again.

    Each embedding is committed right after it is written. The write
    transaction is therefore never held open across the following (slow)
    embedding calls, and a crash mid-run keeps the embeddings already made.

    Args:
        db_path: Optional database path; defaults to ``settings.db_path``.
        limit: Maximum number of pending articles handled per call
            (newest ``datum`` first).

    Returns:
        Number of articles that received an embedding; ``0`` if the
        database file does not exist.
    """
    import sqlite3
    from .config import settings
    from . import embeddings as emb

    path = db_path or settings.db_path
    if not Path(path).exists():
        return 0

    embedded = 0
    conn = sqlite3.connect(str(path))
    try:
        rows = conn.execute(
            """SELECT url, titel, summary, tags FROM news_articles
               WHERE summary_embedding IS NULL ORDER BY datum DESC LIMIT ?""",
            (limit,),
        ).fetchall()

        for url, titel, summary, tags_raw in rows:
            try:
                tags = json.loads(tags_raw) if tags_raw else []
            except (json.JSONDecodeError, TypeError):
                tags = []
            if not isinstance(tags, list):
                # Anything but a JSON array is not a usable tag list.
                tags = []

            parts = [titel or ""]
            if summary:
                parts.append(summary)
            if tags:
                parts.append(", ".join(str(t) for t in tags))
            text = "\n".join(p for p in parts if p).strip()
            if not text:
                continue

            try:
                vec = emb.create_embedding(text, model=emb.EMBEDDING_MODEL)
            except Exception:
                logger.exception("embed_pending_articles: API error for %s", url)
                continue
            # NOTE(review): assumes create_embedding returns a
            # JSON-serialisable sequence of floats — confirm. A falsy result
            # is treated as a failure, so it is not stored as an embedding.
            if not vec:
                logger.warning(
                    "embed_pending_articles: empty embedding for %s", url
                )
                continue

            conn.execute(
                """UPDATE news_articles
                   SET summary_embedding=?, embedding_model=?
                   WHERE url=?""",
                (json.dumps(vec).encode(), emb.EMBEDDING_MODEL, url),
            )
            conn.commit()
            embedded += 1
    finally:
        conn.close()
    return embedded
|
||||||
|
|
||||||
|
|
||||||
|
def run_aggregator(db_path: Optional[Path] = None, embed: bool = True) -> dict:
    """Top-level entry point: fetch all sources, persist them, embed them.

    Intended for cron use. Both arguments are handed to ``upsert_articles``,
    whose counter dict is returned.
    """
    fetched = fetch_all()
    summary = upsert_articles(fetched, db_path=db_path, embed=embed)
    return summary
|
||||||
256
app/presse_generator.py
Normal file
256
app/presse_generator.py
Normal file
@ -0,0 +1,256 @@
|
|||||||
|
"""Pressemitteilungs-Generator fuer #170 Phase 4.
|
||||||
|
|
||||||
|
Erzeugt einen LLM-generierten Pressemitteilungs-Vorschlag, der einen
|
||||||
|
GWÖ-bewerteten Antrag in den Kontext eines aktuellen News-Artikels stellt.
|
||||||
|
|
||||||
|
Manueller Trigger via UI-Button — kein Auto-Versand. Drafts werden in
|
||||||
|
``presse_drafts`` persistiert und in der UI als Liste sichtbar.
|
||||||
|
|
||||||
|
Tonalitaet:
|
||||||
|
- GWÖ-Sicht (Gemeinwohl-orientiert, nicht parteipolitisch)
|
||||||
|
- Faktenbasiert, keine Lobbying-Sprache
|
||||||
|
- 200-250 Worte, presseaehnlicher Aufbau (Lead-Paragraph + Begruendung)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """Du bist ein politischer Redakteur, der für eine
|
||||||
|
Gemeinwohl-Ökonomie-Initiative Pressemitteilungen schreibt. Deine Stil-
|
||||||
|
Richtlinien:
|
||||||
|
|
||||||
|
- 200-250 Worte
|
||||||
|
- Sachlicher, präziser Stil — keine Werbesprache, keine Polemik
|
||||||
|
- Faktenbasiert: Daten aus dem Antrag und dem News-Kontext explizit nennen
|
||||||
|
- GWÖ-Werte (Würde, Solidarität, Nachhaltigkeit, Gerechtigkeit, Demokratie)
|
||||||
|
als Bewertungsmaßstab — nicht parteipolitische Linie
|
||||||
|
- Klare Struktur: Titel, Lead-Paragraph (Wer? Was? Wann? Warum jetzt?),
|
||||||
|
Begründung mit Bezug auf GWÖ-Bewertung, Schluss mit Forderung oder
|
||||||
|
Einladung zum Dialog
|
||||||
|
- Niemals den Anbieter der News-Quelle (Tagesschau, Bundestag) zitieren —
|
||||||
|
nur den Sachverhalt aufgreifen, der dort beschrieben ist
|
||||||
|
|
||||||
|
Antworte NUR mit gültigem JSON in dieser Struktur:
|
||||||
|
{
|
||||||
|
"titel": "<knackiger Titel, max 100 Zeichen>",
|
||||||
|
"body": "<Pressemitteilungs-Volltext, 200-250 Wörter>"
|
||||||
|
}"""
|
||||||
|
|
||||||
|
|
||||||
|
def _build_user_prompt(
|
||||||
|
drucksache: str,
|
||||||
|
bundesland: str,
|
||||||
|
antrag_titel: str,
|
||||||
|
antrag_zusammenfassung: str,
|
||||||
|
gwoe_score: float,
|
||||||
|
gwoe_begruendung: str,
|
||||||
|
empfehlung: str,
|
||||||
|
news_titel: str,
|
||||||
|
news_summary: str,
|
||||||
|
news_url: str,
|
||||||
|
) -> str:
|
||||||
|
"""Konstruiert den User-Prompt aus Antrags- und News-Daten."""
|
||||||
|
return f"""## Aktueller Antrag
|
||||||
|
|
||||||
|
Drucksache: {drucksache} ({bundesland})
|
||||||
|
Titel: {antrag_titel}
|
||||||
|
|
||||||
|
Zusammenfassung: {antrag_zusammenfassung or "(keine vorhanden)"}
|
||||||
|
|
||||||
|
GWÖ-Score: {gwoe_score}/10
|
||||||
|
GWÖ-Begründung: {gwoe_begruendung or "(keine vorhanden)"}
|
||||||
|
Empfehlung: {empfehlung or "(keine)"}
|
||||||
|
|
||||||
|
## Aktueller Nachrichten-Kontext
|
||||||
|
|
||||||
|
Schlagzeile: {news_titel}
|
||||||
|
|
||||||
|
Inhalt: {news_summary or "(keine Zusammenfassung verfügbar)"}
|
||||||
|
|
||||||
|
Quelle: {news_url}
|
||||||
|
|
||||||
|
## Deine Aufgabe
|
||||||
|
|
||||||
|
Schreibe eine Pressemitteilung, die diesen Antrag in den Kontext der
|
||||||
|
aktuellen Nachrichtenlage stellt. Begründe aus GWÖ-Sicht, warum der
|
||||||
|
Antrag gerade jetzt relevant ist (oder warum er die aktuelle Debatte
|
||||||
|
ergänzt/korrigiert). Wenn der GWÖ-Score niedrig ist (< 5), sei dabei
|
||||||
|
kritisch — die PM kann auch eine Ablehnung des Antrags begründen.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_draft(
    drucksache: str,
    news_url: str,
    db_path: Optional[Path] = None,
    bewerter=None,
) -> dict:
    """Generate a press-release draft via the LLM and persist it.

    Args:
        drucksache: Motion identifier, looked up in ``assessments``.
        news_url: URL of the news article, looked up in ``news_articles``.
        db_path: Optional database path override (used by tests).
        bewerter: Optional injected evaluator exposing an async
            ``bewerte(request)``; a ``QwenBewerter`` is created if omitted.

    Returns:
        The stored draft as a dict with the keys ``id``, ``drucksache``,
        ``bundesland``, ``news_url``, ``news_titel``, ``titel``, ``body``,
        ``model`` and ``created_at``.

    Raises:
        ValueError: the motion or the news article is unknown, or the LLM
            response lacks ``titel`` or ``body``.
    """
    from .config import settings
    from .adapters.qwen_bewerter import LlmRequest

    path = db_path or settings.db_path

    # Step 1: load both sides of the pairing.
    # NOTE(review): the motion lookup filters by drucksache only — confirm
    # that drucksache is unique across bundeslaender in ``assessments``.
    conn = sqlite3.connect(str(path))
    try:
        antrag = conn.execute(
            """SELECT bundesland, title, antrag_zusammenfassung, gwoe_score,
                      gwoe_begruendung, empfehlung
               FROM assessments WHERE drucksache=?""",
            (drucksache,),
        ).fetchone()
        news = conn.execute(
            "SELECT titel, summary FROM news_articles WHERE url=?",
            (news_url,),
        ).fetchone()
    finally:
        conn.close()

    if not antrag:
        raise ValueError(f"Drucksache {drucksache} nicht in assessments")
    if not news:
        raise ValueError(f"News-URL {news_url} nicht in news_articles")

    (bundesland, antrag_titel, zusammenfassung,
     score, begruendung, empfehlung) = antrag
    news_titel, news_summary = news

    # Step 2: build the prompt. A missing score is passed on as 0.0.
    user_prompt = _build_user_prompt(
        drucksache=drucksache,
        bundesland=bundesland,
        antrag_titel=antrag_titel or "",
        antrag_zusammenfassung=zusammenfassung or "",
        gwoe_score=score or 0.0,
        gwoe_begruendung=begruendung or "",
        empfehlung=empfehlung or "",
        news_titel=news_titel,
        news_summary=news_summary or "",
        news_url=news_url,
    )

    # Step 3: ask the LLM.
    if bewerter is None:
        from .adapters.qwen_bewerter import QwenBewerter
        bewerter = QwenBewerter()

    result = await bewerter.bewerte(LlmRequest(
        system_prompt=SYSTEM_PROMPT,
        user_prompt=user_prompt,
        model=settings.llm_model_default,
        base_temperature=0.3,
        max_tokens=1500,
        max_retries=2,
    ))

    titel = (result.get("titel") or "").strip()[:200]
    body = (result.get("body") or "").strip()
    if not (titel and body):
        raise ValueError("LLM-Response unvollständig (titel oder body leer)")

    # Step 4: persist, then read the row back so that the DB-generated
    # values (id, created_at) are part of the result.
    columns = (
        "id", "drucksache", "bundesland", "news_url", "news_titel",
        "titel", "body", "model", "created_at",
    )
    conn = sqlite3.connect(str(path))
    try:
        draft_id = conn.execute(
            """INSERT INTO presse_drafts
               (drucksache, bundesland, news_url, news_titel, titel, body, model)
               VALUES (?, ?, ?, ?, ?, ?, ?)""",
            (drucksache, bundesland, news_url, news_titel, titel, body,
             settings.llm_model_default),
        ).lastrowid
        row = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts WHERE id=?""",
            (draft_id,),
        ).fetchone()
        conn.commit()
    finally:
        conn.close()

    return dict(zip(columns, row))
|
||||||
|
|
||||||
|
|
||||||
|
def list_drafts(
    limit: int = 20,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the most recently generated drafts, newest first.

    Args:
        limit: Maximum number of drafts to return (default 20). Values below
            zero are treated as zero: SQLite interprets a negative ``LIMIT``
            as "no limit", which would dump the whole table.
        db_path: Optional database path; defaults to ``settings.db_path``.

    Returns:
        A list of draft dicts with the keys ``id``, ``drucksache``,
        ``bundesland``, ``news_url``, ``news_titel``, ``titel``, ``body``,
        ``model`` and ``created_at``; empty if the database file is missing.
    """
    if db_path:
        path = db_path
    else:
        # Imported only when needed, so an explicit db_path works without
        # the application configuration being importable.
        from .config import settings
        path = settings.db_path

    if not Path(path).exists():
        return []

    columns = (
        "id", "drucksache", "bundesland", "news_url", "news_titel",
        "titel", "body", "model", "created_at",
    )
    conn = sqlite3.connect(str(path))
    try:
        rows = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts
               ORDER BY id DESC LIMIT ?""",
            (max(limit, 0),),
        ).fetchall()
    finally:
        conn.close()
    return [dict(zip(columns, r)) for r in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def get_draft(
    draft_id: int,
    db_path: Optional[Path] = None,
) -> Optional[dict]:
    """Fetch a single draft by its id.

    Args:
        draft_id: Primary key in ``presse_drafts``.
        db_path: Optional database path; defaults to ``settings.db_path``.

    Returns:
        The draft as a dict with the keys ``id``, ``drucksache``,
        ``bundesland``, ``news_url``, ``news_titel``, ``titel``, ``body``,
        ``model`` and ``created_at``, or ``None`` if the database file or
        the row does not exist.
    """
    from .config import settings

    path = db_path or settings.db_path
    if not Path(path).exists():
        return None

    columns = (
        "id", "drucksache", "bundesland", "news_url", "news_titel",
        "titel", "body", "model", "created_at",
    )
    conn = sqlite3.connect(str(path))
    try:
        row = conn.execute(
            """SELECT id, drucksache, bundesland, news_url, news_titel,
                      titel, body, model, created_at
               FROM presse_drafts WHERE id=?""",
            (draft_id,),
        ).fetchone()
    finally:
        conn.close()

    return dict(zip(columns, row)) if row else None
|
||||||
@ -56,6 +56,7 @@
|
|||||||
<div class="v2-nav-group">
|
<div class="v2-nav-group">
|
||||||
<div class="v2-nav-label">— Daten</div>
|
<div class="v2-nav-label">— Daten</div>
|
||||||
<a href="/auswertungen" class="v2-nav-item {% if v2_active_nav == 'auswertungen' %}active{% endif %}">{{ icon("chart-bar", 14) }} Auswertungen</a>
|
<a href="/auswertungen" class="v2-nav-item {% if v2_active_nav == 'auswertungen' %}active{% endif %}">{{ icon("chart-bar", 14) }} Auswertungen</a>
|
||||||
|
<a href="/aktuelle-themen" class="v2-nav-item {% if v2_active_nav == 'aktuelle-themen' %}active{% endif %}">{{ icon("book-open", 14) }} Aktuelle Themen</a>
|
||||||
<a href="/api/auswertungen/export.csv" class="v2-nav-item">{{ icon("file-csv", 14) }} Export · API</a>
|
<a href="/api/auswertungen/export.csv" class="v2-nav-item">{{ icon("file-csv", 14) }} Export · API</a>
|
||||||
<a href="/v2/feed" class="v2-nav-item {% if v2_active_nav == 'feed' %}active{% endif %}">{{ icon("rss", 14) }} Atom-Feed</a>
|
<a href="/v2/feed" class="v2-nav-item {% if v2_active_nav == 'feed' %}active{% endif %}">{{ icon("rss", 14) }} Atom-Feed</a>
|
||||||
<a href="/v2/abos" class="v2-nav-item {% if v2_active_nav == 'abos' %}active{% endif %}">{{ icon("envelope-simple", 14) }} Meine Abos</a>
|
<a href="/v2/abos" class="v2-nav-item {% if v2_active_nav == 'abos' %}active{% endif %}">{{ icon("envelope-simple", 14) }} Meine Abos</a>
|
||||||
|
|||||||
417
app/templates/v2/screens/aktuelle-themen.html
Normal file
417
app/templates/v2/screens/aktuelle-themen.html
Normal file
@ -0,0 +1,417 @@
|
|||||||
|
{% extends "v2/base.html" %}
|
||||||
|
|
||||||
|
{% block title %}Aktuelle Themen — GWÖ-Antragsprüfer{% endblock %}
|
||||||
|
|
||||||
|
{% set v2_active_nav = "aktuelle-themen" %}
|
||||||
|
|
||||||
|
{% block head_extra %}
|
||||||
|
<script src="/static/chart.umd.min.js"></script>
|
||||||
|
<style>
|
||||||
|
.at-controls {
|
||||||
|
display: flex;
|
||||||
|
gap: 8px;
|
||||||
|
align-items: center;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
.at-controls select, .at-controls input[type="number"] {
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 11px;
|
||||||
|
padding: 5px 8px;
|
||||||
|
border: 1px solid var(--ecg-border);
|
||||||
|
border-radius: 3px;
|
||||||
|
background: var(--ecg-card-bg);
|
||||||
|
color: var(--ecg-dark);
|
||||||
|
}
|
||||||
|
.at-controls button {
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 11px;
|
||||||
|
padding: 5px 12px;
|
||||||
|
border: 1px solid var(--ecg-border);
|
||||||
|
border-radius: 3px;
|
||||||
|
cursor: pointer;
|
||||||
|
background: var(--ecg-teal);
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
.at-news-card {
|
||||||
|
background: var(--ecg-card-bg);
|
||||||
|
border: 1px solid var(--ecg-border);
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 14px 16px;
|
||||||
|
margin-bottom: 14px;
|
||||||
|
}
|
||||||
|
.at-news-head {
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 10px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.05em;
|
||||||
|
opacity: 0.6;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
}
|
||||||
|
.at-news-title {
|
||||||
|
font-family: var(--font-display);
|
||||||
|
font-size: 15px;
|
||||||
|
color: var(--ecg-teal);
|
||||||
|
margin: 0 0 6px;
|
||||||
|
line-height: 1.3;
|
||||||
|
}
|
||||||
|
.at-news-title a { color: inherit; text-decoration: none; }
|
||||||
|
.at-news-title a:hover { text-decoration: underline; }
|
||||||
|
.at-news-summary {
|
||||||
|
font-size: 12px;
|
||||||
|
line-height: 1.5;
|
||||||
|
margin: 0 0 10px;
|
||||||
|
opacity: 0.85;
|
||||||
|
}
|
||||||
|
.at-news-tags {
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 10px;
|
||||||
|
opacity: 0.55;
|
||||||
|
margin-bottom: 8px;
|
||||||
|
}
|
||||||
|
.at-tag {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 1px 6px;
|
||||||
|
background: var(--ecg-bg-subtle);
|
||||||
|
border-radius: 3px;
|
||||||
|
margin-right: 4px;
|
||||||
|
}
|
||||||
|
.at-matches {
|
||||||
|
border-top: 1px solid var(--ecg-border);
|
||||||
|
margin-top: 10px;
|
||||||
|
padding-top: 10px;
|
||||||
|
}
|
||||||
|
.at-matches-label {
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 10px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.05em;
|
||||||
|
opacity: 0.6;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
}
|
||||||
|
.at-match {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 5px 0;
|
||||||
|
font-size: 12px;
|
||||||
|
border-bottom: 1px dotted var(--ecg-border);
|
||||||
|
}
|
||||||
|
.at-match:last-child { border-bottom: none; }
|
||||||
|
.at-score-pill {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 1px 7px;
|
||||||
|
border-radius: 10px;
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 10px;
|
||||||
|
font-weight: 700;
|
||||||
|
background: var(--ecg-bg-subtle);
|
||||||
|
min-width: 28px;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
.at-score-pill.s-high { background: rgba(136,158,51,0.25); color: #44570a; }
|
||||||
|
.at-score-pill.s-mid { background: rgba(247,148,29,0.18); color: #875e10; }
|
||||||
|
.at-score-pill.s-low { background: rgba(200,0,0,0.15); color: #931515; }
|
||||||
|
.at-sim {
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 10px;
|
||||||
|
opacity: 0.5;
|
||||||
|
}
|
||||||
|
.at-presse-btn {
|
||||||
|
background: var(--ecg-card-bg);
|
||||||
|
color: var(--ecg-teal);
|
||||||
|
border: 1px solid var(--ecg-teal);
|
||||||
|
border-radius: 3px;
|
||||||
|
font-family: var(--font-mono);
|
||||||
|
font-size: 10px;
|
||||||
|
padding: 3px 8px;
|
||||||
|
cursor: pointer;
|
||||||
|
margin-left: auto;
|
||||||
|
}
|
||||||
|
.at-presse-btn:hover { background: var(--ecg-teal); color: #fff; }
|
||||||
|
</style>
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
|
{% block main %}
|
||||||
|
<div style="padding:0 0 1.5rem;">
|
||||||
|
<h1 style="font-family:var(--font-display);font-size:22px;color:var(--ecg-teal);margin:0 0 4px;">Aktuelle Themen</h1>
|
||||||
|
<p style="font-size:12px;font-family:var(--font-mono);color:var(--ecg-dark);opacity:0.6;">
|
||||||
|
Tagesschau + Bundestag-RSS · gematcht mit deinen Anträgen ·
|
||||||
|
Pressemitteilungs-Vorschläge
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="v2-kasten outline-blue" style="margin-bottom:1rem;">
|
||||||
|
<p style="font-size:12px;line-height:1.5;margin:0 0 0.5rem;">
|
||||||
|
Die täglich aktuellen politischen Top-Themen aus
|
||||||
|
<strong>öffentlich-rechtlichen + parlamentarischen Quellen</strong>
|
||||||
|
(Tagesschau-API + Bundestag-RSS) werden semantisch mit den von dir
|
||||||
|
bewerteten Anträgen verschnitten. Pro News-Artikel siehst du die
|
||||||
|
GWÖ-Bewertung der dazu passendsten Anträge — und kannst per Klick
|
||||||
|
eine Pressemitteilung generieren lassen.
|
||||||
|
</p>
|
||||||
|
<p style="font-size:11px;line-height:1.5;opacity:0.75;margin:0;">
|
||||||
|
Bewusst <strong>nicht</strong> verwendet: Quellen mit AI-Bann in
|
||||||
|
robots.txt (z.B. RND.de). Die UI zeigt nur Titel + URL + erste Sätze
|
||||||
|
— Volltexte werden nicht persistiert.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="at-controls">
|
||||||
|
<label for="at-days">Zeitfenster:</label>
|
||||||
|
<select id="at-days" onchange="loadThemen()">
|
||||||
|
<option value="3">3 Tage</option>
|
||||||
|
<option value="7" selected>7 Tage</option>
|
||||||
|
<option value="14">14 Tage</option>
|
||||||
|
<option value="30">30 Tage</option>
|
||||||
|
</select>
|
||||||
|
<label for="at-topk">Top-N News:</label>
|
||||||
|
<input type="number" id="at-topk" value="15" min="3" max="50" style="width:60px;" onchange="loadThemen()" />
|
||||||
|
<label for="at-minsim">Min. Similarity:</label>
|
||||||
|
<select id="at-minsim" onchange="loadThemen()">
|
||||||
|
<option value="0.30">0.30 (locker)</option>
|
||||||
|
<option value="0.40" selected>0.40 (default)</option>
|
||||||
|
<option value="0.50">0.50 (streng)</option>
|
||||||
|
</select>
|
||||||
|
<button onclick="loadThemen()">Aktualisieren</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- News-Volumen-Chart -->
|
||||||
|
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:1.5rem 0 0.5rem;">
|
||||||
|
News-Volumen pro Quelle (letzte 30 Tage)
|
||||||
|
</h3>
|
||||||
|
<div class="matrix-wrap" style="background:var(--ecg-card-bg);border:1px solid var(--ecg-border);border-radius:4px;padding:14px;">
|
||||||
|
<canvas id="at-zeitreihe-chart" style="max-height:280px;"></canvas>
|
||||||
|
</div>
|
||||||
|
<div id="at-zeitreihe-meta" class="meta-line" style="font-family:var(--font-mono);font-size:11px;opacity:0.6;margin:8px 0 1.5rem;"></div>
|
||||||
|
|
||||||
|
<!-- Top-Themen + Matches -->
|
||||||
|
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:1.5rem 0 0.5rem;">
|
||||||
|
Top-Themen × passende Anträge
|
||||||
|
</h3>
|
||||||
|
<div id="at-themen-list">
|
||||||
|
<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Lade …</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Drafts-Liste -->
|
||||||
|
<h3 style="font-family:var(--font-display);font-size:14px;color:var(--ecg-teal);margin:2rem 0 0.5rem;">
|
||||||
|
Pressemitteilungs-Entwürfe (zuletzt generiert)
|
||||||
|
</h3>
|
||||||
|
<div id="at-drafts-list">
|
||||||
|
<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Lade Entwürfe …</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Modal für Draft-Anzeige -->
|
||||||
|
<div class="v2-modal-backdrop" id="at-modal-backdrop" onclick="atCloseModal(event)" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,0.45);z-index:500;align-items:center;justify-content:center;">
|
||||||
|
<div class="v2-modal" onclick="event.stopPropagation()" style="background:var(--ecg-card-bg);border-radius:6px;padding:20px 24px;max-width:680px;width:90%;max-height:80vh;overflow-y:auto;position:relative;">
|
||||||
|
<button class="v2-modal-close" onclick="atCloseModal()" style="position:absolute;top:12px;right:14px;background:none;border:none;font-size:18px;cursor:pointer;opacity:0.5;">×</button>
|
||||||
|
<h2 id="at-modal-title" style="font-family:var(--font-display);font-size:16px;color:var(--ecg-teal);margin:0 0 12px;">Pressemitteilung</h2>
|
||||||
|
<div id="at-modal-body" style="font-size:13px;line-height:1.5;">Generiere …</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
|
{% block body_scripts %}
|
||||||
|
<script>
|
||||||
|
let _atZeitreiheChart = null;
|
||||||
|
|
||||||
|
function atScoreClass(score) {
  // Map a score to the CSS modifier of the score pill:
  // no score -> no modifier, >= 7 -> high, >= 4 -> mid, everything else -> low.
  if (score === null || score === undefined) return '';
  return score >= 7 ? 's-high' : (score >= 4 ? 's-mid' : 's-low');
}
|
||||||
|
|
||||||
|
function atFmtDatum(s) {
  // Keep only the first ten characters of the timestamp string; empty or
  // shorter values are rendered as an empty string.
  const unusable = !s || s.length < 10;
  return unusable ? '' : s.slice(0, 10);
}
|
||||||
|
|
||||||
|
async function loadThemen() {
  // Fetch the top news buckets for the currently selected filters and render
  // one card per article (with its matching Anträge) into #at-themen-list.
  const days = document.getElementById('at-days').value;
  const topk = document.getElementById('at-topk').value;
  const minsim = document.getElementById('at-minsim').value;
  const list = document.getElementById('at-themen-list');

  // Titles, summaries, tags and URLs come from the API response and end up in
  // innerHTML, so every interpolated value is encoded. Numeric character
  // references are valid in both text and quoted-attribute contexts.
  const esc = (v) => (v == null ? '' : String(v))
    .replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);
  // Only http(s) targets are rendered as links; other schemes
  // (e.g. javascript:) are replaced by an inert "#".
  const safeHref = (u) => (/^https?:\/\//i.test(String(u)) ? esc(u) : '#');

  list.innerHTML = '<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Lade …</div>';

  try {
    // URLSearchParams encodes the user-editable filter values.
    const query = new URLSearchParams({
      days: days,
      top_k: topk,
      min_similarity: minsim,
      matches_per_news: '3',
    });
    const r = await fetch(`/api/aktuelle-themen/top?${query}`);
    // Without this check an error payload would be displayed as "no news".
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const data = await r.json();

    if (!data.buckets || !data.buckets.length) {
      list.innerHTML = '<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Keine News im Zeitfenster oder noch nicht embedded.</div>';
      return;
    }

    let html = '';
    for (const b of data.buckets) {
      const n = b.news;
      const tags = (n.tags || []).map(t => `<span class="at-tag">${esc(t)}</span>`).join('');
      html += '<div class="at-news-card">';
      html += `<div class="at-news-head">${esc(atFmtDatum(n.datum))} · ${esc(n.source)}${n.ressort ? ' / ' + esc(n.ressort) : ''}</div>`;
      html += `<h4 class="at-news-title"><a href="${safeHref(n.url)}" target="_blank" rel="noopener">${esc(n.titel)}</a></h4>`;
      if (n.summary) html += `<div class="at-news-summary">${esc(n.summary)}</div>`;
      if (tags) html += `<div class="at-news-tags">${tags}</div>`;

      if (b.matches && b.matches.length) {
        html += '<div class="at-matches">';
        html += '<div class="at-matches-label">Passende Anträge:</div>';
        for (const m of b.matches) {
          const sc = m.gwoe_score != null ? m.gwoe_score.toFixed(1) : '—';
          const fr = (m.fraktionen || []).join(', ');
          html += '<div class="at-match">';
          html += `<span class="at-score-pill ${atScoreClass(m.gwoe_score)}">${esc(sc)}</span>`;
          html += `<a href="/antrag/${esc(encodeURIComponent(m.drucksache))}" style="color:var(--ecg-teal);text-decoration:none;font-weight:500;">${esc(m.drucksache)}</a>`;
          html += `<span style="opacity:0.85;">${esc(m.title || '')}</span>`;
          if (fr) html += `<span style="opacity:0.6;font-size:11px;">— ${esc(fr)}</span>`;
          html += `<span class="at-sim">sim ${esc(m.similarity)}</span>`;
          // Arguments travel via data-* attributes instead of being spliced
          // into a JS string literal, so quotes in the values cannot break
          // out of the inline handler.
          html += `<button class="at-presse-btn" data-drucksache="${esc(m.drucksache)}" data-news-url="${esc(encodeURIComponent(n.url))}" onclick="generatePresse(this.dataset.drucksache, this.dataset.newsUrl, this)">PM-Vorschlag</button>`;
          html += '</div>';
        }
        html += '</div>';
      } else {
        html += '<div class="at-matches"><div class="at-matches-label">Keine GWÖ-bewerteten Anträge passen — wäre ein Kandidat für eine neue Bewertung.</div></div>';
      }
      html += '</div>';
    }
    list.innerHTML = html;
  } catch (e) {
    list.innerHTML = `<div style="color:#c00;font-family:var(--font-mono);font-size:12px;">Fehler: ${esc(e)}</div>`;
  }
}
|
||||||
|
|
||||||
|
async function loadZeitreihe() {
  // Fetch the per-day/per-source news counts of the last 30 days and draw
  // them as a stacked, filled line chart; a summary goes into the meta line.
  const meta = document.getElementById('at-zeitreihe-meta');
  try {
    const r = await fetch('/api/aktuelle-themen/zeitreihe?days=30');
    // Without this check an error payload would be reported as "no articles".
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const data = await r.json();

    if (_atZeitreiheChart) {
      _atZeitreiheChart.destroy();
      // Drop the handle so a later call never destroys the same chart twice.
      _atZeitreiheChart = null;
    }

    if (!data.buckets || !data.buckets.length) {
      meta.textContent = 'Noch keine News-Artikel in der DB.';
      return;
    }

    const colors = ['rgba(0,157,165,0.7)', 'rgba(247,148,29,0.7)', 'rgba(136,158,51,0.7)',
                    'rgba(200,30,30,0.7)', 'rgba(150,100,200,0.7)'];
    const datasets = data.sources.map((s, i) => {
      // Palette repeats when there are more sources than colors.
      const fillColor = colors[i % colors.length];
      return {
        label: s,
        data: data.series[s],
        backgroundColor: fillColor,
        // Same hue as the fill, but fully opaque for the outline.
        borderColor: fillColor.replace('0.7', '1'),
        fill: true,
        tension: 0.2,
      };
    });

    const ctx = document.getElementById('at-zeitreihe-chart');
    _atZeitreiheChart = new Chart(ctx, {
      type: 'line',
      data: { labels: data.buckets, datasets: datasets },
      options: {
        responsive: true,
        scales: {
          y: { beginAtZero: true, stacked: true, title: { display: true, text: 'Artikel/Tag' } },
          x: { title: { display: true, text: 'Datum' } },
        },
        plugins: {
          legend: { position: 'bottom' }
        }
      }
    });

    const total = Object.values(data.series).reduce((s, arr) => s + arr.reduce((a, b) => a + b, 0), 0);
    meta.textContent = `${total} News-Artikel über ${data.buckets.length} Tage, ${data.sources.length} Quellen.`;
  } catch (e) {
    meta.textContent = 'Fehler: ' + e;
  }
}
|
||||||
|
|
||||||
|
async function loadDrafts() {
  // Fetch the ten most recent press-release drafts and render them as
  // clickable cards into #at-drafts-list; a click opens the draft modal.
  const wrap = document.getElementById('at-drafts-list');
  // Draft titles are generated text and news titles come from external
  // feeds; both end up in innerHTML, so every value is encoded first.
  const esc = (v) => (v == null ? '' : String(v))
    .replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);
  try {
    const r = await fetch('/api/aktuelle-themen/drafts?limit=10');
    // Without this check an error payload would be shown as "no drafts yet".
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const data = await r.json();
    if (!data.drafts || !data.drafts.length) {
      wrap.innerHTML = '<div style="font-family:var(--font-mono);font-size:12px;opacity:0.5;">Noch keine Pressemitteilungen generiert.</div>';
      return;
    }
    let html = '';
    for (const d of data.drafts) {
      // The id is handed over through a data attribute rather than being
      // concatenated into the inline handler's source code.
      html += `<div class="at-news-card" style="cursor:pointer;" data-draft-id="${esc(d.id)}" onclick="showDraft(this.dataset.draftId)">`;
      html += `<div class="at-news-head">${esc(atFmtDatum(d.created_at))} · DS ${esc(d.drucksache)} (${esc(d.bundesland)})</div>`;
      html += `<h4 class="at-news-title">${esc(d.titel)}</h4>`;
      html += `<div class="at-news-tags">Bezug: ${esc(d.news_titel)}</div>`;
      html += '</div>';
    }
    wrap.innerHTML = html;
  } catch (e) {
    wrap.innerHTML = `<div style="color:#c00;font-family:var(--font-mono);font-size:12px;">Fehler: ${esc(e)}</div>`;
  }
}
|
||||||
|
|
||||||
|
async function generatePresse(drucksache, newsUrlEnc, btn) {
  // Ask for confirmation, then POST to the press-release generator for the
  // given Drucksache / news article pair and show the resulting draft.
  //   drucksache  - plain Drucksache identifier (URL-encoded here)
  //   newsUrlEnc  - news URL, already URI-component-encoded by the caller
  //   btn         - the clicked button; disabled while the request runs
  if (!confirm(`Pressemitteilung generieren für Drucksache ${drucksache}?\n\nDas erzeugt einen LLM-Call (~2 Cent).`)) return;
  btn.textContent = '…';
  btn.disabled = true;
  try {
    const r = await fetch(`/api/aktuelle-themen/generate-presse?drucksache=${encodeURIComponent(drucksache)}&news_url=${newsUrlEnc}`, {
      method: 'POST',
    });
    if (!r.ok) {
      // Error bodies are not guaranteed to be JSON, and `detail` is not
      // guaranteed to be a string; fall back to the HTTP status text so the
      // user sees the real failure instead of a JSON parse error.
      let detail = r.statusText || `HTTP ${r.status}`;
      try {
        const err = await r.json();
        if (err && err.detail) {
          detail = typeof err.detail === 'string' ? err.detail : JSON.stringify(err.detail);
        }
      } catch (parseError) {
        // Body was not JSON; keep the status-based message.
      }
      alert('Fehler: ' + detail);
      return;
    }
    const data = await r.json();
    showDraftFromData(data);
    loadDrafts();
  } catch (e) {
    alert('Fehler: ' + e);
  } finally {
    // Runs on every exit path, including the early return above.
    btn.textContent = 'PM-Vorschlag';
    btn.disabled = false;
  }
}
|
||||||
|
|
||||||
|
function showDraftFromData(d) {
  // Fill the draft modal with the given draft object (titel, drucksache,
  // bundesland, news_url, news_titel, body) and make it visible.
  //
  // The draft body is generated text and the news title/URL come from
  // external feeds, so everything placed into innerHTML is encoded. Numeric
  // character references work in text and quoted-attribute contexts alike.
  const esc = (v) => (v == null ? '' : String(v))
    .replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);
  // Only http(s) targets are rendered as links; other schemes
  // (e.g. javascript:) are replaced by an inert "#".
  const safeHref = (u) => (/^https?:\/\//i.test(String(u)) ? esc(u) : '#');

  const backdrop = document.getElementById('at-modal-backdrop');
  document.getElementById('at-modal-title').textContent = d.titel;
  document.getElementById('at-modal-body').innerHTML =
    `<div style="font-family:var(--font-mono);font-size:11px;opacity:0.6;margin-bottom:10px;">
      DS ${esc(d.drucksache)} (${esc(d.bundesland)}) · Bezug zu: <a href="${safeHref(d.news_url)}" target="_blank" rel="noopener" style="color:var(--ecg-teal);">${esc(d.news_titel)}</a>
    </div>
    <div style="white-space:pre-wrap;">${esc(d.body)}</div>`;
  backdrop.style.display = 'flex';
}
|
||||||
|
|
||||||
|
async function showDraft(id) {
  // Load one stored draft by id and display it in the modal.
  try {
    const r = await fetch(`/api/aktuelle-themen/drafts/${encodeURIComponent(id)}`);
    // An error payload has no draft fields; surface the status instead of
    // handing it to the renderer.
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const d = await r.json();
    showDraftFromData(d);
  } catch (e) {
    alert('Fehler: ' + e);
  }
}
|
||||||
|
|
||||||
|
function atCloseModal(ev) {
  // Hide the draft modal. Called without an argument it always closes; when
  // given an event it closes only if the event target is the backdrop itself.
  const hitBackdrop = ev && ev.target.id === 'at-modal-backdrop';
  if (ev && !hitBackdrop) return;
  document.getElementById('at-modal-backdrop').style.display = 'none';
}
|
||||||
|
|
||||||
|
// Pressing Escape hides the draft modal.
document.addEventListener('keydown', (e) => {
  if (e.key !== 'Escape') return;
  document.getElementById('at-modal-backdrop').style.display = 'none';
});

// Initial population of the chart, the news list and the drafts list.
loadZeitreihe();
loadThemen();
loadDrafts();
|
||||||
|
</script>
|
||||||
|
{% endblock %}
|
||||||
371
app/themen_matching.py
Normal file
371
app/themen_matching.py
Normal file
@ -0,0 +1,371 @@
|
|||||||
|
"""Themen × Anträge Matching fuer das Aktuelle-Themen-Dashboard
|
||||||
|
(#170 Phase 2).
|
||||||
|
|
||||||
|
Verschneidet News-Artikel-Embeddings (aus news_articles.summary_embedding)
|
||||||
|
mit Antrag-Embeddings (assessments.summary_embedding) per Cosine-Similarity.
|
||||||
|
Liefert pro News-Artikel die Top-K-passendsten Anträge.
|
||||||
|
|
||||||
|
Reuse:
|
||||||
|
- ``embeddings.cosine_similarity`` fuer den Vektor-Vergleich
|
||||||
|
- Beide Tabellen nutzen denselben Embedding-Modell-Vektorraum (qwen v4),
|
||||||
|
daher direkter Cross-Vergleich moeglich
|
||||||
|
- Filter ueber ``embedding_model``-Spalte, falls Migration laueft
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sqlite3
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_embeddings(
    db_path: Path,
    table: str,
    select_cols: list[str],
    where_extra: str = "",
    params: tuple = (),
) -> list[dict]:
    """Load rows with a usable ``summary_embedding`` from *table*.

    Args:
        db_path: SQLite database file; a missing file yields ``[]``.
        table: Table name, interpolated into the SQL text.
        select_cols: Column names to select in addition to
            ``summary_embedding`` and ``embedding_model``.
        where_extra: SQL fragment appended verbatim after the
            ``summary_embedding IS NOT NULL`` condition.
        params: Values bound to placeholders contained in *where_extra*.

    Returns:
        One dict per kept row containing all selected columns plus ``_vec``,
        the JSON-decoded embedding. Rows whose ``embedding_model`` differs
        from ``embeddings.EMBEDDING_MODEL_READ`` and rows whose embedding
        cannot be decoded are left out.
    """
    from . import embeddings as emb

    if not Path(db_path).exists():
        return []

    query = (
        f"SELECT {', '.join(select_cols)}, summary_embedding, embedding_model "
        f"FROM {table} "
        f"WHERE summary_embedding IS NOT NULL {where_extra}"
    )
    connection = sqlite3.connect(str(db_path))
    try:
        # Row objects allow access by column name below.
        connection.row_factory = sqlite3.Row
        records = connection.execute(query, params).fetchall()
    finally:
        connection.close()

    decoded = []
    for record in records:
        if record["embedding_model"] == emb.EMBEDDING_MODEL_READ:
            try:
                vector = json.loads(record["summary_embedding"])
            except (json.JSONDecodeError, TypeError):
                continue
            decoded.append({**dict(record), "_vec": vector})
    return decoded
|
||||||
|
|
||||||
|
|
||||||
|
def find_anträge_for_news(
    news_url: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the assessments most similar to one news article.

    The article's stored embedding is compared against every assessment
    embedding with ``embeddings.cosine_similarity``.

    Args:
        news_url: Value matched against ``news_articles.url``.
        top_k: Maximum number of matches returned.
        min_similarity: Matches scoring below this value are dropped. The
            original author notes 0.4 as the empirical relevance threshold
            for the qwen-v4 embeddings.
        db_path: SQLite file to read; falls back to ``settings.db_path``.

    Returns:
        Match dicts (``drucksache``, ``title``, ``bundesland``,
        ``fraktionen``, ``gwoe_score``, ``empfehlung``, ``themen``,
        ``datum``, ``similarity``) sorted by descending similarity. Empty
        when the database file or the article is missing, the article has no
        embedding, its embedding model differs from
        ``embeddings.EMBEDDING_MODEL_READ``, or its embedding is not valid
        JSON.
    """
    from .config import settings
    from . import embeddings as emb

    def _json_or_empty(raw):
        """Decode a JSON column; empty or malformed values become ``[]``.

        A single corrupt ``fraktionen``/``themen`` cell must not abort the
        whole lookup.
        """
        if not raw:
            return []
        try:
            return json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return []

    path = db_path or settings.db_path
    if not Path(path).exists():
        return []

    # 1. Load the news article's embedding vector.
    conn = sqlite3.connect(str(path))
    try:
        row = conn.execute(
            """SELECT summary_embedding, embedding_model
               FROM news_articles WHERE url=?""",
            (news_url,),
        ).fetchone()
    finally:
        conn.close()
    if not row or not row[0] or row[1] != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        news_vec = json.loads(row[0])
    except (json.JSONDecodeError, TypeError):
        return []

    # 2. Score every assessment that carries an embedding.
    assessments = _load_embeddings(
        Path(path),
        "assessments",
        ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
         "empfehlung", "themen", "datum"],
    )
    scored = []
    for a in assessments:
        sim = emb.cosine_similarity(news_vec, a["_vec"])
        if sim < min_similarity:
            continue
        scored.append({
            "drucksache": a["drucksache"],
            "title": a["title"],
            "bundesland": a["bundesland"],
            "fraktionen": _json_or_empty(a["fraktionen"]),
            "gwoe_score": a["gwoe_score"],
            "empfehlung": a["empfehlung"],
            "themen": _json_or_empty(a["themen"]),
            "datum": a["datum"],
            "similarity": round(sim, 3),
        })
    scored.sort(key=lambda x: x["similarity"], reverse=True)
    return scored[:top_k]
|
||||||
|
|
||||||
|
|
||||||
|
def find_news_for_antrag(
    drucksache: str,
    top_k: int = 5,
    min_similarity: float = 0.4,
    days_window: int = 90,
    db_path: Optional[Path] = None,
) -> list[dict]:
    """Return the news articles most similar to one assessed Antrag.

    Args:
        drucksache: Value matched against ``assessments.drucksache``.
        top_k: Maximum number of articles returned.
        min_similarity: Articles scoring below this value are dropped.
        days_window: Articles whose ``datum`` is older than this many days
            are dropped; articles with an unparseable ``datum`` are kept.
        db_path: SQLite file to read; falls back to ``settings.db_path``.

    Returns:
        Article dicts (``url``, ``titel``, ``summary``, ``datum``,
        ``source``, ``ressort``, ``tags``, ``similarity``) sorted by
        descending similarity. Empty when the database file or the
        assessment is missing, the assessment has no embedding, its
        embedding model differs from ``embeddings.EMBEDDING_MODEL_READ``,
        or its embedding is not valid JSON.
    """
    from .config import settings
    from . import embeddings as emb

    path = db_path or settings.db_path
    if not Path(path).exists():
        return []

    # Step 1: embedding vector of the Antrag.
    connection = sqlite3.connect(str(path))
    try:
        stored = connection.execute(
            """SELECT summary_embedding, embedding_model
               FROM assessments WHERE drucksache=?""",
            (drucksache,),
        ).fetchone()
    finally:
        connection.close()
    if not stored or not stored[0] or stored[1] != emb.EMBEDDING_MODEL_READ:
        return []
    try:
        antrag_vec = json.loads(stored[0])
    except (json.JSONDecodeError, TypeError):
        return []

    # Step 2: score the news articles and apply the date window.
    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400

    def _older_than_cutoff(datum) -> bool:
        try:
            stamp = datetime.fromisoformat(
                datum.replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, AttributeError):
            # An unparseable date lets the article through.
            return False
        return stamp < cutoff

    candidates = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )
    hits = []
    for article in candidates:
        score = emb.cosine_similarity(antrag_vec, article["_vec"])
        if score < min_similarity:
            continue
        if _older_than_cutoff(article["datum"]):
            continue
        try:
            tag_list = json.loads(article["tags"]) if article["tags"] else []
        except (json.JSONDecodeError, TypeError):
            tag_list = []
        entry = {
            key: article[key]
            for key in ("url", "titel", "summary", "datum", "source", "ressort")
        }
        entry["tags"] = tag_list
        entry["similarity"] = round(score, 3)
        hits.append(entry)

    ranked = sorted(hits, key=lambda hit: hit["similarity"], reverse=True)
    return ranked[:top_k]
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate_top_themen(
    days_window: int = 7,
    top_k: int = 10,
    min_similarity: float = 0.4,
    matches_per_news: int = 3,
    db_path: Optional[Path] = None,
) -> dict:
    """Return the newest news articles together with their best-matching Anträge.

    This is the primary dashboard endpoint.

    Args:
        days_window: Only articles whose ``datum`` lies within this many
            days are considered; articles with an unparseable ``datum`` are
            dropped.
        top_k: Maximum number of articles (newest first).
        min_similarity: Assessments scoring below this value are dropped.
        matches_per_news: Maximum number of matches kept per article.
        db_path: SQLite file to read; falls back to ``settings.db_path``.

    Returns:
        ``{
            "buckets": [{
                "news": {url, titel, summary, datum, source, ressort, tags},
                "matches": [{drucksache, title, gwoe_score, similarity, ...}]
            }, ...],
            "n_total_news": int,   # embedded articles before date filtering
            "filter": {days_window, top_k, min_similarity, matches_per_news}
        }``
    """
    from .config import settings
    from . import embeddings as emb

    def _json_or_empty(raw):
        """Decode a JSON column; empty or malformed values become ``[]``.

        A single corrupt cell must not take down the whole dashboard.
        """
        if not raw:
            return []
        try:
            return json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return []

    # Built once so both return paths expose the same keys.
    filter_info = {
        "days_window": days_window,
        "top_k": top_k,
        "min_similarity": min_similarity,
        "matches_per_news": matches_per_news,
    }

    path = db_path or settings.db_path
    if not Path(path).exists():
        return {"buckets": [], "n_total_news": 0, "filter": filter_info}

    cutoff = datetime.now(timezone.utc).timestamp() - days_window * 86400

    news_rows = _load_embeddings(
        Path(path),
        "news_articles",
        ["url", "titel", "summary", "datum", "source", "ressort", "tags"],
    )

    # Keep only articles inside the date window.
    fresh = []
    for n in news_rows:
        try:
            news_ts = datetime.fromisoformat(
                n["datum"].replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, AttributeError):
            continue
        if news_ts < cutoff:
            continue
        n["_ts"] = news_ts
        fresh.append(n)
    # Newest first, then cut to top_k.
    fresh.sort(key=lambda x: x["_ts"], reverse=True)
    fresh = fresh[:top_k]

    # Assessments are only needed when there is something to match against.
    assessments = []
    if fresh:
        assessments = _load_embeddings(
            Path(path),
            "assessments",
            ["drucksache", "title", "bundesland", "fraktionen", "gwoe_score",
             "empfehlung", "themen", "datum"],
        )

    buckets = []
    for n in fresh:
        # Score every assessment against this article and keep the best ones.
        scored = []
        for a in assessments:
            sim = emb.cosine_similarity(n["_vec"], a["_vec"])
            if sim < min_similarity:
                continue
            scored.append({
                "drucksache": a["drucksache"],
                "title": a["title"],
                "bundesland": a["bundesland"],
                "fraktionen": _json_or_empty(a["fraktionen"]),
                "gwoe_score": a["gwoe_score"],
                "empfehlung": a["empfehlung"],
                "datum": a["datum"],
                "similarity": round(sim, 3),
            })
        scored.sort(key=lambda x: x["similarity"], reverse=True)
        buckets.append({
            "news": {
                "url": n["url"],
                "titel": n["titel"],
                "summary": n["summary"],
                "datum": n["datum"],
                "source": n["source"],
                "ressort": n["ressort"],
                "tags": _json_or_empty(n["tags"]),
            },
            "matches": scored[:matches_per_news],
        })

    return {
        "buckets": buckets,
        "n_total_news": len(news_rows),
        "filter": filter_info,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate_themen_zeitreihe(
    days_window: int = 30,
    db_path: Optional[Path] = None,
) -> dict:
    """Count news articles per (day, source) over the last ``days_window`` days.

    The result feeds the stacked-area chart of the dashboard. No matching
    against assessments happens here; only the article volume per source
    is reported.

    Args:
        days_window: Size of the look-back window in days, measured from
            the current UTC time.
        db_path: SQLite file to read. When omitted, ``settings.db_path``
            from the package configuration is used.

    Returns:
        A dict with three keys:
        ``buckets`` - sorted list of day strings (first 10 characters of the
        stored ``datum``) that have at least one article in the window;
        ``sources`` - sorted list of the sources seen in the window;
        ``series`` - mapping source -> list of counts aligned with
        ``buckets``.
        All three are empty when the database file does not exist.
    """
    if db_path:
        path = db_path
    else:
        # Imported lazily so callers that pass an explicit path do not
        # depend on the package configuration being importable.
        from .config import settings

        path = settings.db_path
    if not Path(path).exists():
        return {"buckets": [], "sources": [], "series": {}}

    cutoff_ts = datetime.now(timezone.utc).timestamp() - days_window * 86400
    conn = sqlite3.connect(str(path))
    try:
        rows = conn.execute(
            "SELECT datum, source FROM news_articles"
        ).fetchall()
    finally:
        conn.close()

    counts = defaultdict(int)
    for datum, source in rows:
        if not datum:
            continue
        try:
            # "Z" is rewritten so older fromisoformat versions accept it.
            ts = datetime.fromisoformat(datum.replace("Z", "+00:00")).timestamp()
        except (ValueError, AttributeError):
            # Unparseable or non-string dates are ignored, not fatal.
            continue
        if ts < cutoff_ts:
            continue
        # The day bucket is the date part as stored (YYYY-MM-DD), i.e. in
        # the article's own UTC offset rather than normalised to UTC.
        counts[(datum[:10], source)] += 1

    days_sorted = sorted({day for day, _ in counts})
    sources_sorted = sorted({src for _, src in counts})
    series = {
        # .get() keeps the lookup from inserting zero entries into counts.
        src: [counts.get((day, src), 0) for day in days_sorted]
        for src in sources_sorted
    }
    return {
        "buckets": days_sorted,
        "sources": sources_sorted,
        "series": series,
    }
|
||||||
24
scripts/auto-fetch-news.sh
Executable file
24
scripts/auto-fetch-news.sh
Executable file
@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/bash
# "Aktuelle Themen" dashboard: news aggregator cron job (#170 phase 1).
#
# Fetches headlines from the Tagesschau API and the Bundestag RSS feeds
# once a day, persists them in news_articles and embeds the new ones via
# the Qwen embeddings API.
# Idempotent (URL primary key), safe to re-run after a failure.
#
# Invoked by cron every morning, before auto-ingest-protocols.sh.
#
# Usage:
#   auto-fetch-news.sh [CONTAINER]
set -euo pipefail

container="${1:-gwoe-antragspruefer}"

# Print a banner line: the given label followed by an ISO-8601 timestamp.
banner() {
    printf '=== %s %s ===\n' "$1" "$(date -Iseconds)"
}

banner "auto-fetch-news"

# The Python snippet is fed on stdin; the quoted delimiter keeps the shell
# from expanding anything inside it.
docker exec -i "$container" python <<'EOF'
from app.news_aggregator import run_aggregator
stats = run_aggregator()
print(f"News-Aggregator: inserted={stats['inserted']} updated={stats['updated']} embedded={stats['embedded']}")
EOF

banner "auto-fetch-news done"
|
||||||
262
tests/test_news_aggregator.py
Normal file
262
tests/test_news_aggregator.py
Normal file
@ -0,0 +1,262 @@
|
|||||||
|
"""Tests fuer app.news_aggregator (#170 Phase 1).
|
||||||
|
|
||||||
|
Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne
|
||||||
|
Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.news_aggregator import (
|
||||||
|
_parse_rss_date,
|
||||||
|
_strip_html,
|
||||||
|
fetch_rss,
|
||||||
|
fetch_tagesschau,
|
||||||
|
upsert_articles,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Helper
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestStripHtml:
    """Unit tests for the ``_strip_html`` text-cleaning helper."""

    def test_removes_tags(self):
        assert _strip_html("<p>Hello <b>world</b></p>") == "Hello world"

    def test_decodes_cdata(self):
        assert "Test" in _strip_html("<![CDATA[Test]]>")

    def test_decodes_entities(self):
        # The input must contain a real entity; a bare "&" would make the
        # test pass without any decoding taking place.
        assert _strip_html("a &amp; b") == "a & b"

    def test_collapses_whitespace(self):
        # Runs of spaces and a newline are expected to collapse to single
        # spaces.
        assert _strip_html("<p>a   b\n  c</p>") == "a b c"

    def test_empty(self):
        assert _strip_html("") == ""
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseRssDate:
    """Unit tests for ``_parse_rss_date`` (RFC-822 date -> ISO string)."""

    def test_rfc822_to_iso(self):
        iso = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT")
        assert iso.startswith("2026-04-28")

    def test_invalid_returns_empty(self):
        # Both unparseable text and the empty string yield "".
        for raw in ("garbage", ""):
            assert _parse_rss_date(raw) == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# fetch_tagesschau (mocked HTTP)
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
# Canned Tagesschau API response (UTF-8 encoded JSON bytes) used as the
# return value of the patched ``_http_get``: two complete news items plus
# one item without a shareURL.
SAMPLE_TAGESSCHAU_JSON = json.dumps({
    "news": [
        {
            "title": "Bundestag berät über Wohnungsbau",
            "firstSentence": "Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.",
            "shareURL": "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html",
            "date": "2026-04-28T10:00:00.000+02:00",
            "ressort": "inland",
            "tags": [{"tag": "Wohnungsbau"}, {"tag": "Bundestag"}],
        },
        {
            "title": "EU-Kommission stellt Klimapaket vor",
            "firstSentence": "Die EU plant ehrgeizige Klimaziele.",
            "shareURL": "https://www.tagesschau.de/ausland/eu-klima-100.html",
            "date": "2026-04-28T11:00:00.000+02:00",
            "ressort": "ausland",
            "tags": [{"tag": "Klima"}, {"tag": "EU"}],
        },
        {
            # This item has no shareURL — it should be skipped
            "title": "Kein Link",
            "firstSentence": "Skip mich",
        },
    ],
}).encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
class TestFetchTagesschau:
    """``fetch_tagesschau`` against a canned payload; ``_http_get`` is patched."""

    @staticmethod
    def _fetch(payload, ressorts):
        """Run fetch_tagesschau with ``_http_get`` returning ``payload``."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_tagesschau(ressorts=ressorts)

    def test_parses_news_array(self):
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland"])
        # Only the two items that carry a shareURL are expected back.
        assert len(articles) == 2
        head = articles[0]
        assert head["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html"
        assert head["titel"] == "Bundestag berät über Wohnungsbau"
        assert "Wohnungsbau" in head["summary"]
        assert head["source"] == "tagesschau"
        assert head["ressort"] == "inland"
        assert "Wohnungsbau" in head["tags"]

    def test_skips_items_without_link(self):
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland"])
        assert all(item["url"] for item in articles)

    def test_returns_empty_on_http_error(self):
        # A failed HTTP call is simulated by _http_get returning None.
        assert self._fetch(None, ["inland"]) == []

    def test_dedup_across_ressorts(self):
        """An item that shows up in two ressorts is returned only once."""
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland", "ausland"])
        urls = [item["url"] for item in articles]
        assert len(urls) == len(set(urls))
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# fetch_rss (mocked HTTP)
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_RSS = """<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<rss version="2.0"><channel><title>BT Aktuell</title>
|
||||||
|
<item>
|
||||||
|
<title><![CDATA[Bundestag berät Antrag zum Wohnungsbau]]></title>
|
||||||
|
<link>https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388</link>
|
||||||
|
<description><![CDATA[Der Bundestag hat heute den Antrag zum Wohnungsbau beraten.]]></description>
|
||||||
|
<pubDate>Tue, 28 Apr 2026 10:45:12 GMT</pubDate>
|
||||||
|
</item>
|
||||||
|
<item>
|
||||||
|
<title>Antrag zur Klimapolitik</title>
|
||||||
|
<link>https://www.bundestag.de/klima</link>
|
||||||
|
<description>Klimaschutz im Bundestag</description>
|
||||||
|
<pubDate>Mon, 27 Apr 2026 10:00:00 GMT</pubDate>
|
||||||
|
</item>
|
||||||
|
</channel></rss>""".encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
class TestFetchRss:
    """``fetch_rss`` against canned feeds; ``_http_get`` is patched."""

    @staticmethod
    def _fetch(payload, source):
        """Run fetch_rss for ``source`` with ``_http_get`` returning ``payload``."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_rss(source, "https://example.com/rss")

    def test_parses_rss_items(self):
        articles = self._fetch(SAMPLE_RSS, "bundestag-aktuell")
        assert len(articles) == 2
        head = articles[0]
        assert "Wohnungsbau" in head["titel"]
        assert head["url"].startswith("https://www.bundestag.de")
        assert head["source"] == "bundestag-aktuell"
        assert head["datum"].startswith("2026-04-28")
        assert "Bundestag" in head["summary"]

    def test_strips_cdata_and_html(self):
        articles = self._fetch(SAMPLE_RSS, "bundestag-aktuell")
        # No CDATA wrapper may survive in either text field.
        for item in articles:
            for field in ("titel", "summary"):
                assert "<![CDATA[" not in item[field]

    def test_empty_on_http_error(self):
        # A failed HTTP call is simulated by _http_get returning None.
        assert self._fetch(None, "x") == []

    def test_skips_items_without_title_or_link(self):
        # One item has only a title, the other only a link.
        bad = b"""<?xml version="1.0"?><rss><channel>
<item><title>Nur Titel</title></item>
<item><link>nur-link</link></item>
</channel></rss>"""
        assert self._fetch(bad, "x") == []
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# upsert_articles
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a SQLite file that holds only an empty ``news_articles`` table.

    Returns the path of the database file inside pytest's ``tmp_path``.
    """
    schema = """
        CREATE TABLE news_articles (
            url TEXT PRIMARY KEY,
            titel TEXT NOT NULL,
            summary TEXT,
            datum TEXT NOT NULL,
            source TEXT NOT NULL,
            ressort TEXT,
            tags TEXT,
            summary_embedding BLOB,
            embedding_model TEXT,
            fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
    """
    db_file = tmp_path / "test_news.db"
    connection = sqlite3.connect(str(db_file))
    connection.execute(schema)
    connection.commit()
    connection.close()
    return db_file
|
||||||
|
|
||||||
|
|
||||||
|
# Two article dicts passed to upsert_articles by the tests below; "tags"
# is a plain list here.
SAMPLE_ARTICLES = [
    {
        "url": "https://example.com/a",
        "titel": "Wohnungsbau",
        "summary": "Heute im Bundestag",
        "datum": "2026-04-28",
        "source": "tagesschau",
        "ressort": "inland",
        "tags": ["Wohnungsbau"],
    },
    {
        "url": "https://example.com/b",
        "titel": "Klima",
        "summary": "EU plant Klimaziele",
        "datum": "2026-04-28",
        "source": "tagesschau",
        "ressort": "ausland",
        "tags": ["Klima", "EU"],
    },
]
|
||||||
|
|
||||||
|
|
||||||
|
class TestUpsertArticles:
    """Insert/update behaviour of ``upsert_articles`` (embedding disabled)."""

    @staticmethod
    def _scalar(db, sql, params):
        """Return the first column of the first row produced by ``sql``."""
        connection = sqlite3.connect(str(db))
        row = connection.execute(sql, params).fetchone()
        connection.close()
        return row[0]

    def test_inserts_new_articles(self, empty_db):
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert stats["inserted"] == 2
        assert stats["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Second pass: identical URLs, changed titles.
        renamed = [
            {**article, "titel": article["titel"] + " (neu)"}
            for article in SAMPLE_ARTICLES
        ]
        stats = upsert_articles(renamed, db_path=empty_db, embed=False)
        assert stats["updated"] == 2
        assert stats["inserted"] == 0
        # The stored title must reflect the second pass.
        stored_title = self._scalar(
            empty_db,
            "SELECT titel FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        assert stored_title.endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        raw_tags = self._scalar(
            empty_db,
            "SELECT tags FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        assert json.loads(raw_tags) == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        missing = tmp_path / "missing.db"
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=missing, embed=False)
        assert stats == {"inserted": 0, "updated": 0, "embedded": 0}
|
||||||
224
tests/test_presse_generator.py
Normal file
224
tests/test_presse_generator.py
Normal file
@ -0,0 +1,224 @@
|
|||||||
|
"""Tests fuer app.presse_generator (#170 Phase 4)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.presse_generator import (
|
||||||
|
_build_user_prompt,
|
||||||
|
generate_draft,
|
||||||
|
get_draft,
|
||||||
|
list_drafts,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Fixture: DB mit Antrag + News
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def db_with_antrag_and_news(tmp_path: Path) -> Path:
    """Build a SQLite file with one assessment, one news article, no drafts.

    Creates the ``assessments``, ``news_articles`` and ``presse_drafts``
    tables, inserts assessment ``18/A`` and the article
    ``https://example.com/wohnen``, and returns the database path.
    """
    create_statements = (
        """
        CREATE TABLE assessments (
            drucksache TEXT PRIMARY KEY,
            title TEXT,
            bundesland TEXT,
            antrag_zusammenfassung TEXT,
            gwoe_score REAL,
            gwoe_begruendung TEXT,
            empfehlung TEXT
        )
        """,
        """
        CREATE TABLE news_articles (
            url TEXT PRIMARY KEY,
            titel TEXT NOT NULL,
            summary TEXT
        )
        """,
        """
        CREATE TABLE presse_drafts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            drucksache TEXT NOT NULL,
            bundesland TEXT NOT NULL,
            news_url TEXT NOT NULL,
            news_titel TEXT NOT NULL,
            titel TEXT NOT NULL,
            body TEXT NOT NULL,
            model TEXT NOT NULL,
            created_at TEXT NOT NULL DEFAULT (datetime('now'))
        )
        """,
    )
    assessment_row = (
        "18/A", "Wohnungsbau-Reform-Antrag", "NRW",
        "Antrag fuer mehr sozialen Wohnungsbau",
        8.5, "Stark gemeinwohlorientiert",
        "Uneingeschränkt unterstützen",
    )
    news_row = (
        "https://example.com/wohnen",
        "Wohnungsmarkt im Umbruch",
        "Die Mietpreise steigen weiter, der Bundestag berät heute",
    )

    db_file = tmp_path / "test_presse.db"
    connection = sqlite3.connect(str(db_file))
    for ddl in create_statements:
        connection.execute(ddl)
    connection.execute(
        """INSERT INTO assessments
           (drucksache, title, bundesland, antrag_zusammenfassung,
            gwoe_score, gwoe_begruendung, empfehlung)
           VALUES (?, ?, ?, ?, ?, ?, ?)""",
        assessment_row,
    )
    connection.execute(
        "INSERT INTO news_articles (url, titel, summary) VALUES (?, ?, ?)",
        news_row,
    )
    connection.commit()
    connection.close()
    return db_file
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# _build_user_prompt
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildUserPrompt:
    """Checks on the prompt text produced by ``_build_user_prompt``."""

    def test_includes_drucksache(self):
        fields = {
            "drucksache": "18/A",
            "bundesland": "NRW",
            "antrag_titel": "Test",
            "antrag_zusammenfassung": "Summary",
            "gwoe_score": 7.5,
            "gwoe_begruendung": "ok",
            "empfehlung": "Unterstützen",
            "news_titel": "News",
            "news_summary": "Lead",
            "news_url": "https://example.com",
        }
        prompt = _build_user_prompt(**fields)
        # Identifier, state, score and news title must all be in the text.
        for needle in ("18/A", "NRW", "7.5", "News"):
            assert needle in prompt

    def test_handles_missing_zusammenfassung(self):
        fields = {
            "drucksache": "x",
            "bundesland": "x",
            "antrag_titel": "x",
            "antrag_zusammenfassung": "",
            "gwoe_score": 5.0,
            "gwoe_begruendung": "",
            "empfehlung": "",
            "news_titel": "x",
            "news_summary": "",
            "news_url": "",
        }
        prompt = _build_user_prompt(**fields)
        # With empty optional texts the prompt must contain this placeholder.
        assert "(keine vorhanden)" in prompt
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# generate_draft (mocked QwenBewerter)
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class FakeBewerter:
    """Stand-in for QwenBewerter that replays one fixed LLM response.

    The most recent request is kept in ``last_request`` so tests can
    inspect what was sent.
    """

    def __init__(self, response: dict):
        # Nothing has been requested yet.
        self.last_request = None
        self._response = response

    async def bewerte(self, request):
        """Record ``request`` and hand back the canned response."""
        self.last_request = request
        canned = self._response
        return canned
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_generate_draft_persists_record(db_with_antrag_and_news, monkeypatch):
    """A valid LLM reply yields a draft dict with id 1 and the joined metadata."""
    # Pin settings.llm_model_default for this test
    # (presumably the value stored as the draft's model - verify in generate_draft).
    from app.config import settings as real_settings
    monkeypatch.setattr(real_settings, "llm_model_default", "qwen-test")

    # Repeat the sentence so the body is long.
    long_body = "Der vorliegende Antrag der Drucksache 18/A ..." * 10
    bewerter = FakeBewerter({"titel": "Wohnungsbau jetzt", "body": long_body})

    draft = await generate_draft(
        drucksache="18/A",
        news_url="https://example.com/wohnen",
        db_path=db_with_antrag_and_news,
        bewerter=bewerter,
    )

    expected_fields = {
        "id": 1,
        "drucksache": "18/A",
        "bundesland": "NRW",
        "news_titel": "Wohnungsmarkt im Umbruch",
        "titel": "Wohnungsbau jetzt",
    }
    for key, value in expected_fields.items():
        assert draft[key] == value
    assert "18/A" in draft["body"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_generate_draft_unknown_drucksache(db_with_antrag_and_news):
    """An unknown Drucksache makes generate_draft raise ValueError."""
    call_args = {
        "drucksache": "99/MISSING",
        "news_url": "https://example.com/wohnen",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "x", "body": "y"}),
    }
    with pytest.raises(ValueError, match="Drucksache"):
        await generate_draft(**call_args)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_generate_draft_unknown_news(db_with_antrag_and_news):
    """An unknown news URL makes generate_draft raise ValueError."""
    call_args = {
        "drucksache": "18/A",
        "news_url": "https://example.com/missing",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "x", "body": "y"}),
    }
    with pytest.raises(ValueError, match="News-URL"):
        await generate_draft(**call_args)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_generate_draft_empty_response_raises(db_with_antrag_and_news, monkeypatch):
    """An LLM reply with empty titel and body is rejected with ValueError."""
    from app.config import settings as real_settings
    monkeypatch.setattr(real_settings, "llm_model_default", "qwen-test")

    call_args = {
        "drucksache": "18/A",
        "news_url": "https://example.com/wohnen",
        "db_path": db_with_antrag_and_news,
        "bewerter": FakeBewerter({"titel": "", "body": ""}),
    }
    with pytest.raises(ValueError, match="unvollständig"):
        await generate_draft(**call_args)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# list_drafts + get_draft
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestListAndGetDrafts:
    """Read paths ``list_drafts`` / ``get_draft`` on the drafts table."""

    def test_empty(self, db_with_antrag_and_news):
        db = db_with_antrag_and_news
        assert list_drafts(db_path=db) == []
        assert get_draft(99, db_path=db) is None

    def test_after_insert(self, db_with_antrag_and_news):
        db = db_with_antrag_and_news
        # Seed one draft straight into the table (test setup only).
        draft_row = (
            "18/A", "NRW", "https://x.de/n", "News-Titel",
            "PM-Titel", "PM-Body", "test-model",
        )
        connection = sqlite3.connect(str(db))
        connection.execute(
            """INSERT INTO presse_drafts
               (drucksache, bundesland, news_url, news_titel, titel, body, model)
               VALUES (?, ?, ?, ?, ?, ?, ?)""",
            draft_row,
        )
        connection.commit()
        connection.close()

        listed = list_drafts(db_path=db)
        assert len(listed) == 1
        only = listed[0]
        assert only["drucksache"] == "18/A"
        assert only["titel"] == "PM-Titel"

        detail = get_draft(only["id"], db_path=db)
        assert detail is not None
        assert detail["body"] == "PM-Body"
|
||||||
297
tests/test_themen_matching.py
Normal file
297
tests/test_themen_matching.py
Normal file
@ -0,0 +1,297 @@
|
|||||||
|
"""Tests fuer app.themen_matching (#170 Phase 2)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.themen_matching import (
|
||||||
|
aggregate_themen_zeitreihe,
|
||||||
|
aggregate_top_themen,
|
||||||
|
find_anträge_for_news,
|
||||||
|
find_news_for_antrag,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Fixture: DB mit News + Assessments + Embeddings
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _vec(dim: int = 8, val: float = 0.1) -> bytes:
|
||||||
|
"""Konstruiert einen einfachen Vektor als JSON-Bytes."""
|
||||||
|
return json.dumps([val] * dim).encode()
|
||||||
|
|
||||||
|
|
||||||
|
def _vec_from(values: list[float]) -> bytes:
|
||||||
|
return json.dumps(values).encode()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def populated_db(tmp_path: Path) -> Path:
    """Build a SQLite file with three news articles and three assessments.

    All rows carry 8-dimensional JSON-encoded embeddings and the model tag
    ``qwen-embedding-v4``:

    * news ``n1`` (today) points along axis 0, ``n2`` (today) along axis 1,
      ``n3`` is 200 days old;
    * assessment ``18/A`` is nearly parallel to ``n1``, ``18/B`` to ``n2``,
      and ``18/C`` lies along axis 4, orthogonal to every news vector.

    Returns the path of the database file inside pytest's ``tmp_path``.
    """
    db = tmp_path / "test_match.db"
    conn = sqlite3.connect(str(db))
    try:
        conn.execute("""
            CREATE TABLE news_articles (
                url TEXT PRIMARY KEY,
                titel TEXT NOT NULL,
                summary TEXT,
                datum TEXT NOT NULL,
                source TEXT NOT NULL,
                ressort TEXT,
                tags TEXT,
                summary_embedding BLOB,
                embedding_model TEXT,
                fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        """)
        conn.execute("""
            CREATE TABLE assessments (
                drucksache TEXT PRIMARY KEY,
                title TEXT,
                fraktionen TEXT,
                datum TEXT,
                link TEXT,
                bundesland TEXT,
                gwoe_score REAL,
                gwoe_begruendung TEXT,
                gwoe_matrix TEXT,
                gwoe_schwerpunkt TEXT,
                wahlprogramm_scores TEXT,
                verbesserungen TEXT,
                staerken TEXT,
                schwaechen TEXT,
                empfehlung TEXT,
                empfehlung_symbol TEXT,
                verbesserungspotenzial TEXT,
                themen TEXT,
                antrag_zusammenfassung TEXT,
                antrag_kernpunkte TEXT,
                source TEXT,
                model TEXT,
                created_at TEXT,
                updated_at TEXT,
                summary_embedding BLOB,
                embedding_model TEXT
            )
        """)

        # Read the clock once so "today" and "old" share one reference.
        now_utc = datetime.now(timezone.utc)
        today = now_utc.isoformat()
        old = (now_utc - timedelta(days=200)).isoformat()

        # News articles with distinct embeddings
        news = [
            # Housing news (vector along [1,0,0,...])
            ("https://example.com/n1", "Wohnungsbau-Reform",
             "Bundestag berät Wohnungsbau", today, "tagesschau", "inland",
             '["Wohnungsbau"]',
             _vec_from([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            # Climate news (vector along [0,1,0,...])
            ("https://example.com/n2", "Klimaschutzgesetz",
             "EU plant Klimaziele", today, "tagesschau", "ausland",
             '["Klima"]',
             _vec_from([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            # Old news, expected to fall outside the date window
            ("https://example.com/n3", "Alte News", "", old, "tagesschau", "inland",
             '[]', _vec_from([0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        ]
        conn.executemany(
            """INSERT INTO news_articles
               (url, titel, summary, datum, source, ressort, tags,
                summary_embedding, embedding_model)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'qwen-embedding-v4')""",
            news,
        )

        # Assessments with embeddings:
        # - 18/A fits the housing news (vector close to [1,0,...])
        # - 18/B fits the climate news
        # - 18/C is orthogonal and should match nothing
        now_iso = datetime.now().isoformat()
        assessments = [
            ("18/A", "Wohnungsbau-Antrag", '["GRÜNE"]', "2026-04-15", "NRW",
             8.0, "Uneingeschränkt unterstützen",
             _vec_from([0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            ("18/B", "Klima-Antrag", '["SPD"]', "2026-04-16", "NRW",
             7.0, "Unterstützen mit Änderungen",
             _vec_from([0.0, 0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0])),
            ("18/C", "Sonstiges", '["CDU"]', "2026-04-17", "NRW",
             5.0, "Überarbeiten",
             _vec_from([0.0, 0.0, 0.0, 0.0, 0.95, 0.0, 0.0, 0.0])),
        ]
        for ds, title, fr, dat, bl, sc, emp, vec in assessments:
            conn.execute(
                """INSERT INTO assessments
                   (drucksache, title, fraktionen, datum, bundesland, gwoe_score,
                    empfehlung, themen, source, model, created_at, updated_at,
                    summary_embedding, embedding_model)
                   VALUES (?, ?, ?, ?, ?, ?, ?, '[]', 'test', 'test', ?, ?,
                           ?, 'qwen-embedding-v4')""",
                (ds, title, fr, dat, bl, sc, emp, now_iso, now_iso, vec),
            )

        conn.commit()
    finally:
        # Release the file handle even when setup fails half-way.
        conn.close()
    return db
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def mock_embedding_model():
    """Force ``app.embeddings.EMBEDDING_MODEL_READ`` to qwen-embedding-v4 for every test."""
    patcher = patch("app.embeddings.EMBEDDING_MODEL_READ", "qwen-embedding-v4")
    patcher.start()
    try:
        yield
    finally:
        # Restore the original value once the test has finished.
        patcher.stop()
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# find_anträge_for_news
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindAnträgeForNews:
    """News -> assessments lookup via ``find_anträge_for_news``."""

    def test_wohnungsbau_news_matches_wohnungsbau_antrag(self, populated_db):
        matches = find_anträge_for_news(
            "https://example.com/n1",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(matches) >= 1
        # The best hit is expected to be 18/A.
        best = matches[0]
        assert best["drucksache"] == "18/A"
        assert best["similarity"] > 0.9

    def test_klima_news_matches_klima_antrag(self, populated_db):
        matches = find_anträge_for_news(
            "https://example.com/n2",
            db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(matches) >= 1
        assert matches[0]["drucksache"] == "18/B"

    def test_min_similarity_filters_orthogonal(self, populated_db):
        """With a high min_similarity cutoff no orthogonal assessment may appear."""
        matches = find_anträge_for_news(
            "https://example.com/n1",
            db_path=populated_db,
            min_similarity=0.9,
        )
        # 18/C is orthogonal to every news vector in the fixture.
        assert all(m["drucksache"] != "18/C" for m in matches)

    def test_unknown_news_returns_empty(self, populated_db):
        matches = find_anträge_for_news(
            "https://example.com/missing",
            db_path=populated_db,
        )
        assert matches == []

    def test_empty_db(self, tmp_path):
        missing = tmp_path / "missing.db"
        assert find_anträge_for_news("x", db_path=missing) == []
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# find_news_for_antrag
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindNewsForAntrag:
    """Assessment -> news lookup via ``find_news_for_antrag``."""

    def test_wohnungsbau_antrag_matches_wohnungsbau_news(self, populated_db):
        result = find_news_for_antrag(
            "18/A", db_path=populated_db, min_similarity=0.5,
        )
        assert len(result) >= 1
        assert result[0]["url"] == "https://example.com/n1"

    def test_old_news_filtered_out(self, populated_db):
        """News from the 200-day-old bucket must not show up in a 90-day window."""
        result = find_news_for_antrag(
            "18/A", db_path=populated_db, min_similarity=0.0,
            days_window=90,
        )
        urls = [r["url"] for r in result]
        assert "https://example.com/n3" not in urls

    def test_top_k_limits(self, populated_db):
        """top_k=1 returns exactly one match."""
        result = find_news_for_antrag(
            "18/A", db_path=populated_db, min_similarity=0.0,
            top_k=1,
        )
        # The fixture has fresh news with non-negative similarity to 18/A,
        # so an empty result would be a failure, not a pass; "<= 1" would
        # not catch that.
        assert len(result) == 1

    def test_unknown_antrag(self, populated_db):
        assert find_news_for_antrag(
            "99/Missing", db_path=populated_db,
        ) == []
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# aggregate_top_themen
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestAggregateTopThemen:
    """Tests for ``aggregate_top_themen`` run against the ``populated_db`` fixture."""

    def test_returns_buckets(self, populated_db):
        """The aggregate exposes two buckets and an ``n_total_news`` entry."""
        aggregate = aggregate_top_themen(
            db_path=populated_db,
            min_similarity=0.5,
        )
        # Two news articles exist for today, both with a match.
        bucket_count = len(aggregate["buckets"])
        assert bucket_count == 2
        assert "n_total_news" in aggregate

    def test_each_bucket_has_news_and_matches(self, populated_db):
        """Every bucket carries ``news`` (with url and titel) and ``matches``."""
        aggregate = aggregate_top_themen(
            db_path=populated_db,
            min_similarity=0.5,
        )
        for bucket in aggregate["buckets"]:
            for top_level_key in ("news", "matches"):
                assert top_level_key in bucket
            for news_field in ("url", "titel"):
                assert news_field in bucket["news"]

    def test_days_window_filter(self, populated_db):
        """With a small window only the fresh news remain; the old item is gone."""
        aggregate = aggregate_top_themen(
            db_path=populated_db,
            days_window=7,
            min_similarity=0.5,
        )
        bucket_urls = [bucket["news"]["url"] for bucket in aggregate["buckets"]]
        assert "https://example.com/n3" not in bucket_urls

    def test_min_similarity_filter(self, populated_db):
        """With a high min_similarity the cross-matches disappear."""
        aggregate = aggregate_top_themen(
            db_path=populated_db,
            min_similarity=0.99,
        )
        # Only exact matches should survive the cutoff.
        surviving_scores = (
            match["similarity"]
            for bucket in aggregate["buckets"]
            for match in bucket["matches"]
        )
        for score in surviving_scores:
            assert score > 0.99
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# aggregate_themen_zeitreihe
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestAggregateZeitreihe:
    """Tests for ``aggregate_themen_zeitreihe`` run against ``populated_db``."""

    def test_structure(self, populated_db):
        """The result exposes the keys ``buckets``, ``sources`` and ``series``."""
        timeline = aggregate_themen_zeitreihe(
            db_path=populated_db,
            days_window=7,
        )
        for required_key in ("buckets", "sources", "series"):
            assert required_key in timeline

    def test_only_recent(self, populated_db):
        """With days_window=7 the old news item must not be counted."""
        timeline = aggregate_themen_zeitreihe(
            db_path=populated_db,
            days_window=7,
        )
        # Only today's news (n1, n2) — n3 is 200 days old.
        per_source_totals = [sum(counts) for counts in timeline["series"].values()]
        assert sum(per_source_totals) == 2

    def test_series_aligned(self, populated_db):
        """Per source, the series list must be exactly as long as buckets."""
        timeline = aggregate_themen_zeitreihe(
            db_path=populated_db,
            days_window=7,
        )
        for source_name in timeline["sources"]:
            source_series = timeline["series"][source_name]
            assert len(source_series) == len(timeline["buckets"])
|
||||||
Loading…
Reference in New Issue
Block a user