Vollständiges 4-Phasen-Feature:
**Phase 1 — News-Aggregator** (`app/news_aggregator.py`)
- Tagesschau-API (`/api2u/news?ressort=...`) für inland/ausland/wirtschaft/wissen
- Bundestag-RSS für aktuellethemen / pressemitteilungen / hib
- DB-Tabelle `news_articles` (URL-PK, idempotent)
- Embeddings via existierender qwen-v4-Pipeline
- Cron-Script `scripts/auto-fetch-news.sh`
- Bewusst NICHT: RND.de (robots.txt bannt explizit ClaudeBot, GPTBot,
CCBot, ChatGPT-User, Google-Extended). Nur AI-erlaubende, öffentlich-
rechtliche/parlamentarische Quellen
- Volltexte werden NICHT persistiert (nur Titel + erster Satz)
**Phase 2 — Themen × Anträge Matching** (`app/themen_matching.py`)
- News-Embedding × Assessment-summary_embedding via Cosine-Similarity
- `find_anträge_for_news`: pro News die Top-K passenden Anträge
- `find_news_for_antrag`: pro Antrag Top-K News mit Datums-Fenster (90d)
- `aggregate_top_themen`: primärer Dashboard-Endpoint
- `aggregate_themen_zeitreihe`: News-Volumen pro Tag × Source
**Phase 3 — Dashboard-View** (`/aktuelle-themen`)
- Neuer linker Nav-Eintrag „Aktuelle Themen“
- Stacked-Area-Chart News-Volumen pro Quelle (30d)
- Pro News-Card: Titel + Summary + Tags + Top-3-Antrags-Match-Liste
mit GWÖ-Score-Pill, Drucksache-Link, PM-Vorschlag-Button
- Filter: Zeitfenster, Top-N, min_similarity
- Auth-protected (require_auth)
**Phase 4 — Pressemitteilungs-Generator** (`app/presse_generator.py`)
- LLM-Prompt-Template (200-250 Worte, GWÖ-Sicht, JSON-Output)
- Reuse von `QwenBewerter` aus app/adapters/qwen_bewerter.py
- DB-Tabelle `presse_drafts` (Persistenz)
- POST `/api/aktuelle-themen/generate-presse` rate-limited 5/min,
auth-only (LLM-Kosten)
- GET `/api/aktuelle-themen/drafts` + `/drafts/{id}` für Liste/Detail
- Manueller Trigger via UI-Button, kein Auto-Versand
- Modal-Anzeige des generierten Texts
**Compliance:**
- robots.txt-respektierend (RND.de wird wegen des ClaudeBot-Banns nicht
abgefragt; es werden nur AI-erlaubende Quellen verwendet)
- UI zeigt nur Titel+URL+Datum+erster Satz, keine Volltext-Reproduktion
- Pressemitteilungen sind explizit Drafts, nicht Auto-Versand
- LLM-Calls rate-limited, auth-only
**Tests:** 43 neue Tests (19 news_aggregator + 16 themen_matching +
8 presse_generator). Suite jetzt 1048 grün.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
298 lines
12 KiB
Python
298 lines
12 KiB
Python
"""Tests fuer app.themen_matching (#170 Phase 2)."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sqlite3
|
|
from datetime import datetime, timezone, timedelta
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from app.themen_matching import (
|
|
aggregate_themen_zeitreihe,
|
|
aggregate_top_themen,
|
|
find_anträge_for_news,
|
|
find_news_for_antrag,
|
|
)
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Fixture: DB mit News + Assessments + Embeddings
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _vec(dim: int = 8, val: float = 0.1) -> bytes:
    """Build a constant vector and return it as JSON-encoded bytes.

    Args:
        dim: Number of components in the vector.
        val: Value repeated in every component.

    Returns:
        The UTF-8 encoded JSON array, e.g. ``b"[0.1, 0.1]"`` for ``dim=2``.
    """
    return json.dumps([val] * dim).encode()
|
|
|
|
|
|
def _vec_from(values: list[float]) -> bytes:
    """Serialize the given vector components as JSON-encoded bytes.

    Args:
        values: Vector components in order.

    Returns:
        The UTF-8 encoded JSON array of ``values``.
    """
    return json.dumps(values).encode()
|
|
|
|
|
|
@pytest.fixture
def populated_db(tmp_path: Path) -> Path:
    """Create a throw-away SQLite database seeded with news and assessments.

    The tables ``news_articles`` and ``assessments`` are created and filled
    with three rows each. The 8-dimensional embeddings are chosen so that the
    expected matches are unambiguous:

    * n1 (housing) points along axis 0 and pairs with assessment 18/A.
    * n2 (climate) points along axis 1 and pairs with assessment 18/B.
    * n3 is dated 200 days in the past so date-window filters can drop it.
    * 18/C points along axis 4 and is orthogonal to every news vector.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        Path to the database file inside ``tmp_path``.
    """
    db = tmp_path / "test_match.db"
    conn = sqlite3.connect(str(db))
    try:
        conn.execute("""
            CREATE TABLE news_articles (
                url TEXT PRIMARY KEY,
                titel TEXT NOT NULL,
                summary TEXT,
                datum TEXT NOT NULL,
                source TEXT NOT NULL,
                ressort TEXT,
                tags TEXT,
                summary_embedding BLOB,
                embedding_model TEXT,
                fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        """)
        conn.execute("""
            CREATE TABLE assessments (
                drucksache TEXT PRIMARY KEY,
                title TEXT,
                fraktionen TEXT,
                datum TEXT,
                link TEXT,
                bundesland TEXT,
                gwoe_score REAL,
                gwoe_begruendung TEXT,
                gwoe_matrix TEXT,
                gwoe_schwerpunkt TEXT,
                wahlprogramm_scores TEXT,
                verbesserungen TEXT,
                staerken TEXT,
                schwaechen TEXT,
                empfehlung TEXT,
                empfehlung_symbol TEXT,
                verbesserungspotenzial TEXT,
                themen TEXT,
                antrag_zusammenfassung TEXT,
                antrag_kernpunkte TEXT,
                source TEXT,
                model TEXT,
                created_at TEXT,
                updated_at TEXT,
                summary_embedding BLOB,
                embedding_model TEXT
            )
        """)

        # Take "now" once so both news timestamps share the same anchor.
        now_utc = datetime.now(timezone.utc)
        today = now_utc.isoformat()
        old = (now_utc - timedelta(days=200)).isoformat()

        # News articles with distinct embeddings
        news = [
            # Housing news (vector oriented along [1,0,0,...])
            ("https://example.com/n1", "Wohnungsbau-Reform",
             "Bundestag berät Wohnungsbau", today, "tagesschau", "inland",
             '["Wohnungsbau"]',
             _vec_from([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            # Climate news (vector oriented along [0,1,0,...])
            ("https://example.com/n2", "Klimaschutzgesetz",
             "EU plant Klimaziele", today, "tagesschau", "ausland",
             '["Klima"]',
             _vec_from([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            # Old news, expected to fall outside the date window
            ("https://example.com/n3", "Alte News", "", old, "tagesschau", "inland",
             '[]', _vec_from([0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
        ]
        # Each tuple is already in column order, so it can be bound directly.
        conn.executemany(
            """INSERT INTO news_articles
               (url, titel, summary, datum, source, ressort, tags,
                summary_embedding, embedding_model)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'qwen-embedding-v4')""",
            news,
        )

        # Assessments with embeddings:
        # - 18/A fits the housing news (vector ~[1,0,...])
        # - 18/B fits the climate news
        # - 18/C is orthogonal and should not match anything
        # Naive local timestamp; only stored as created_at / updated_at.
        now_iso = datetime.now().isoformat()
        assessments = [
            ("18/A", "Wohnungsbau-Antrag", '["GRÜNE"]', "2026-04-15", "NRW",
             8.0, "Uneingeschränkt unterstützen",
             _vec_from([0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
            ("18/B", "Klima-Antrag", '["SPD"]', "2026-04-16", "NRW",
             7.0, "Unterstützen mit Änderungen",
             _vec_from([0.0, 0.95, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0])),
            ("18/C", "Sonstiges", '["CDU"]', "2026-04-17", "NRW",
             5.0, "Überarbeiten",
             _vec_from([0.0, 0.0, 0.0, 0.0, 0.95, 0.0, 0.0, 0.0])),
        ]
        conn.executemany(
            """INSERT INTO assessments
               (drucksache, title, fraktionen, datum, bundesland, gwoe_score,
                empfehlung, themen, source, model, created_at, updated_at,
                summary_embedding, embedding_model)
               VALUES (?, ?, ?, ?, ?, ?, ?, '[]', 'test', 'test', ?, ?,
                       ?, 'qwen-embedding-v4')""",
            [
                (ds, title, fraktionen, datum, land, score, empfehlung,
                 now_iso, now_iso, vec)
                for ds, title, fraktionen, datum, land, score, empfehlung, vec
                in assessments
            ],
        )

        conn.commit()
    finally:
        # Close even if seeding raised, so a failing setup does not leak
        # the connection.
        conn.close()
    return db
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def mock_embedding_model():
    """Pin ``app.embeddings.EMBEDDING_MODEL_READ`` to ``qwen-embedding-v4``.

    The fixture is autouse, so the patch is active for every test in this
    module and is reverted when the test finishes. The value is the same
    model name that the seeded rows carry in their ``embedding_model`` column.
    """
    # NOTE(review): this patches the attribute on the ``app.embeddings``
    # module only. Code that copied the constant with
    # ``from app.embeddings import EMBEDDING_MODEL_READ`` would not see it —
    # confirm how app.themen_matching reads the value.
    with patch("app.embeddings.EMBEDDING_MODEL_READ", "qwen-embedding-v4"):
        yield
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# find_anträge_for_news
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestFindAnträgeForNews:
    """Tests for ``find_anträge_for_news`` (news URL -> matching assessments)."""

    def test_wohnungsbau_news_matches_wohnungsbau_antrag(self, populated_db):
        """The housing news item ranks assessment 18/A first."""
        result = find_anträge_for_news(
            "https://example.com/n1", db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(result) >= 1
        # Top match should be 18/A
        assert result[0]["drucksache"] == "18/A"
        assert result[0]["similarity"] > 0.9

    def test_klima_news_matches_klima_antrag(self, populated_db):
        """The climate news item ranks assessment 18/B first."""
        result = find_anträge_for_news(
            "https://example.com/n2", db_path=populated_db,
            min_similarity=0.5,
        )
        assert len(result) >= 1
        assert result[0]["drucksache"] == "18/B"

    def test_min_similarity_filters_orthogonal(self, populated_db):
        """With a high min_similarity cut-off no orthogonal assessment may appear."""
        result = find_anträge_for_news(
            "https://example.com/n1", db_path=populated_db,
            min_similarity=0.9,
        )
        druck = [r["drucksache"] for r in result]
        # Guard against a vacuous pass on an empty result: 18/A has a cosine
        # similarity of roughly 0.99 to n1 and must survive the cut-off.
        assert "18/A" in druck
        assert "18/C" not in druck  # 18/C is orthogonal to everything

    def test_unknown_news_returns_empty(self, populated_db):
        """An URL that is not in the table yields an empty list."""
        assert find_anträge_for_news(
            "https://example.com/missing", db_path=populated_db,
        ) == []

    def test_empty_db(self, tmp_path):
        """A database path that does not exist yields an empty list."""
        assert find_anträge_for_news(
            "x", db_path=tmp_path / "missing.db",
        ) == []
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# find_news_for_antrag
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestFindNewsForAntrag:
    """Tests for ``find_news_for_antrag`` (assessment -> matching news)."""

    def test_wohnungsbau_antrag_matches_wohnungsbau_news(self, populated_db):
        """Assessment 18/A ranks the housing news item first."""
        result = find_news_for_antrag(
            "18/A", db_path=populated_db, min_similarity=0.5,
        )
        assert len(result) >= 1
        assert result[0]["url"] == "https://example.com/n1"

    def test_old_news_filtered_out(self, populated_db):
        """News from the 200-day-old bucket must not appear in the 90-day window."""
        result = find_news_for_antrag(
            "18/A", db_path=populated_db, min_similarity=0.0,
            days_window=90,
        )
        urls = [r["url"] for r in result]
        # Guard against a vacuous pass: the fresh best match must be present,
        # otherwise an always-empty result would satisfy the check below.
        assert "https://example.com/n1" in urls
        assert "https://example.com/n3" not in urls

    def test_top_k_limits(self, populated_db):
        """top_k=1 returns only the best match."""
        result = find_news_for_antrag(
            "18/A", db_path=populated_db, min_similarity=0.0,
            top_k=1,
        )
        # Exactly one row, and it is the best one; ``<= 1`` would also accept
        # an empty list and therefore not test the limit at all.
        assert len(result) == 1
        assert result[0]["url"] == "https://example.com/n1"

    def test_unknown_antrag(self, populated_db):
        """A Drucksache that is not in the table yields an empty list."""
        assert find_news_for_antrag(
            "99/Missing", db_path=populated_db,
        ) == []
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# aggregate_top_themen
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestAggregateTopThemen:
    """Tests for ``aggregate_top_themen`` (news buckets with their matches)."""

    def test_returns_buckets(self, populated_db):
        """Both fresh news items produce a bucket and a total count is reported."""
        result = aggregate_top_themen(
            db_path=populated_db, min_similarity=0.5,
        )
        # Two news articles are dated today, both with a match
        assert len(result["buckets"]) == 2
        assert "n_total_news" in result

    def test_each_bucket_has_news_and_matches(self, populated_db):
        """Every bucket carries a ``news`` dict (url, titel) and ``matches``."""
        result = aggregate_top_themen(
            db_path=populated_db, min_similarity=0.5,
        )
        for b in result["buckets"]:
            assert "news" in b
            assert "matches" in b
            assert "url" in b["news"]
            assert "titel" in b["news"]

    def test_days_window_filter(self, populated_db):
        """With a small window only fresh news remain; old news are dropped."""
        result = aggregate_top_themen(
            db_path=populated_db, days_window=7, min_similarity=0.5,
        )
        for b in result["buckets"]:
            assert b["news"]["url"] != "https://example.com/n3"

    def test_min_similarity_filter(self, populated_db):
        """With a high min_similarity the cross-matches disappear."""
        threshold = 0.99
        result = aggregate_top_themen(
            db_path=populated_db, min_similarity=threshold,
        )
        # Only near-exact matches should survive. The comparison is inclusive
        # so a match sitting exactly on the threshold is not reported as a
        # failure; it holds for an inclusive and an exclusive filter alike.
        for b in result["buckets"]:
            for m in b["matches"]:
                assert m["similarity"] >= threshold
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# aggregate_themen_zeitreihe
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestAggregateZeitreihe:
    """Tests for ``aggregate_themen_zeitreihe`` (news volume per bucket and source)."""

    def test_structure(self, populated_db):
        """The result exposes the keys ``buckets``, ``sources`` and ``series``."""
        result = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        assert "buckets" in result
        assert "sources" in result
        assert "series" in result

    def test_only_recent(self, populated_db):
        """With days_window=7 the old news item must not be counted."""
        result = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        # Only today's news (n1, n2) — n3 is 200 days old
        total = sum(sum(s) for s in result["series"].values())
        assert total == 2

    def test_series_aligned(self, populated_db):
        """Per source, the series list must be exactly as long as ``buckets``."""
        result = aggregate_themen_zeitreihe(db_path=populated_db, days_window=7)
        for source in result["sources"]:
            assert len(result["series"][source]) == len(result["buckets"])
|