gwoe-antragspruefer/tests/test_news_aggregator.py

263 lines
10 KiB
Python
Raw Normal View History

feat(#170): Aktuelle-Themen-Dashboard — News × Anträge × Pressemitteilungen Vollständiges 4-Phasen-Feature: **Phase 1 — News-Aggregator** (`app/news_aggregator.py`) - Tagesschau-API (`/api2u/news?ressort=...`) für inland/ausland/wirtschaft/wissen - Bundestag-RSS für aktuellethemen / pressemitteilungen / hib - DB-Tabelle `news_articles` (URL-PK, idempotent) - Embeddings via existierender qwen-v4-Pipeline - Cron-Script `scripts/auto-fetch-news.sh` - Bewusst NICHT: RND.de (robots.txt bannt explizit ClaudeBot, GPTBot, CCBot, ChatGPT-User, Google-Extended). Nur AI-erlaubende, öffentlich- rechtliche/parlamentarische Quellen - Volltexte werden NICHT persistiert (nur Titel + erster Satz) **Phase 2 — Themen × Anträge Matching** (`app/themen_matching.py`) - News-Embedding × Assessment-summary_embedding via Cosine-Similarity - `find_anträge_for_news`: pro News die Top-K passenden Anträge - `find_news_for_antrag`: pro Antrag Top-K News mit Datums-Fenster (90d) - `aggregate_top_themen`: primärer Dashboard-Endpoint - `aggregate_themen_zeitreihe`: News-Volumen pro Tag × Source **Phase 3 — Dashboard-View** (`/aktuelle-themen`) - Neuer linker Nav-Eintrag „Aktuelle Themen" - Stacked-Area-Chart News-Volumen pro Quelle (30d) - Pro News-Card: Titel + Summary + Tags + Top-3-Antrags-Match-Liste mit GWÖ-Score-Pill, Drucksache-Link, PM-Vorschlag-Button - Filter: Zeitfenster, Top-N, min_similarity - Auth-protected (require_auth) **Phase 4 — Pressemitteilungs-Generator** (`app/presse_generator.py`) - LLM-Prompt-Template (200-250 Worte, GWÖ-Sicht, JSON-Output) - Reuse von `QwenBewerter` aus app/adapters/qwen_bewerter.py - DB-Tabelle `presse_drafts` (Persistenz) - POST `/api/aktuelle-themen/generate-presse` rate-limited 5/min, auth-only (LLM-Kosten) - GET `/api/aktuelle-themen/drafts` + `/drafts/{id}` für Liste/Detail - Manueller Trigger via UI-Button, kein Auto-Versand - Modal-Anzeige des generierten Texts **Compliance:** - robots.txt-respektierend (ClaudeBot-Bann von RND vermieden, AI- 
erlaubende Quellen verwendet) - UI zeigt nur Titel+URL+Datum+erster Satz, keine Volltext-Reproduktion - Pressemitteilungen sind explizit Drafts, nicht Auto-Versand - LLM-Calls rate-limited, auth-only **Tests:** 43 neue Tests (19 news_aggregator + 16 themen_matching + 8 presse_generator). Suite jetzt 1048 grün. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:39:36 +02:00
"""Tests fuer app.news_aggregator (#170 Phase 1).
Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne
Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt).
"""
from __future__ import annotations
import json
import sqlite3
from pathlib import Path
from unittest.mock import patch
import pytest
from app.news_aggregator import (
_parse_rss_date,
_strip_html,
fetch_rss,
fetch_tagesschau,
upsert_articles,
)
# ─────────────────────────────────────────────────────────────────────────────
# Helper
# ─────────────────────────────────────────────────────────────────────────────
class TestStripHtml:
    """Unit tests for the ``_strip_html`` text-cleaning helper."""

    def test_removes_tags(self):
        """Markup is dropped while the enclosed text is kept."""
        cleaned = _strip_html("<p>Hello <b>world</b></p>")
        assert cleaned == "Hello world"

    def test_decodes_cdata(self):
        """Text wrapped in a CDATA section is still present in the output."""
        cleaned = _strip_html("<![CDATA[Test]]>")
        assert "Test" in cleaned

    def test_decodes_entities(self):
        """HTML entities are turned back into their literal characters."""
        cleaned = _strip_html("a &amp; b")
        assert cleaned == "a & b"

    def test_collapses_whitespace(self):
        """A newline plus space inside the text ends up as a single space."""
        cleaned = _strip_html("<p>a b\n c</p>")
        assert cleaned == "a b c"

    def test_empty(self):
        """An empty input yields an empty output."""
        cleaned = _strip_html("")
        assert cleaned == ""
class TestParseRssDate:
    """Unit tests for the ``_parse_rss_date`` helper."""

    def test_rfc822_to_iso(self):
        """An RFC-822 style timestamp comes back starting with the ISO date."""
        iso_value = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT")
        assert iso_value.startswith("2026-04-28")

    def test_invalid_returns_empty(self):
        """Unparseable and empty inputs both map to the empty string."""
        for raw in ("garbage", ""):
            assert _parse_rss_date(raw) == ""
# ─────────────────────────────────────────────────────────────────────────────
# fetch_tagesschau (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────
# Canned response body used in place of a live Tagesschau API call:
# a UTF-8 encoded JSON document with a "news" array of three entries.
SAMPLE_TAGESSCHAU_JSON = json.dumps(
    dict(
        news=[
            dict(
                title="Bundestag berät über Wohnungsbau",
                firstSentence="Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.",
                shareURL="https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html",
                date="2026-04-28T10:00:00.000+02:00",
                ressort="inland",
                tags=[{"tag": label} for label in ("Wohnungsbau", "Bundestag")],
            ),
            dict(
                title="EU-Kommission stellt Klimapaket vor",
                firstSentence="Die EU plant ehrgeizige Klimaziele.",
                shareURL="https://www.tagesschau.de/ausland/eu-klima-100.html",
                date="2026-04-28T11:00:00.000+02:00",
                ressort="ausland",
                tags=[{"tag": label} for label in ("Klima", "EU")],
            ),
            # This entry has no shareURL; the tests expect it to be skipped.
            dict(title="Kein Link", firstSentence="Skip mich"),
        ],
    )
).encode()
class TestFetchTagesschau:
    """Tests for ``fetch_tagesschau`` with ``_http_get`` patched out.

    No live HTTP request is made: ``app.news_aggregator._http_get`` is
    replaced by a mock that returns either the canned JSON payload or
    ``None``.
    """

    def test_parses_news_array(self):
        """The two linked sample items are returned with the mapped fields."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland"])
        # The payload holds three items, one of them without a URL -> 2 unique.
        assert len(articles) == 2
        first = articles[0]
        assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html"
        assert first["titel"] == "Bundestag berät über Wohnungsbau"
        assert "Wohnungsbau" in first["summary"]
        assert first["source"] == "tagesschau"
        assert first["ressort"] == "inland"
        assert "Wohnungsbau" in first["tags"]

    def test_skips_items_without_link(self):
        """The sample item lacking a shareURL must not show up in the result."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland"])
        # Guard against a vacuous pass: all() over an empty list is True.
        assert articles, "expected the linked sample items to be returned"
        assert all(a["url"] for a in articles)
        assert "Kein Link" not in [a["titel"] for a in articles]

    def test_returns_empty_on_http_error(self):
        """When ``_http_get`` yields ``None`` the result is an empty list."""
        with patch("app.news_aggregator._http_get", return_value=None):
            articles = fetch_tagesschau(ressorts=["inland"])
        assert articles == []

    def test_dedup_across_ressorts(self):
        """If the same item appears in two ressorts it is delivered only once."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland", "ausland"])
        urls = [a["url"] for a in articles]
        # Guard against a vacuous pass: an empty list trivially has no duplicates.
        assert urls, "expected articles to be returned for the mocked payload"
        assert len(urls) == len(set(urls))
# ─────────────────────────────────────────────────────────────────────────────
# fetch_rss (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────
# Canned RSS 2.0 feed with two items, used in place of a live feed download.
# The first item wraps its title and description in CDATA sections.
SAMPLE_RSS = "\n".join(
    (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<rss version="2.0"><channel><title>BT Aktuell</title>',
        "<item>",
        "<title><![CDATA[Bundestag berät Antrag zum Wohnungsbau]]></title>",
        "<link>https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388</link>",
        "<description><![CDATA[Der Bundestag hat heute den Antrag zum Wohnungsbau beraten.]]></description>",
        "<pubDate>Tue, 28 Apr 2026 10:45:12 GMT</pubDate>",
        "</item>",
        "<item>",
        "<title>Antrag zur Klimapolitik</title>",
        "<link>https://www.bundestag.de/klima</link>",
        "<description>Klimaschutz im Bundestag</description>",
        "<pubDate>Mon, 27 Apr 2026 10:00:00 GMT</pubDate>",
        "</item>",
        "</channel></rss>",
    )
).encode("utf-8")
class TestFetchRss:
    """Tests for ``fetch_rss`` with ``_http_get`` patched out."""

    @staticmethod
    def _fetch_with_payload(payload, source="bundestag-aktuell"):
        """Run ``fetch_rss`` while ``_http_get`` is mocked to return *payload*."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_rss(source, "https://example.com/rss")

    def test_parses_rss_items(self):
        """Both sample items are parsed; the first carries the expected fields."""
        parsed = self._fetch_with_payload(SAMPLE_RSS)
        assert len(parsed) == 2
        head = parsed[0]
        assert "Wohnungsbau" in head["titel"]
        assert head["url"].startswith("https://www.bundestag.de")
        assert head["source"] == "bundestag-aktuell"
        assert head["datum"].startswith("2026-04-28")
        assert "Bundestag" in head["summary"]

    def test_strips_cdata_and_html(self):
        """No CDATA opening marker is left in any title or summary."""
        parsed = self._fetch_with_payload(SAMPLE_RSS)
        for entry in parsed:
            for field in ("titel", "summary"):
                assert "<![CDATA[" not in entry[field]

    def test_empty_on_http_error(self):
        """When ``_http_get`` yields ``None`` the result is an empty list."""
        parsed = self._fetch_with_payload(None, source="x")
        assert parsed == []

    def test_skips_items_without_title_or_link(self):
        """Items that have only a title or only a link are not returned."""
        incomplete_feed = (
            b'<?xml version="1.0"?><rss><channel>\n'
            b"<item><title>Nur Titel</title></item>\n"
            b"<item><link>nur-link</link></item>\n"
            b"</channel></rss>"
        )
        parsed = self._fetch_with_payload(incomplete_feed, source="x")
        assert parsed == []
# ─────────────────────────────────────────────────────────────────────────────
# upsert_articles
# ─────────────────────────────────────────────────────────────────────────────
@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a SQLite file containing an empty ``news_articles`` table.

    Args:
        tmp_path: Directory supplied by the test runner; the database file
            ``test_news.db`` is created inside it.

    Returns:
        Path to the newly created database file.
    """
    db = tmp_path / "test_news.db"
    schema = (
        "\n"
        "CREATE TABLE news_articles (\n"
        "url TEXT PRIMARY KEY,\n"
        "titel TEXT NOT NULL,\n"
        "summary TEXT,\n"
        "datum TEXT NOT NULL,\n"
        "source TEXT NOT NULL,\n"
        "ressort TEXT,\n"
        "tags TEXT,\n"
        "summary_embedding BLOB,\n"
        "embedding_model TEXT,\n"
        "fetched_at TEXT NOT NULL DEFAULT (datetime('now'))\n"
        ")\n"
    )
    conn = sqlite3.connect(str(db))
    try:
        conn.execute(schema)
        conn.commit()
    finally:
        # Close the handle even if schema creation fails, so the file is
        # not left open for the rest of the test session.
        conn.close()
    return db
# Two already-parsed article records used as input for the upsert tests.
SAMPLE_ARTICLES = [
    dict(
        url="https://example.com/a",
        titel="Wohnungsbau",
        summary="Heute im Bundestag",
        datum="2026-04-28",
        source="tagesschau",
        ressort="inland",
        tags=["Wohnungsbau"],
    ),
    dict(
        url="https://example.com/b",
        titel="Klima",
        summary="EU plant Klimaziele",
        datum="2026-04-28",
        source="tagesschau",
        ressort="ausland",
        tags=["Klima", "EU"],
    ),
]
class TestUpsertArticles:
    """Tests for ``upsert_articles`` against a temporary SQLite database."""

    @staticmethod
    def _fetch_row(db_path, query, params):
        """Run *query* with *params* on *db_path* and return the first row.

        The connection is closed in ``finally`` so a failing query does not
        leave an open handle behind.
        """
        conn = sqlite3.connect(str(db_path))
        try:
            return conn.execute(query, params).fetchone()
        finally:
            conn.close()

    def test_inserts_new_articles(self, empty_db):
        """Both sample articles are counted as inserted on an empty table."""
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert stats["inserted"] == 2
        assert stats["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        """A second run with the same URLs counts as updates and changes titel."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Re-run with same URLs but different titel
        modified = [{**a, "titel": a["titel"] + " (neu)"} for a in SAMPLE_ARTICLES]
        stats = upsert_articles(modified, db_path=empty_db, embed=False)
        assert stats["updated"] == 2
        assert stats["inserted"] == 0
        # Verify the title was updated
        row = self._fetch_row(
            empty_db,
            "SELECT titel FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        # Fail with a clear assertion instead of a TypeError on row[0].
        assert row is not None, "article row missing after upsert"
        assert row[0].endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        """The tags column holds JSON that decodes to the original tag list."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        row = self._fetch_row(
            empty_db,
            "SELECT tags FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        # Fail with a clear assertion instead of a TypeError on row[0].
        assert row is not None, "article row missing after upsert"
        tags = json.loads(row[0])
        assert tags == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        """A database path that does not exist yields all-zero statistics."""
        stats = upsert_articles(
            SAMPLE_ARTICLES, db_path=tmp_path / "missing.db", embed=False
        )
        assert stats == {"inserted": 0, "updated": 0, "embedded": 0}