gwoe-antragspruefer/tests/test_news_aggregator.py
Dotty Dotter d54ce23e42 feat(#170): Aktuelle-Themen-Dashboard — News × Anträge × Pressemitteilungen
Vollständiges 4-Phasen-Feature:

**Phase 1 — News-Aggregator** (`app/news_aggregator.py`)
- Tagesschau-API (`/api2u/news?ressort=...`) für inland/ausland/wirtschaft/wissen
- Bundestag-RSS für aktuellethemen / pressemitteilungen / hib
- DB-Tabelle `news_articles` (URL-PK, idempotent)
- Embeddings via existierender qwen-v4-Pipeline
- Cron-Script `scripts/auto-fetch-news.sh`
- Bewusst NICHT: RND.de (robots.txt bannt explizit ClaudeBot, GPTBot,
  CCBot, ChatGPT-User, Google-Extended). Nur AI-erlaubende, öffentlich-
  rechtliche/parlamentarische Quellen
- Volltexte werden NICHT persistiert (nur Titel + erster Satz)

**Phase 2 — Themen × Anträge Matching** (`app/themen_matching.py`)
- News-Embedding × Assessment-summary_embedding via Cosine-Similarity
- `find_anträge_for_news`: pro News die Top-K passenden Anträge
- `find_news_for_antrag`: pro Antrag Top-K News mit Datums-Fenster (90d)
- `aggregate_top_themen`: primärer Dashboard-Endpoint
- `aggregate_themen_zeitreihe`: News-Volumen pro Tag × Source

**Phase 3 — Dashboard-View** (`/aktuelle-themen`)
- Neuer linker Nav-Eintrag „Aktuelle Themen"
- Stacked-Area-Chart News-Volumen pro Quelle (30d)
- Pro News-Card: Titel + Summary + Tags + Top-3-Antrags-Match-Liste
  mit GWÖ-Score-Pill, Drucksache-Link, PM-Vorschlag-Button
- Filter: Zeitfenster, Top-N, min_similarity
- Auth-protected (require_auth)

**Phase 4 — Pressemitteilungs-Generator** (`app/presse_generator.py`)
- LLM-Prompt-Template (200-250 Worte, GWÖ-Sicht, JSON-Output)
- Reuse von `QwenBewerter` aus app/adapters/qwen_bewerter.py
- DB-Tabelle `presse_drafts` (Persistenz)
- POST `/api/aktuelle-themen/generate-presse` rate-limited 5/min,
  auth-only (LLM-Kosten)
- GET `/api/aktuelle-themen/drafts` + `/drafts/{id}` für Liste/Detail
- Manueller Trigger via UI-Button, kein Auto-Versand
- Modal-Anzeige des generierten Texts

**Compliance:**
- robots.txt-respektierend (ClaudeBot-Bann von RND vermieden, AI-
  erlaubende Quellen verwendet)
- UI zeigt nur Titel+URL+Datum+erster Satz, keine Volltext-Reproduktion
- Pressemitteilungen sind explizit Drafts, nicht Auto-Versand
- LLM-Calls rate-limited, auth-only

**Tests:** 43 neue Tests (19 news_aggregator + 16 themen_matching +
8 presse_generator). Suite jetzt 1048 grün.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:39:36 +02:00

263 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests fuer app.news_aggregator (#170 Phase 1).
Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne
Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt).
"""
from __future__ import annotations
import json
import sqlite3
from pathlib import Path
from unittest.mock import patch
import pytest
from app.news_aggregator import (
_parse_rss_date,
_strip_html,
fetch_rss,
fetch_tagesschau,
upsert_articles,
)
# ─────────────────────────────────────────────────────────────────────────────
# Helper
# ─────────────────────────────────────────────────────────────────────────────
class TestStripHtml:
    """Unit tests for the ``_strip_html`` text-cleaning helper."""

    def test_removes_tags(self):
        """Markup is dropped while the enclosed text is kept."""
        assert _strip_html("<p>Hello <b>world</b></p>") == "Hello world"

    def test_decodes_cdata(self):
        """The CDATA payload survives and the CDATA wrapper is removed."""
        cleaned = _strip_html("<![CDATA[Test]]>")
        assert "Test" in cleaned
        # A containment check alone would also pass if the input came back
        # untouched, so additionally require the CDATA delimiters to be gone.
        assert "<![CDATA[" not in cleaned
        assert "]]>" not in cleaned

    def test_decodes_entities(self):
        """HTML entities are converted to their literal characters."""
        assert _strip_html("a &amp; b") == "a & b"

    def test_collapses_whitespace(self):
        """Newlines and runs of blanks end up as single spaces."""
        assert _strip_html("<p>a b\n c</p>") == "a b c"

    def test_empty(self):
        """The empty string is returned unchanged."""
        assert _strip_html("") == ""
class TestParseRssDate:
    """Checks for the ``_parse_rss_date`` helper."""

    def test_rfc822_to_iso(self):
        """An RFC-822 style timestamp yields a string starting with the ISO date."""
        parsed = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT")
        assert parsed.startswith("2026-04-28")

    def test_invalid_returns_empty(self):
        """Unparseable text and the empty string both map to ``""``."""
        for raw in ("garbage", ""):
            assert _parse_rss_date(raw) == ""
# ─────────────────────────────────────────────────────────────────────────────
# fetch_tagesschau (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────
# Canned response body (UTF-8 encoded JSON) that the tests hand to the patched
# ``_http_get``. It carries two complete news items plus one item lacking
# ``shareURL``.
SAMPLE_TAGESSCHAU_JSON = json.dumps(
    dict(
        news=[
            dict(
                title="Bundestag berät über Wohnungsbau",
                firstSentence="Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.",
                shareURL="https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html",
                date="2026-04-28T10:00:00.000+02:00",
                ressort="inland",
                tags=[dict(tag=label) for label in ("Wohnungsbau", "Bundestag")],
            ),
            dict(
                title="EU-Kommission stellt Klimapaket vor",
                firstSentence="Die EU plant ehrgeizige Klimaziele.",
                shareURL="https://www.tagesschau.de/ausland/eu-klima-100.html",
                date="2026-04-28T11:00:00.000+02:00",
                ressort="ausland",
                tags=[dict(tag=label) for label in ("Klima", "EU")],
            ),
            # This entry has no shareURL — it is expected to be skipped.
            dict(
                title="Kein Link",
                firstSentence="Skip mich",
            ),
        ],
    )
).encode("utf-8")
class TestFetchTagesschau:
    """Tests for ``fetch_tagesschau`` with the HTTP layer patched out.

    Each test replaces ``app.news_aggregator._http_get`` with a mock, so no
    live request is made.
    """

    def test_parses_news_array(self):
        """Fields of the first parsed article match the sample payload."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland"])
        # Deduplicated by URL -> 2 unique articles
        assert len(articles) == 2
        first = articles[0]
        assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html"
        assert first["titel"] == "Bundestag berät über Wohnungsbau"
        assert "Wohnungsbau" in first["summary"]
        assert first["source"] == "tagesschau"
        assert first["ressort"] == "inland"
        assert "Wohnungsbau" in first["tags"]

    def test_skips_items_without_link(self):
        """The sample item that lacks ``shareURL`` is not returned."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland"])
        # all() over an empty list is True, so first make sure something
        # was actually returned; otherwise this test could pass vacuously.
        assert articles
        assert all(a["url"] for a in articles)
        assert "Kein Link" not in [a["titel"] for a in articles]

    def test_returns_empty_on_http_error(self):
        """A ``None`` result from ``_http_get`` yields an empty list."""
        with patch("app.news_aggregator._http_get", return_value=None):
            articles = fetch_tagesschau(ressorts=["inland"])
        assert articles == []

    def test_dedup_across_ressorts(self):
        """If the same item appears in two ressorts it is returned only once."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland", "ausland"])
        urls = [a["url"] for a in articles]
        # Uniqueness is trivially satisfied by an empty result; rule that out.
        assert urls
        assert len(urls) == len(set(urls))
# ─────────────────────────────────────────────────────────────────────────────
# fetch_rss (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────
# Two-item RSS 2.0 document used as the mocked HTTP body in the fetch_rss
# tests. The first item wraps title/description in CDATA, the second uses
# plain text.
_SAMPLE_RSS_XML = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>BT Aktuell</title>
<item>
<title><![CDATA[Bundestag berät Antrag zum Wohnungsbau]]></title>
<link>https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388</link>
<description><![CDATA[Der Bundestag hat heute den Antrag zum Wohnungsbau beraten.]]></description>
<pubDate>Tue, 28 Apr 2026 10:45:12 GMT</pubDate>
</item>
<item>
<title>Antrag zur Klimapolitik</title>
<link>https://www.bundestag.de/klima</link>
<description>Klimaschutz im Bundestag</description>
<pubDate>Mon, 27 Apr 2026 10:00:00 GMT</pubDate>
</item>
</channel></rss>"""
SAMPLE_RSS = _SAMPLE_RSS_XML.encode("utf-8")
class TestFetchRss:
    """Tests for ``fetch_rss`` with the HTTP layer patched out.

    Each test replaces ``app.news_aggregator._http_get`` with a mock, so no
    live request is made.
    """

    def test_parses_rss_items(self):
        """Both sample items are parsed; the first one's fields are checked."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS):
            articles = fetch_rss("bundestag-aktuell", "https://example.com/rss")
        assert len(articles) == 2
        first = articles[0]
        assert "Wohnungsbau" in first["titel"]
        assert first["url"].startswith("https://www.bundestag.de")
        assert first["source"] == "bundestag-aktuell"
        assert first["datum"].startswith("2026-04-28")
        assert "Bundestag" in first["summary"]

    def test_strips_cdata_and_html(self):
        """No CDATA opener is left in any parsed title or summary."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS):
            articles = fetch_rss("bundestag-aktuell", "https://example.com/rss")
        # The loop below asserts nothing for an empty list, so make sure the
        # feed was actually parsed before checking each article.
        assert articles
        for a in articles:
            assert "<![CDATA[" not in a["titel"]
            assert "<![CDATA[" not in a["summary"]

    def test_empty_on_http_error(self):
        """A ``None`` result from ``_http_get`` yields an empty list."""
        with patch("app.news_aggregator._http_get", return_value=None):
            articles = fetch_rss("x", "https://example.com/rss")
        assert articles == []

    def test_skips_items_without_title_or_link(self):
        """Items that have only a title or only a link are dropped."""
        bad = b"""<?xml version="1.0"?><rss><channel>
<item><title>Nur Titel</title></item>
<item><link>nur-link</link></item>
</channel></rss>"""
        with patch("app.news_aggregator._http_get", return_value=bad):
            articles = fetch_rss("x", "https://example.com/rss")
        assert articles == []
# ─────────────────────────────────────────────────────────────────────────────
# upsert_articles
# ─────────────────────────────────────────────────────────────────────────────
@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a SQLite file holding an empty ``news_articles`` table.

    Args:
        tmp_path: Directory in which the database file is created.

    Returns:
        Path of the created database file ``test_news.db``.
    """
    db = tmp_path / "test_news.db"
    conn = sqlite3.connect(str(db))
    try:
        conn.execute("""
CREATE TABLE news_articles (
url TEXT PRIMARY KEY,
titel TEXT NOT NULL,
summary TEXT,
datum TEXT NOT NULL,
source TEXT NOT NULL,
ressort TEXT,
tags TEXT,
summary_embedding BLOB,
embedding_model TEXT,
fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
)
""")
        conn.commit()
    finally:
        # Close the handle even if the DDL or commit raises, so the
        # database file is never left open by a failed setup.
        conn.close()
    return db
# Two already-parsed article dicts used as input for the upsert tests.
SAMPLE_ARTICLES = [
    dict(
        url="https://example.com/a",
        titel="Wohnungsbau",
        summary="Heute im Bundestag",
        datum="2026-04-28",
        source="tagesschau",
        ressort="inland",
        tags=["Wohnungsbau"],
    ),
    dict(
        url="https://example.com/b",
        titel="Klima",
        summary="EU plant Klimaziele",
        datum="2026-04-28",
        source="tagesschau",
        ressort="ausland",
        tags=["Klima", "EU"],
    ),
]
class TestUpsertArticles:
    """Tests for ``upsert_articles`` against a temporary SQLite database.

    Every call passes ``embed=False``.
    """

    @staticmethod
    def _fetch_one(db_path, sql, params):
        """Run *sql* with *params* on *db_path* and return the first row.

        The connection is closed in ``finally`` so a failing query does not
        leave the database file open.
        """
        conn = sqlite3.connect(str(db_path))
        try:
            return conn.execute(sql, params).fetchone()
        finally:
            conn.close()

    def test_inserts_new_articles(self, empty_db):
        """Two unseen URLs are reported as two inserts and no updates."""
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert stats["inserted"] == 2
        assert stats["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        """Re-submitting known URLs counts as updates and rewrites the title."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Re-run with the same URLs but a different titel
        modified = [{**a, "titel": a["titel"] + " (neu)"} for a in SAMPLE_ARTICLES]
        stats = upsert_articles(modified, db_path=empty_db, embed=False)
        assert stats["updated"] == 2
        assert stats["inserted"] == 0
        # Verify the stored title was replaced
        row = self._fetch_one(
            empty_db,
            "SELECT titel FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        assert row[0].endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        """The ``tags`` column holds a JSON document that decodes to the list."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        row = self._fetch_one(
            empty_db,
            "SELECT tags FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        tags = json.loads(row[0])
        assert tags == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        """A non-existent database path yields all-zero statistics."""
        stats = upsert_articles(SAMPLE_ARTICLES,
                                db_path=tmp_path / "missing.db", embed=False)
        assert stats == {"inserted": 0, "updated": 0, "embedded": 0}