Vollständiges 4-Phasen-Feature:
**Phase 1 — News-Aggregator** (`app/news_aggregator.py`)
- Tagesschau-API (`/api2u/news?ressort=...`) für inland/ausland/wirtschaft/wissen
- Bundestag-RSS für aktuellethemen / pressemitteilungen / hib
- DB-Tabelle `news_articles` (URL-PK, idempotent)
- Embeddings via existierender qwen-v4-Pipeline
- Cron-Script `scripts/auto-fetch-news.sh`
- Bewusst NICHT: RND.de (robots.txt bannt explizit ClaudeBot, GPTBot,
CCBot, ChatGPT-User, Google-Extended). Nur AI-erlaubende, öffentlich-
rechtliche/parlamentarische Quellen
- Volltexte werden NICHT persistiert (nur Titel + erster Satz)
**Phase 2 — Themen × Anträge Matching** (`app/themen_matching.py`)
- News-Embedding × Assessment-summary_embedding via Cosine-Similarity
- `find_anträge_for_news`: pro News die Top-K passenden Anträge
- `find_news_for_antrag`: pro Antrag Top-K News mit Datums-Fenster (90d)
- `aggregate_top_themen`: primärer Dashboard-Endpoint
- `aggregate_themen_zeitreihe`: News-Volumen pro Tag × Source
**Phase 3 — Dashboard-View** (`/aktuelle-themen`)
- Neuer linker Nav-Eintrag „Aktuelle Themen“
- Stacked-Area-Chart News-Volumen pro Quelle (30d)
- Pro News-Card: Titel + Summary + Tags + Top-3-Antrags-Match-Liste
mit GWÖ-Score-Pill, Drucksache-Link, PM-Vorschlag-Button
- Filter: Zeitfenster, Top-N, min_similarity
- Auth-protected (require_auth)
**Phase 4 — Pressemitteilungs-Generator** (`app/presse_generator.py`)
- LLM-Prompt-Template (200-250 Worte, GWÖ-Sicht, JSON-Output)
- Reuse von `QwenBewerter` aus app/adapters/qwen_bewerter.py
- DB-Tabelle `presse_drafts` (Persistenz)
- POST `/api/aktuelle-themen/generate-presse` rate-limited 5/min,
auth-only (LLM-Kosten)
- GET `/api/aktuelle-themen/drafts` + `/drafts/{id}` für Liste/Detail
- Manueller Trigger via UI-Button, kein Auto-Versand
- Modal-Anzeige des generierten Texts
**Compliance:**
- robots.txt-respektierend (ClaudeBot-Bann von RND vermieden, AI-
erlaubende Quellen verwendet)
- UI zeigt nur Titel+URL+Datum+erster Satz, keine Volltext-Reproduktion
- Pressemitteilungen sind explizit Drafts, nicht Auto-Versand
- LLM-Calls rate-limited, auth-only
**Tests:** 43 neue Tests (19 news_aggregator + 16 themen_matching +
8 presse_generator). Suite jetzt 1048 grün.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
263 lines
10 KiB
Python
263 lines
10 KiB
Python
"""Tests fuer app.news_aggregator (#170 Phase 1).
|
||
|
||
Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne
|
||
Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt).
|
||
"""
|
||
from __future__ import annotations

import json
import sqlite3
from contextlib import closing
from pathlib import Path
from unittest.mock import patch

import pytest

from app.news_aggregator import (
    _parse_rss_date,
    _strip_html,
    fetch_rss,
    fetch_tagesschau,
    upsert_articles,
)
# ─────────────────────────────────────────────────────────────────────────────
# Helper
# ─────────────────────────────────────────────────────────────────────────────
class TestStripHtml:
    """Unit tests for the ``_strip_html`` text-cleanup helper."""

    def test_removes_tags(self):
        """Markup is dropped while the enclosed text is kept."""
        assert _strip_html("<p>Hello <b>world</b></p>") == "Hello world"

    def test_decodes_cdata(self):
        """The payload of a CDATA section survives stripping."""
        assert "Test" in _strip_html("<![CDATA[Test]]>")

    def test_decodes_entities(self):
        """A named HTML entity is decoded to its literal character."""
        # The input must contain a real entity: with a bare "&" the assertion
        # would pass without any decoding taking place.
        assert _strip_html("a &amp; b") == "a & b"

    def test_collapses_whitespace(self):
        """Runs of spaces and newlines collapse into single spaces."""
        # Multi-character whitespace runs are used on purpose so that a
        # simple newline-to-space replacement would not satisfy the test.
        assert _strip_html("<p>a   b\n   c</p>") == "a b c"

    def test_empty(self):
        """The empty string is returned unchanged."""
        assert _strip_html("") == ""
class TestParseRssDate:
    """Unit tests for the ``_parse_rss_date`` helper."""

    def test_rfc822_to_iso(self):
        """An RFC 822 timestamp yields a string that starts with the ISO date."""
        result = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT")
        assert result.startswith("2026-04-28")

    def test_invalid_returns_empty(self):
        """Unparseable and empty inputs both map to the empty string."""
        assert _parse_rss_date("garbage") == ""
        assert _parse_rss_date("") == ""
# ─────────────────────────────────────────────────────────────────────────────
# fetch_tagesschau (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────
# Canned response body returned by the patched ``_http_get`` in the tests
# below: two complete news items plus one item that lacks ``shareURL``.
SAMPLE_TAGESSCHAU_JSON = json.dumps({
    "news": [
        {
            "title": "Bundestag berät über Wohnungsbau",
            "firstSentence": "Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.",
            "shareURL": "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html",
            "date": "2026-04-28T10:00:00.000+02:00",
            "ressort": "inland",
            "tags": [{"tag": "Wohnungsbau"}, {"tag": "Bundestag"}],
        },
        {
            "title": "EU-Kommission stellt Klimapaket vor",
            "firstSentence": "Die EU plant ehrgeizige Klimaziele.",
            "shareURL": "https://www.tagesschau.de/ausland/eu-klima-100.html",
            "date": "2026-04-28T11:00:00.000+02:00",
            "ressort": "ausland",
            "tags": [{"tag": "Klima"}, {"tag": "EU"}],
        },
        {
            # This item has no shareURL — it should be skipped.
            "title": "Kein Link",
            "firstSentence": "Skip mich",
        },
    ],
}).encode("utf-8")
class TestFetchTagesschau:
    """Tests for ``fetch_tagesschau`` with ``_http_get`` patched out."""

    def test_parses_news_array(self):
        """Fields of the first sample item are mapped onto the article dict."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland"])
        # Deduplication by URL → 2 unique
        assert len(articles) == 2
        first = articles[0]
        assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html"
        assert first["titel"] == "Bundestag berät über Wohnungsbau"
        assert "Wohnungsbau" in first["summary"]
        assert first["source"] == "tagesschau"
        assert first["ressort"] == "inland"
        assert "Wohnungsbau" in first["tags"]

    def test_skips_items_without_link(self):
        """The sample item that lacks ``shareURL`` does not appear in the result."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland"])
        # Guard against a vacuous pass: all() over an empty list is True.
        assert articles
        assert all(a["url"] for a in articles)
        assert "Kein Link" not in [a["titel"] for a in articles]

    def test_returns_empty_on_http_error(self):
        """A ``None`` response from ``_http_get`` yields an empty list."""
        with patch("app.news_aggregator._http_get", return_value=None):
            articles = fetch_tagesschau(ressorts=["inland"])
        assert articles == []

    def test_dedup_across_ressorts(self):
        """If the same item appears in two ressorts it is delivered only once."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland", "ausland"])
        urls = [a["url"] for a in articles]
        # An empty result would trivially satisfy the uniqueness check below.
        assert urls
        assert len(urls) == len(set(urls))
# ─────────────────────────────────────────────────────────────────────────────
# fetch_rss (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────
# Canned RSS 2.0 feed with two items: the first wraps title and description
# in CDATA sections, the second uses plain text.
SAMPLE_RSS = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>BT Aktuell</title>
<item>
<title><![CDATA[Bundestag berät Antrag zum Wohnungsbau]]></title>
<link>https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388</link>
<description><![CDATA[Der Bundestag hat heute den Antrag zum Wohnungsbau beraten.]]></description>
<pubDate>Tue, 28 Apr 2026 10:45:12 GMT</pubDate>
</item>
<item>
<title>Antrag zur Klimapolitik</title>
<link>https://www.bundestag.de/klima</link>
<description>Klimaschutz im Bundestag</description>
<pubDate>Mon, 27 Apr 2026 10:00:00 GMT</pubDate>
</item>
</channel></rss>""".encode("utf-8")
class TestFetchRss:
    """Tests for ``fetch_rss`` with ``_http_get`` patched out."""

    def test_parses_rss_items(self):
        """Both sample items are returned and the first one is mapped correctly."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS):
            articles = fetch_rss("bundestag-aktuell", "https://example.com/rss")
        assert len(articles) == 2
        first = articles[0]
        assert "Wohnungsbau" in first["titel"]
        assert first["url"].startswith("https://www.bundestag.de")
        assert first["source"] == "bundestag-aktuell"
        assert first["datum"].startswith("2026-04-28")
        assert "Bundestag" in first["summary"]

    def test_strips_cdata_and_html(self):
        """No CDATA wrapper leaks into the title or summary of any article."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS):
            articles = fetch_rss("bundestag-aktuell", "https://example.com/rss")
        # Without this guard the loop below would pass on an empty result.
        assert articles
        for a in articles:
            assert "<![CDATA[" not in a["titel"]
            assert "<![CDATA[" not in a["summary"]

    def test_empty_on_http_error(self):
        """A ``None`` response from ``_http_get`` yields an empty list."""
        with patch("app.news_aggregator._http_get", return_value=None):
            articles = fetch_rss("x", "https://example.com/rss")
        assert articles == []

    def test_skips_items_without_title_or_link(self):
        """Items that carry only a title or only a link are dropped."""
        bad = b"""<?xml version="1.0"?><rss><channel>
        <item><title>Nur Titel</title></item>
        <item><link>nur-link</link></item>
        </channel></rss>"""
        with patch("app.news_aggregator._http_get", return_value=bad):
            articles = fetch_rss("x", "https://example.com/rss")
        assert articles == []
# ─────────────────────────────────────────────────────────────────────────────
# upsert_articles
# ─────────────────────────────────────────────────────────────────────────────
@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a SQLite file that contains an empty ``news_articles`` table.

    Args:
        tmp_path: Directory in which the database file is created.

    Returns:
        Path of the newly created ``test_news.db`` file.
    """
    db = tmp_path / "test_news.db"
    # closing() releases the handle even when the DDL raises; sqlite3's own
    # context manager only scopes the transaction and never closes.
    with closing(sqlite3.connect(str(db))) as conn:
        conn.execute("""
            CREATE TABLE news_articles (
                url TEXT PRIMARY KEY,
                titel TEXT NOT NULL,
                summary TEXT,
                datum TEXT NOT NULL,
                source TEXT NOT NULL,
                ressort TEXT,
                tags TEXT,
                summary_embedding BLOB,
                embedding_model TEXT,
                fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        """)
        conn.commit()
    return db
# Two already-parsed article dicts used as input for the upsert tests below.
SAMPLE_ARTICLES = [
    {
        "url": "https://example.com/a",
        "titel": "Wohnungsbau",
        "summary": "Heute im Bundestag",
        "datum": "2026-04-28",
        "source": "tagesschau",
        "ressort": "inland",
        "tags": ["Wohnungsbau"],
    },
    {
        "url": "https://example.com/b",
        "titel": "Klima",
        "summary": "EU plant Klimaziele",
        "datum": "2026-04-28",
        "source": "tagesschau",
        "ressort": "ausland",
        "tags": ["Klima", "EU"],
    },
]
class TestUpsertArticles:
    """Tests for ``upsert_articles`` against a temporary SQLite database."""

    def test_inserts_new_articles(self, empty_db):
        """Writing into an empty table reports every article as inserted."""
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert stats["inserted"] == 2
        assert stats["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        """A second run with the same URLs updates rows instead of inserting."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Re-run with same URLs but different titel
        modified = [{**a, "titel": a["titel"] + " (neu)"} for a in SAMPLE_ARTICLES]
        stats = upsert_articles(modified, db_path=empty_db, embed=False)
        assert stats["updated"] == 2
        assert stats["inserted"] == 0
        # Verify the title was updated; closing() releases the handle even
        # if the query raises.
        with closing(sqlite3.connect(str(empty_db))) as conn:
            row = conn.execute(
                "SELECT titel FROM news_articles WHERE url=?",
                (SAMPLE_ARTICLES[0]["url"],),
            ).fetchone()
        assert row[0].endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        """The ``tags`` column holds a JSON document that decodes to the tag list."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        with closing(sqlite3.connect(str(empty_db))) as conn:
            row = conn.execute(
                "SELECT tags FROM news_articles WHERE url=?",
                (SAMPLE_ARTICLES[0]["url"],),
            ).fetchone()
        tags = json.loads(row[0])
        assert tags == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        """A database path that does not exist yields all-zero statistics."""
        stats = upsert_articles(
            SAMPLE_ARTICLES,
            db_path=tmp_path / "missing.db",
            embed=False,
        )
        assert stats == {"inserted": 0, "updated": 0, "embedded": 0}