"""Tests fuer app.news_aggregator (#170 Phase 1). Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt). """ from __future__ import annotations import json import sqlite3 from pathlib import Path from unittest.mock import patch import pytest from app.news_aggregator import ( _parse_rss_date, _strip_html, fetch_rss, fetch_tagesschau, upsert_articles, ) # ───────────────────────────────────────────────────────────────────────────── # Helper # ───────────────────────────────────────────────────────────────────────────── class TestStripHtml: def test_removes_tags(self): assert _strip_html("

Hello world

") == "Hello world" def test_decodes_cdata(self): assert "Test" in _strip_html("") def test_decodes_entities(self): assert _strip_html("a & b") == "a & b" def test_collapses_whitespace(self): assert _strip_html("

a b\n c

") == "a b c" def test_empty(self): assert _strip_html("") == "" class TestParseRssDate: def test_rfc822_to_iso(self): result = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT") assert result.startswith("2026-04-28") def test_invalid_returns_empty(self): assert _parse_rss_date("garbage") == "" assert _parse_rss_date("") == "" # ───────────────────────────────────────────────────────────────────────────── # fetch_tagesschau (mocked HTTP) # ───────────────────────────────────────────────────────────────────────────── SAMPLE_TAGESSCHAU_JSON = json.dumps({ "news": [ { "title": "Bundestag berät über Wohnungsbau", "firstSentence": "Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.", "shareURL": "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html", "date": "2026-04-28T10:00:00.000+02:00", "ressort": "inland", "tags": [{"tag": "Wohnungsbau"}, {"tag": "Bundestag"}], }, { "title": "EU-Kommission stellt Klimapaket vor", "firstSentence": "Die EU plant ehrgeizige Klimaziele.", "shareURL": "https://www.tagesschau.de/ausland/eu-klima-100.html", "date": "2026-04-28T11:00:00.000+02:00", "ressort": "ausland", "tags": [{"tag": "Klima"}, {"tag": "EU"}], }, { # Dieser hat keinen shareURL — sollte uebersprungen werden "title": "Kein Link", "firstSentence": "Skip mich", }, ], }).encode("utf-8") class TestFetchTagesschau: def test_parses_news_array(self): with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON): articles = fetch_tagesschau(ressorts=["inland"]) # Deduplication ueber URL → 2 unique assert len(articles) == 2 first = articles[0] assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html" assert first["titel"] == "Bundestag berät über Wohnungsbau" assert "Wohnungsbau" in first["summary"] assert first["source"] == "tagesschau" assert first["ressort"] == "inland" assert "Wohnungsbau" in first["tags"] def test_skips_items_without_link(self): with patch("app.news_aggregator._http_get", 
return_value=SAMPLE_TAGESSCHAU_JSON): articles = fetch_tagesschau(ressorts=["inland"]) assert all(a["url"] for a in articles) def test_returns_empty_on_http_error(self): with patch("app.news_aggregator._http_get", return_value=None): articles = fetch_tagesschau(ressorts=["inland"]) assert articles == [] def test_dedup_across_ressorts(self): """Wenn dasselbe Item in zwei Ressorts erscheint, wird es nur 1× geliefert.""" with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON): articles = fetch_tagesschau(ressorts=["inland", "ausland"]) urls = [a["url"] for a in articles] assert len(urls) == len(set(urls)) # ───────────────────────────────────────────────────────────────────────────── # fetch_rss (mocked HTTP) # ───────────────────────────────────────────────────────────────────────────── SAMPLE_RSS = """ BT Aktuell <![CDATA[Bundestag berät Antrag zum Wohnungsbau]]> https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388 Tue, 28 Apr 2026 10:45:12 GMT Antrag zur Klimapolitik https://www.bundestag.de/klima Klimaschutz im Bundestag Mon, 27 Apr 2026 10:00:00 GMT """.encode("utf-8") class TestFetchRss: def test_parses_rss_items(self): with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS): articles = fetch_rss("bundestag-aktuell", "https://example.com/rss") assert len(articles) == 2 first = articles[0] assert "Wohnungsbau" in first["titel"] assert first["url"].startswith("https://www.bundestag.de") assert first["source"] == "bundestag-aktuell" assert first["datum"].startswith("2026-04-28") assert "Bundestag" in first["summary"] def test_strips_cdata_and_html(self): with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS): articles = fetch_rss("bundestag-aktuell", "https://example.com/rss") for a in articles: assert " Nur Titel nur-link """ with patch("app.news_aggregator._http_get", return_value=bad): articles = fetch_rss("x", "https://example.com/rss") assert articles == [] # 
# ─────────────────────────────────────────────────────────────────────────────
# upsert_articles
# ─────────────────────────────────────────────────────────────────────────────


def _fetch_one(db_path: Path, sql: str, params: tuple = ()):
    """Run *sql* against the SQLite file at *db_path* and return the first row.

    The connection is closed in a ``finally`` clause so a failing query does
    not leave the database file open.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        return conn.execute(sql, params).fetchone()
    finally:
        conn.close()


@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a SQLite file containing an empty ``news_articles`` table.

    Returns the path of the database file inside pytest's ``tmp_path``.
    """
    db = tmp_path / "test_news.db"
    conn = sqlite3.connect(str(db))
    try:
        conn.execute("""
            CREATE TABLE news_articles (
                url TEXT PRIMARY KEY,
                titel TEXT NOT NULL,
                summary TEXT,
                datum TEXT NOT NULL,
                source TEXT NOT NULL,
                ressort TEXT,
                tags TEXT,
                summary_embedding BLOB,
                embedding_model TEXT,
                fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        """)
        conn.commit()
    finally:
        # Close even when the DDL fails, so the handle is never leaked.
        conn.close()
    return db


SAMPLE_ARTICLES = [
    {
        "url": "https://example.com/a",
        "titel": "Wohnungsbau",
        "summary": "Heute im Bundestag",
        "datum": "2026-04-28",
        "source": "tagesschau",
        "ressort": "inland",
        "tags": ["Wohnungsbau"],
    },
    {
        "url": "https://example.com/b",
        "titel": "Klima",
        "summary": "EU plant Klimaziele",
        "datum": "2026-04-28",
        "source": "tagesschau",
        "ressort": "ausland",
        "tags": ["Klima", "EU"],
    },
]


class TestUpsertArticles:
    """Checks for ``upsert_articles`` against a temporary SQLite database."""

    def test_inserts_new_articles(self, empty_db):
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert stats["inserted"] == 2
        assert stats["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Re-run with the same URLs but a different titel.
        modified = [{**a, "titel": a["titel"] + " (neu)"} for a in SAMPLE_ARTICLES]
        stats = upsert_articles(modified, db_path=empty_db, embed=False)
        assert stats["updated"] == 2
        assert stats["inserted"] == 0
        # Verify the stored title was actually overwritten.
        row = _fetch_one(
            empty_db,
            "SELECT titel FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        assert row[0].endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        row = _fetch_one(
            empty_db,
            "SELECT tags FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        tags = json.loads(row[0])
        assert tags == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        stats = upsert_articles(
            SAMPLE_ARTICLES, db_path=tmp_path / "missing.db", embed=False
        )
        assert stats == {"inserted": 0, "updated": 0, "embedded": 0}