"""Tests fuer app.news_aggregator (#170 Phase 1). Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt). """ from __future__ import annotations import json import sqlite3 from pathlib import Path from unittest.mock import patch import pytest from app.news_aggregator import ( _parse_rss_date, _strip_html, fetch_rss, fetch_tagesschau, upsert_articles, ) # ───────────────────────────────────────────────────────────────────────────── # Helper # ───────────────────────────────────────────────────────────────────────────── class TestStripHtml: def test_removes_tags(self): assert _strip_html("
Hello world
") == "Hello world" def test_decodes_cdata(self): assert "Test" in _strip_html("") def test_decodes_entities(self): assert _strip_html("a & b") == "a & b" def test_collapses_whitespace(self): assert _strip_html("a b\n c
") == "a b c" def test_empty(self): assert _strip_html("") == "" class TestParseRssDate: def test_rfc822_to_iso(self): result = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT") assert result.startswith("2026-04-28") def test_invalid_returns_empty(self): assert _parse_rss_date("garbage") == "" assert _parse_rss_date("") == "" # ───────────────────────────────────────────────────────────────────────────── # fetch_tagesschau (mocked HTTP) # ───────────────────────────────────────────────────────────────────────────── SAMPLE_TAGESSCHAU_JSON = json.dumps({ "news": [ { "title": "Bundestag berät über Wohnungsbau", "firstSentence": "Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.", "shareURL": "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html", "date": "2026-04-28T10:00:00.000+02:00", "ressort": "inland", "tags": [{"tag": "Wohnungsbau"}, {"tag": "Bundestag"}], }, { "title": "EU-Kommission stellt Klimapaket vor", "firstSentence": "Die EU plant ehrgeizige Klimaziele.", "shareURL": "https://www.tagesschau.de/ausland/eu-klima-100.html", "date": "2026-04-28T11:00:00.000+02:00", "ressort": "ausland", "tags": [{"tag": "Klima"}, {"tag": "EU"}], }, { # Dieser hat keinen shareURL — sollte uebersprungen werden "title": "Kein Link", "firstSentence": "Skip mich", }, ], }).encode("utf-8") class TestFetchTagesschau: def test_parses_news_array(self): with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON): articles = fetch_tagesschau(ressorts=["inland"]) # Deduplication ueber URL → 2 unique assert len(articles) == 2 first = articles[0] assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html" assert first["titel"] == "Bundestag berät über Wohnungsbau" assert "Wohnungsbau" in first["summary"] assert first["source"] == "tagesschau" assert first["ressort"] == "inland" assert "Wohnungsbau" in first["tags"] def test_skips_items_without_link(self): with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON): articles = fetch_tagesschau(ressorts=["inland"]) assert all(a["url"] for a in articles) def test_returns_empty_on_http_error(self): with patch("app.news_aggregator._http_get", return_value=None): articles = fetch_tagesschau(ressorts=["inland"]) assert articles == [] def test_dedup_across_ressorts(self): """Wenn dasselbe Item in zwei Ressorts erscheint, wird es nur 1× geliefert.""" with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON): articles = fetch_tagesschau(ressorts=["inland", "ausland"]) urls = [a["url"] for a in articles] assert len(urls) == len(set(urls)) # ───────────────────────────────────────────────────────────────────────────── # fetch_rss (mocked HTTP) # ───────────────────────────────────────────────────────────────────────────── SAMPLE_RSS = """