"""Tests for app.news_aggregator (#170 Phase 1).

Exercises the parsers and the DB persistence against controlled fixtures,
without live HTTP calls (the Tagesschau API and the Bundestag RSS feed are
mocked).
"""

from __future__ import annotations

import json
import sqlite3
from contextlib import closing
from pathlib import Path
from unittest.mock import patch

import pytest

from app.news_aggregator import (
    _parse_rss_date,
    _strip_html,
    fetch_rss,
    fetch_tagesschau,
    upsert_articles,
)


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────


class TestStripHtml:
    """Tests for the ``_strip_html`` text-cleaning helper."""

    def test_removes_tags(self):
        """Markup is dropped while the enclosed text is kept."""
        assert _strip_html("<p>Hello <b>world</b></p>") == "Hello world"

    def test_decodes_cdata(self):
        """Text wrapped in a CDATA section survives stripping."""
        assert "Test" in _strip_html("<![CDATA[Test]]>")

    def test_decodes_entities(self):
        """HTML entities are decoded to their literal characters."""
        # The input must contain a real entity; a bare "&" would make this
        # test pass even if no decoding happened at all.
        assert _strip_html("a &amp; b") == "a & b"

    def test_collapses_whitespace(self):
        """Runs of whitespace (including newlines) collapse to one space."""
        assert _strip_html("<p>a b\n c</p>") == "a b c"

    def test_empty(self):
        """An empty string is returned unchanged."""
        assert _strip_html("") == ""


class TestParseRssDate:
    """Tests for the ``_parse_rss_date`` helper."""

    def test_rfc822_to_iso(self):
        """An RFC 822 timestamp yields a string starting with the ISO date."""
        result = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT")
        assert result.startswith("2026-04-28")

    def test_invalid_returns_empty(self):
        """Unparseable or empty input yields an empty string."""
        assert _parse_rss_date("garbage") == ""
        assert _parse_rss_date("") == ""


# ─────────────────────────────────────────────────────────────────────────────
# fetch_tagesschau (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────


# UTF-8 encoded JSON body used as the mocked HTTP response for the
# fetch_tagesschau tests: three news items, the last one without a shareURL.
SAMPLE_TAGESSCHAU_JSON = json.dumps({
    "news": [
        {
            "title": "Bundestag berät über Wohnungsbau",
            "firstSentence": "Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.",
            "shareURL": "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html",
            "date": "2026-04-28T10:00:00.000+02:00",
            "ressort": "inland",
            "tags": [{"tag": "Wohnungsbau"}, {"tag": "Bundestag"}],
        },
        {
            "title": "EU-Kommission stellt Klimapaket vor",
            "firstSentence": "Die EU plant ehrgeizige Klimaziele.",
            "shareURL": "https://www.tagesschau.de/ausland/eu-klima-100.html",
            "date": "2026-04-28T11:00:00.000+02:00",
            "ressort": "ausland",
            "tags": [{"tag": "Klima"}, {"tag": "EU"}],
        },
        {
            # This item has no shareURL and is expected to be skipped.
            "title": "Kein Link",
            "firstSentence": "Skip mich",
        },
    ],
}).encode("utf-8")


class TestFetchTagesschau:
    """Tests for ``fetch_tagesschau`` with the HTTP layer mocked out."""

    @staticmethod
    def _fetch(payload, ressorts):
        """Call ``fetch_tagesschau`` while ``_http_get`` returns *payload*."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_tagesschau(ressorts=ressorts)

    def test_parses_news_array(self):
        """Fields of the first fixture item are mapped onto the article dict."""
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland"])
        # The fixture holds three items, one of them without a shareURL,
        # so two unique URLs are expected.
        assert len(articles) == 2
        first = articles[0]
        assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html"
        assert first["titel"] == "Bundestag berät über Wohnungsbau"
        assert "Wohnungsbau" in first["summary"]
        assert first["source"] == "tagesschau"
        assert first["ressort"] == "inland"
        assert "Wohnungsbau" in first["tags"]

    def test_skips_items_without_link(self):
        """Items lacking a shareURL never show up in the result."""
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland"])
        # Guard against a vacuous pass: all() over an empty list is True.
        assert articles
        assert all(a["url"] for a in articles)
        assert all(a["titel"] != "Kein Link" for a in articles)

    def test_returns_empty_on_http_error(self):
        """A failed HTTP call (``_http_get`` returning None) yields no articles."""
        articles = self._fetch(None, ["inland"])
        assert articles == []

    def test_dedup_across_ressorts(self):
        """An item that appears in two ressorts is delivered only once."""
        articles = self._fetch(SAMPLE_TAGESSCHAU_JSON, ["inland", "ausland"])
        urls = [a["url"] for a in articles]
        # Guard against a vacuous pass on an empty result.
        assert urls
        assert len(urls) == len(set(urls))


# ─────────────────────────────────────────────────────────────────────────────
# fetch_rss (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────


# UTF-8 encoded RSS 2.0 document used as the mocked HTTP response for the
# fetch_rss tests: one item with CDATA-wrapped fields, one with plain text.
SAMPLE_RSS = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>BT Aktuell</title>
<item>
<title><![CDATA[Bundestag berät Antrag zum Wohnungsbau]]></title>
<link>https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388</link>
<description><![CDATA[Der Bundestag hat heute den Antrag zum Wohnungsbau beraten.]]></description>
<pubDate>Tue, 28 Apr 2026 10:45:12 GMT</pubDate>
</item>
<item>
<title>Antrag zur Klimapolitik</title>
<link>https://www.bundestag.de/klima</link>
<description>Klimaschutz im Bundestag</description>
<pubDate>Mon, 27 Apr 2026 10:00:00 GMT</pubDate>
</item>
</channel></rss>""".encode("utf-8")


class TestFetchRss:
    """Tests for ``fetch_rss`` with the HTTP layer mocked out."""

    @staticmethod
    def _fetch(payload, source="bundestag-aktuell"):
        """Call ``fetch_rss`` for *source* while ``_http_get`` returns *payload*."""
        with patch("app.news_aggregator._http_get", return_value=payload):
            return fetch_rss(source, "https://example.com/rss")

    def test_parses_rss_items(self):
        """Both fixture items are parsed; the first one's fields are mapped."""
        articles = self._fetch(SAMPLE_RSS)
        assert len(articles) == 2
        first = articles[0]
        assert "Wohnungsbau" in first["titel"]
        assert first["url"].startswith("https://www.bundestag.de")
        assert first["source"] == "bundestag-aktuell"
        assert first["datum"].startswith("2026-04-28")
        assert "Bundestag" in first["summary"]

    def test_strips_cdata_and_html(self):
        """No CDATA wrapper leaks into the title or summary."""
        articles = self._fetch(SAMPLE_RSS)
        # Guard against a vacuous pass: the loop below checks nothing when
        # the result is empty.
        assert articles
        for article in articles:
            assert "<![CDATA[" not in article["titel"]
            assert "<![CDATA[" not in article["summary"]

    def test_empty_on_http_error(self):
        """A failed HTTP call (``_http_get`` returning None) yields no articles."""
        articles = self._fetch(None, source="x")
        assert articles == []

    def test_skips_items_without_title_or_link(self):
        """Items missing either the title or the link are dropped."""
        bad = (
            b'<?xml version="1.0"?><rss><channel>\n'
            b"<item><title>Nur Titel</title></item>\n"
            b"<item><link>nur-link</link></item>\n"
            b"</channel></rss>"
        )
        articles = self._fetch(bad, source="x")
        assert articles == []


# ─────────────────────────────────────────────────────────────────────────────
# upsert_articles
# ─────────────────────────────────────────────────────────────────────────────


@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a SQLite file with an empty ``news_articles`` table.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        Path to the freshly created database file inside *tmp_path*.
    """
    db = tmp_path / "test_news.db"
    # closing() guarantees the connection is released even if the DDL fails;
    # sqlite3's own context manager only manages the transaction, not close().
    with closing(sqlite3.connect(str(db))) as conn:
        conn.execute("""
            CREATE TABLE news_articles (
                url TEXT PRIMARY KEY,
                titel TEXT NOT NULL,
                summary TEXT,
                datum TEXT NOT NULL,
                source TEXT NOT NULL,
                ressort TEXT,
                tags TEXT,
                summary_embedding BLOB,
                embedding_model TEXT,
                fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        """)
        conn.commit()
    return db


# Two article dicts passed to upsert_articles by the persistence tests.
SAMPLE_ARTICLES = [
    {
        "url": "https://example.com/a",
        "titel": "Wohnungsbau",
        "summary": "Heute im Bundestag",
        "datum": "2026-04-28",
        "source": "tagesschau",
        "ressort": "inland",
        "tags": ["Wohnungsbau"],
    },
    {
        "url": "https://example.com/b",
        "titel": "Klima",
        "summary": "EU plant Klimaziele",
        "datum": "2026-04-28",
        "source": "tagesschau",
        "ressort": "ausland",
        "tags": ["Klima", "EU"],
    },
]


class TestUpsertArticles:
    """Tests for ``upsert_articles`` against a temporary SQLite database."""

    @staticmethod
    def _fetch_row(db_path, sql, params):
        """Run *sql* with *params* on *db_path* and return the first row.

        The connection is closed even when the query raises, so a failing
        test does not leave the temporary database file open.
        """
        with closing(sqlite3.connect(str(db_path))) as conn:
            return conn.execute(sql, params).fetchone()

    def test_inserts_new_articles(self, empty_db):
        """Upserting into an empty table reports every article as inserted."""
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert stats["inserted"] == 2
        assert stats["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        """Re-upserting known URLs reports updates and overwrites the title."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Re-run with the same URLs but a different titel.
        modified = [{**a, "titel": a["titel"] + " (neu)"} for a in SAMPLE_ARTICLES]
        stats = upsert_articles(modified, db_path=empty_db, embed=False)
        assert stats["updated"] == 2
        assert stats["inserted"] == 0
        # Verify the title was actually written to the database.
        row = self._fetch_row(
            empty_db,
            "SELECT titel FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        assert row is not None
        assert row[0].endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        """The tags list is stored as a JSON-encoded string."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        row = self._fetch_row(
            empty_db,
            "SELECT tags FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        assert row is not None
        tags = json.loads(row[0])
        assert tags == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        """A database path that does not exist yields all-zero stats."""
        stats = upsert_articles(
            SAMPLE_ARTICLES,
            db_path=tmp_path / "missing.db",
            embed=False,
        )
        assert stats == {"inserted": 0, "updated": 0, "embedded": 0}