gwoe-antragspruefer/tests/test_news_aggregator.py

"""Tests fuer app.news_aggregator (#170 Phase 1).

Testet Parser + DB-Persistierung gegen kontrollierte Fixtures, ohne
Live-HTTP-Calls (Tagesschau-API + Bundestag-RSS werden gemockt).
"""
from __future__ import annotations

import json
import sqlite3
from pathlib import Path
from unittest.mock import patch

import pytest

from app.news_aggregator import (
    _parse_rss_date,
    _strip_html,
    fetch_rss,
    fetch_tagesschau,
    upsert_articles,
)


# ─────────────────────────────────────────────────────────────────────────────
# Helper
# ─────────────────────────────────────────────────────────────────────────────


class TestStripHtml:
    """Checks for the ``_strip_html`` text-cleaning helper."""

    def test_removes_tags(self):
        """Markup is dropped while the enclosed text is kept."""
        cleaned = _strip_html("<p>Hello <b>world</b></p>")
        assert cleaned == "Hello world"

    def test_decodes_cdata(self):
        """The payload of a CDATA section is still present in the output."""
        cleaned = _strip_html("<![CDATA[Test]]>")
        assert "Test" in cleaned

    def test_decodes_entities(self):
        """HTML entities are turned back into their literal characters."""
        cleaned = _strip_html("a &amp; b")
        assert cleaned == "a & b"

    def test_collapses_whitespace(self):
        """Runs of spaces and newlines become a single space."""
        cleaned = _strip_html("<p>a   b\n c</p>")
        assert cleaned == "a b c"

    def test_empty(self):
        """An empty string is returned unchanged."""
        cleaned = _strip_html("")
        assert cleaned == ""


class TestParseRssDate:
    """Checks for the ``_parse_rss_date`` date-conversion helper."""

    def test_rfc822_to_iso(self):
        """An RFC-822 timestamp yields a string starting with the ISO date."""
        iso_value = _parse_rss_date("Tue, 28 Apr 2026 10:45:12 GMT")
        assert iso_value.startswith("2026-04-28")

    def test_invalid_returns_empty(self):
        """Unparseable and empty inputs both map to the empty string."""
        for raw in ("garbage", ""):
            assert _parse_rss_date(raw) == ""


# ─────────────────────────────────────────────────────────────────────────────
# fetch_tagesschau (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────


# Canned Tagesschau API response (UTF-8 encoded JSON bytes) with three news
# items; the last one deliberately lacks a shareURL.
SAMPLE_TAGESSCHAU_JSON = bytes(
    json.dumps(
        dict(
            news=[
                dict(
                    title="Bundestag berät über Wohnungsbau",
                    firstSentence="Der Bundestag hat heute über das neue Wohnungsbau-Gesetz beraten.",
                    shareURL="https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html",
                    date="2026-04-28T10:00:00.000+02:00",
                    ressort="inland",
                    tags=[dict(tag="Wohnungsbau"), dict(tag="Bundestag")],
                ),
                dict(
                    title="EU-Kommission stellt Klimapaket vor",
                    firstSentence="Die EU plant ehrgeizige Klimaziele.",
                    shareURL="https://www.tagesschau.de/ausland/eu-klima-100.html",
                    date="2026-04-28T11:00:00.000+02:00",
                    ressort="ausland",
                    tags=[dict(tag="Klima"), dict(tag="EU")],
                ),
                # This item has no shareURL and should be skipped.
                dict(
                    title="Kein Link",
                    firstSentence="Skip mich",
                ),
            ],
        )
    ),
    "utf-8",
)


class TestFetchTagesschau:
    """Exercise ``fetch_tagesschau`` against a canned API payload.

    ``app.news_aggregator._http_get`` is patched in every test, so no real
    HTTP request is made.
    """

    def test_parses_news_array(self):
        """Items carrying a ``shareURL`` are mapped onto the article fields."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland"])
        # Deduplication by URL -> 2 unique articles
        assert len(articles) == 2
        first = articles[0]
        assert first["url"] == "https://www.tagesschau.de/inland/bundestag-wohnungsbau-100.html"
        assert first["titel"] == "Bundestag berät über Wohnungsbau"
        assert "Wohnungsbau" in first["summary"]
        assert first["source"] == "tagesschau"
        assert first["ressort"] == "inland"
        assert "Wohnungsbau" in first["tags"]

    def test_skips_items_without_link(self):
        """The payload item without a ``shareURL`` does not show up."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland"])
        # all() is True for an empty list, so pin the count first to keep the
        # URL check from passing vacuously.
        assert len(articles) == 2
        assert all(a["url"] for a in articles)
        assert "Kein Link" not in [a["titel"] for a in articles]

    def test_returns_empty_on_http_error(self):
        """A ``None`` response from ``_http_get`` yields an empty list."""
        with patch("app.news_aggregator._http_get", return_value=None):
            articles = fetch_tagesschau(ressorts=["inland"])
        assert articles == []

    def test_dedup_across_ressorts(self):
        """An item that appears in two ressorts is delivered only once."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_TAGESSCHAU_JSON):
            articles = fetch_tagesschau(ressorts=["inland", "ausland"])
        urls = [a["url"] for a in articles]
        # Without this guard an empty result would satisfy the uniqueness
        # check below (0 == 0) and hide a broken fetch.
        assert urls, "expected articles from the mocked payload"
        assert len(urls) == len(set(urls))


# ─────────────────────────────────────────────────────────────────────────────
# fetch_rss (mocked HTTP)
# ─────────────────────────────────────────────────────────────────────────────


# Canned RSS 2.0 feed (UTF-8 bytes) with two items: the first wraps title and
# description in CDATA sections, the second uses plain text.
SAMPLE_RSS = "\n".join([
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<rss version="2.0"><channel><title>BT Aktuell</title>',
    "<item>",
    "<title><![CDATA[Bundestag berät Antrag zum Wohnungsbau]]></title>",
    "<link>https://www.bundestag.de/dokumente/textarchiv/2026/kw18-wohnungsbau-1170388</link>",
    "<description><![CDATA[Der Bundestag hat heute den Antrag zum Wohnungsbau beraten.]]></description>",
    "<pubDate>Tue, 28 Apr 2026 10:45:12 GMT</pubDate>",
    "</item>",
    "<item>",
    "<title>Antrag zur Klimapolitik</title>",
    "<link>https://www.bundestag.de/klima</link>",
    "<description>Klimaschutz im Bundestag</description>",
    "<pubDate>Mon, 27 Apr 2026 10:00:00 GMT</pubDate>",
    "</item>",
    "</channel></rss>",
]).encode("utf-8")


class TestFetchRss:
    """Exercise ``fetch_rss`` against canned feed bytes.

    ``app.news_aggregator._http_get`` is patched in every test, so no real
    HTTP request is made.
    """

    def test_parses_rss_items(self):
        """Both feed items are returned with the expected article fields."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS):
            articles = fetch_rss("bundestag-aktuell", "https://example.com/rss")
        assert len(articles) == 2
        first = articles[0]
        assert "Wohnungsbau" in first["titel"]
        assert first["url"].startswith("https://www.bundestag.de")
        assert first["source"] == "bundestag-aktuell"
        assert first["datum"].startswith("2026-04-28")
        assert "Bundestag" in first["summary"]

    def test_strips_cdata_and_html(self):
        """No CDATA opener is left in any title or summary."""
        with patch("app.news_aggregator._http_get", return_value=SAMPLE_RSS):
            articles = fetch_rss("bundestag-aktuell", "https://example.com/rss")
        # Pin the count first: with an empty result the loop below would
        # execute no assertions and the test would pass vacuously.
        assert len(articles) == 2
        for a in articles:
            assert "<![CDATA[" not in a["titel"]
            assert "<![CDATA[" not in a["summary"]

    def test_empty_on_http_error(self):
        """A ``None`` response from ``_http_get`` yields an empty list."""
        with patch("app.news_aggregator._http_get", return_value=None):
            articles = fetch_rss("x", "https://example.com/rss")
        assert articles == []

    def test_skips_items_without_title_or_link(self):
        """Items missing either the title or the link are dropped."""
        bad = b"""<?xml version="1.0"?><rss><channel>
        <item><title>Nur Titel</title></item>
        <item><link>nur-link</link></item>
        </channel></rss>"""
        with patch("app.news_aggregator._http_get", return_value=bad):
            articles = fetch_rss("x", "https://example.com/rss")
        assert articles == []


# ─────────────────────────────────────────────────────────────────────────────
# upsert_articles
# ─────────────────────────────────────────────────────────────────────────────


@pytest.fixture
def empty_db(tmp_path: Path) -> Path:
    """Create a SQLite file holding an empty ``news_articles`` table.

    The database lives in pytest's per-test ``tmp_path`` directory.

    Returns:
        Path to the freshly created database file.
    """
    db = tmp_path / "test_news.db"
    conn = sqlite3.connect(str(db))
    try:
        conn.execute("""
            CREATE TABLE news_articles (
                url TEXT PRIMARY KEY,
                titel TEXT NOT NULL,
                summary TEXT,
                datum TEXT NOT NULL,
                source TEXT NOT NULL,
                ressort TEXT,
                tags TEXT,
                summary_embedding BLOB,
                embedding_model TEXT,
                fetched_at TEXT NOT NULL DEFAULT (datetime('now'))
            )
        """)
        conn.commit()
    finally:
        # Close the handle even if table creation fails, so a broken setup
        # does not leave an open connection on the temporary file.
        conn.close()
    return db


# Two already-parsed article dicts used as input for the upsert tests.
SAMPLE_ARTICLES = [
    dict(
        url="https://example.com/a",
        titel="Wohnungsbau",
        summary="Heute im Bundestag",
        datum="2026-04-28",
        source="tagesschau",
        ressort="inland",
        tags=["Wohnungsbau"],
    ),
    dict(
        url="https://example.com/b",
        titel="Klima",
        summary="EU plant Klimaziele",
        datum="2026-04-28",
        source="tagesschau",
        ressort="ausland",
        tags=["Klima", "EU"],
    ),
]


class TestUpsertArticles:
    """Exercise ``upsert_articles`` against a temporary SQLite database.

    Every call passes ``embed=False``.
    """

    @staticmethod
    def _query_one(db_path, sql, params=()):
        """Run *sql* against *db_path* and return the first row, or ``None``.

        The connection is closed even when the query raises, so a failing
        verification step does not leak an open handle.
        """
        conn = sqlite3.connect(str(db_path))
        try:
            return conn.execute(sql, params).fetchone()
        finally:
            conn.close()

    def test_inserts_new_articles(self, empty_db):
        """Two unseen URLs are reported as two inserts and no updates."""
        stats = upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        assert stats["inserted"] == 2
        assert stats["updated"] == 0

    def test_updates_existing_articles(self, empty_db):
        """Re-submitting known URLs counts as updates and rewrites the title."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        # Re-run with the same URLs but a different titel
        modified = [{**a, "titel": a["titel"] + " (neu)"} for a in SAMPLE_ARTICLES]
        stats = upsert_articles(modified, db_path=empty_db, embed=False)
        assert stats["updated"] == 2
        assert stats["inserted"] == 0
        # Verify the title was updated
        row = self._query_one(
            empty_db,
            "SELECT titel FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        # Fail with a clear assertion instead of a TypeError on row[0].
        assert row is not None
        assert row[0].endswith("(neu)")

    def test_persists_tags_as_json(self, empty_db):
        """The ``tags`` column holds the tag list serialised as JSON."""
        upsert_articles(SAMPLE_ARTICLES, db_path=empty_db, embed=False)
        row = self._query_one(
            empty_db,
            "SELECT tags FROM news_articles WHERE url=?",
            (SAMPLE_ARTICLES[0]["url"],),
        )
        # Fail with a clear assertion instead of a TypeError on row[0].
        assert row is not None
        tags = json.loads(row[0])
        assert tags == ["Wohnungsbau"]

    def test_missing_db_returns_zeros(self, tmp_path):
        """A database path that does not exist yields all-zero stats."""
        stats = upsert_articles(SAMPLE_ARTICLES,
                                 db_path=tmp_path / "missing.db", embed=False)
        assert stats == {"inserted": 0, "updated": 0, "embedded": 0}