test(#134): clustering.py Coverage 82.3% → 99.3%

- TestUnionFindRankSwap: rank-Asymmetrie-Branch (Line 69)
- TestLoadAssessmentItems: tmp-DB mit korrekten + kaputten Embeddings, bundesland-Filter, vollstaendiges Item-Schema
- TestBuildHierarchySubclusters:
  - max_cluster_size=3 zwingt grossen Cluster zu sub-clustern
  - kleiner Cluster bekommt subclusters=None

Total Coverage: 49.9% → 50.4% (50%-Marke ueberschritten), 718 → 724 Tests.
2026-04-28 11:02:58 +02:00 · 2026-04-28 11:02:58 +02:00 · 581d1591b8
commit 581d1591b8
parent 999926b5f3
1 changed files with 153 additions and 0 deletions
--- a/tests/test_clustering.py
+++ b/tests/test_clustering.py
@ -436,3 +436,156 @@ class TestFindSimilarAssessments:
            result = run(clustering.find_similar_assessments(items[0]["drucksache"]))
        assert result == []
 # ─── Coverage-Backfill (#134) ────────────────────────────────────────────────
 class TestUnionFindRankSwap:
    """Cover the union-by-rank branch where the lower-rank root is re-parented.

    When ``rank[ra] < rank[rb]``, ``parent[ra]`` must point at ``rb``
    (line 69 of the clustering module).
    """

    def test_smaller_rank_attaches_to_larger(self):
        """A rank-0 singleton merged into a taller tree must not become its root."""
        from app.clustering import UnionFind

        big = UnionFind(8)
        # Two pair-merges followed by merging the pairs: every step is a rank
        # tie, so the surviving root ends up with rank 2.
        big.union(0, 1)
        big.union(2, 3)
        big.union(0, 2)
        root_before = big.find(0)

        # Node 4 is still a singleton (rank 0). Passing it as the first
        # argument makes it the lower-rank side of the union.
        big.union(4, 0)

        # 4 joined the set of 0 ...
        assert big.find(4) == big.find(0)
        # ... and the taller tree kept its root, i.e. the singleton was hung
        # underneath it rather than the other way round.
        assert big.find(0) == root_before
 class TestLoadAssessmentItems:
    """Exercise the async ``load_assessment_items`` loader against a temporary SQLite DB."""

    def _build_db(self, tmp_path):
        """Create ``clust.db`` below *tmp_path* and return its path.

        The ``assessments`` table is filled with three rows:

        * ``18/1`` (NRW) with a JSON-encoded embedding,
        * ``18/2`` (NRW) whose embedding blob is not valid JSON,
        * ``8/1`` (MV) with a JSON-encoded embedding, used by the
          bundesland-filter test.
        """
        import sqlite3
        import json as _j

        db_path = tmp_path / "clust.db"
        emb_ok = _j.dumps([0.1, 0.2, 0.3]).encode()
        rows = [
            # Valid embedding
            ("18/1", "T1", '["CDU"]', "2026-04-01", "NRW",
             7.0, "x", "Empfohlen", "+", '["Klima"]', emb_ok),
            # Broken embedding (invalid JSON)
            ("18/2", "T2", '["SPD"]', "2026-04-02", "NRW",
             5.0, "y", "Empfohlen", "+", '["Klima"]', b"not-json"),
            # Different bundesland (for the bundesland filter)
            ("8/1", "T3", '["AfD"]', "2026-04-03", "MV",
             3.0, "z", "Ablehnen", "-", "[]", emb_ok),
        ]

        conn = sqlite3.connect(str(db_path))
        # try/finally so the handle is released even if a statement fails;
        # sqlite3's own context manager only commits/rolls back, it does not close.
        try:
            conn.execute("""
            CREATE TABLE assessments (
                drucksache TEXT PRIMARY KEY, title TEXT,
                fraktionen TEXT, datum TEXT, bundesland TEXT,
                gwoe_score REAL, link TEXT,
                empfehlung TEXT, empfehlung_symbol TEXT,
                themen TEXT, summary_embedding BLOB
            )
        """)
            conn.executemany(
                "INSERT INTO assessments VALUES (?,?,?,?,?,?,?,?,?,?,?)",
                rows,
            )
            conn.commit()
        finally:
            conn.close()
        return db_path

    def _load_items(self, tmp_path, monkeypatch, **kwargs):
        """Build the temp DB, point ``settings.db_path`` at it and run the loader.

        Keyword arguments are forwarded to ``clustering.load_assessment_items``.
        """
        from app.config import settings
        from app import clustering

        db = self._build_db(tmp_path)
        monkeypatch.setattr(settings, "db_path", str(db))
        return run(clustering.load_assessment_items(**kwargs))

    def test_loads_only_valid_embeddings(self, tmp_path, monkeypatch):
        """Rows whose embedding cannot be decoded are left out of the result."""
        items = self._load_items(tmp_path, monkeypatch)
        ids = sorted(i["drucksache"] for i in items)
        # 18/2 has a broken embedding and is skipped
        assert "18/2" not in ids
        assert "18/1" in ids
        assert "8/1" in ids

    def test_bundesland_filter(self, tmp_path, monkeypatch):
        """With ``bundesland="NRW"`` only the valid NRW row comes back."""
        items = self._load_items(tmp_path, monkeypatch, bundesland="NRW")
        ids = [i["drucksache"] for i in items]
        assert ids == ["18/1"]

    def test_loaded_item_fields_present(self, tmp_path, monkeypatch):
        """A loaded item carries every key of the expected item schema."""
        items = self._load_items(tmp_path, monkeypatch, bundesland="NRW")
        assert items
        item = items[0]
        expected_keys = ("drucksache", "title", "fraktionen", "datum", "link",
                         "bundesland", "gwoe_score", "empfehlung",
                         "empfehlung_symbol", "themen", "embedding")
        # Collect all gaps so a failure lists every missing key at once.
        missing = [key for key in expected_keys if key not in item]
        assert not missing, f"missing keys: {missing}"
 class TestBuildHierarchySubclusters:
    """A cluster larger than ``max_cluster_size`` gets sub-clustered
    (lines 256-262); a cluster within the limit gets ``subclusters=None``."""

    @staticmethod
    def _build(items, **params):
        """Run ``build_hierarchy(**params)`` with the item loader patched to yield *items*."""
        from app import clustering
        from unittest.mock import patch

        async def fake_load(bundesland=None):
            return items

        with patch.object(clustering, "load_assessment_items", side_effect=fake_load):
            return run(clustering.build_hierarchy(**params))

    def test_large_cluster_gets_subclustered(self):
        """Six near-identical items with ``max_cluster_size=3`` yield sub-clusters."""
        # Six almost identical embeddings -> one big cluster that must be split.
        items = [
            dict(
                _make_items(1)[0],
                drucksache=f"18/{idx}",
                embedding=[1.0 + 0.01 * idx, 0.0, 0.0],
            )
            for idx in range(6)
        ]

        # max_cluster_size=3 forces sub-clustering
        hierarchy = self._build(
            items,
            threshold=0.95, max_cluster_size=3, subcluster_threshold=0.999,
        )

        assert hierarchy["clusters"]
        # At least one cluster must carry sub-clusters
        assert any(cluster.get("subclusters") for cluster in hierarchy["clusters"])

    def test_small_cluster_has_subclusters_none(self):
        """Two identical items under a generous size limit are not sub-clustered."""
        pair = _make_items(2)
        # Give both the same embedding so they land in one cluster
        for slot in (0, 1):
            pair[slot]["embedding"] = [1.0, 0.0, 0.0]

        hierarchy = self._build(pair, threshold=0.5, max_cluster_size=10)

        assert all(cluster["subclusters"] is None for cluster in hierarchy["clusters"])