Neue Tests in dieser Migration: - test_database.py (Merkliste-CRUD, Subscriptions, abgeordnetenwatch-Joins) - test_clustering.py (82% Coverage) - test_drucksache_typen.py (100%) - test_mail.py (86%) - test_monitoring.py (23 Tests) - test_abgeordnetenwatch.py (23 Tests, inkl. Drucksache-Extraction) - test_redline_parser.py (20 Tests fuer §INS§/§DEL§-Marker) - test_bug_regressions.py (PRAGMA, JWT-azp, CDU-PDF, PFLICHT-FRAKTIONEN, NRW-Titel) - test_embeddings_v3_v4.py (WRITE/READ-Pattern) - test_wahlprogramm_check.py (#128) - test_wahlprogramm_fetch.py (#138) - test_antrag/bewertung/abonnement_repository.py + test_llm_bewerter.py (DDD) - test_domain_behavior.py (5 Domain-Methoden boundary tests) - tests/e2e/test_ui.py (Playwright) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
138 lines
5.1 KiB
Python
138 lines
5.1 KiB
Python
"""Tests für LlmBewerter-Port und QwenBewerter-Adapter (ADR 0008).
|
|
|
|
Der Adapter wird mit einem Fake-Client getestet — kein Netzwerk, kein
|
|
``openai``-Paket. Retry-Semantik (Temperatur steigt um 0.1 pro Versuch)
|
|
ist hier explizit getestet, damit die Migration die Semantik nicht
|
|
still verändert.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import types
|
|
|
|
import pytest
|
|
|
|
from app.adapters.qwen_bewerter import QwenBewerter, _strip_markdown_fences
|
|
from app.ports.llm_bewerter import LlmBewerter, LlmRequest
|
|
|
|
|
|
def _run(coro):
|
|
return asyncio.get_event_loop().run_until_complete(coro)
|
|
|
|
|
|
def _make_fake_client(responses: list[str]):
|
|
"""Produziert einen Fake-OpenAI-Client, der pro Call einen Response aus
|
|
der Liste liefert und Metadaten (Temperatur) aufzeichnet."""
|
|
calls: list[dict] = []
|
|
|
|
class FakeCompletions:
|
|
async def create(self, **kwargs):
|
|
calls.append(dict(kwargs))
|
|
idx = len(calls) - 1
|
|
content = responses[min(idx, len(responses) - 1)]
|
|
return types.SimpleNamespace(
|
|
choices=[types.SimpleNamespace(
|
|
message=types.SimpleNamespace(content=content)
|
|
)]
|
|
)
|
|
|
|
class FakeChat:
|
|
completions = FakeCompletions()
|
|
|
|
class FakeClient:
|
|
chat = FakeChat()
|
|
|
|
return FakeClient(), calls
|
|
|
|
|
|
# ─── Strip-Fences ──────────────────────────────────────────────────────────
|
|
|
|
class TestStripMarkdownFences:
    """Checks for ``_strip_markdown_fences`` on fenced and unfenced JSON text."""

    def test_plain_json_unchanged(self):
        """Input without any fence comes back exactly as given."""
        raw = '{"a": 1}'
        assert _strip_markdown_fences(raw) == raw

    def test_json_fence(self):
        """A fence opened with the ``json`` language tag is removed."""
        fenced = '```json\n{"a": 1}\n```'
        assert _strip_markdown_fences(fenced) == '{"a": 1}'

    def test_plain_fence(self):
        """A fence without a language tag is removed."""
        fenced = '```\n{"a": 1}\n```'
        assert _strip_markdown_fences(fenced) == '{"a": 1}'
|
|
|
|
|
|
# ─── Protocol-Konformität ──────────────────────────────────────────────────
|
|
|
|
class TestProtocol:
    """Structural check of ``QwenBewerter`` against the ``LlmBewerter`` port."""

    def test_qwen_implements_llm_bewerter(self):
        """An adapter instance passes ``isinstance`` against ``LlmBewerter``."""
        # runtime_checkable Protocol: this only confirms that the method
        # ``bewerte`` exists on the adapter.
        adapter = QwenBewerter(api_key="x", base_url="y", client=object())
        assert isinstance(adapter, LlmBewerter)
|
|
|
|
|
|
# ─── QwenBewerter mit FakeClient ───────────────────────────────────────────
|
|
|
|
class TestQwenBewerterHappyPath:
    """Single-call success scenarios for ``QwenBewerter`` using the fake client."""

    def test_single_successful_call(self):
        """Valid JSON is parsed and returned after one call at temperature 0.3."""
        client, recorded = _make_fake_client(['{"gwoeScore": 7.0}'])
        bewerter = QwenBewerter(api_key="x", base_url="y", client=client)
        outcome = _run(
            bewerter.bewerte(LlmRequest(system_prompt="sys", user_prompt="usr"))
        )
        assert outcome == {"gwoeScore": 7.0}
        assert len(recorded) == 1
        assert recorded[0]["temperature"] == pytest.approx(0.3)

    def test_markdown_fence_is_stripped(self):
        """A reply wrapped in a ```json fence still parses to the inner object."""
        client, _ = _make_fake_client(['```json\n{"gwoeScore": 8.0}\n```'])
        bewerter = QwenBewerter(client=client)
        outcome = _run(bewerter.bewerte(LlmRequest("sys", "usr")))
        assert outcome == {"gwoeScore": 8.0}

    def test_passes_model_through(self):
        """The model named in the request is forwarded to the client call."""
        client, recorded = _make_fake_client(['{"a": 1}'])
        bewerter = QwenBewerter(client=client)
        wanted = LlmRequest("sys", "usr", model="qwen-turbo")
        _run(bewerter.bewerte(wanted))
        assert recorded[0]["model"] == "qwen-turbo"
|
|
|
|
|
|
class TestQwenBewerterRetries:
    """Retry behaviour of ``QwenBewerter`` when the reply is not valid JSON."""

    def test_retry_raises_temperature(self):
        """On a JSON parse error the temperature rises by 0.1 per attempt."""
        scripted = [
            "nicht valides JSON",
            "immer noch kaputt",
            '{"gwoeScore": 6.0}',  # third attempt succeeds
        ]
        client, recorded = _make_fake_client(scripted)
        bewerter = QwenBewerter(client=client)
        outcome = _run(bewerter.bewerte(LlmRequest("sys", "usr", max_retries=3)))
        assert outcome == {"gwoeScore": 6.0}
        assert len(recorded) == 3
        # One expected temperature per recorded attempt, in call order.
        for attempt, expected in zip(recorded, (0.3, 0.4, 0.5)):
            assert attempt["temperature"] == pytest.approx(expected)

    def test_exhausted_retries_raise(self):
        """If every attempt yields invalid JSON, ``JSONDecodeError`` propagates."""
        client, _ = _make_fake_client(["kaputt"] * 3)
        bewerter = QwenBewerter(client=client)
        pending = LlmRequest("sys", "usr", max_retries=3)
        with pytest.raises(json.JSONDecodeError):
            _run(bewerter.bewerte(pending))

    def test_single_retry_is_respected(self):
        """max_retries=1 means exactly one attempt and no retry."""
        client, recorded = _make_fake_client(["kaputt"])
        bewerter = QwenBewerter(client=client)
        with pytest.raises(json.JSONDecodeError):
            _run(bewerter.bewerte(LlmRequest("sys", "usr", max_retries=1)))
        assert len(recorded) == 1
|
|
|
|
|
|
class TestLlmRequestDefaults:
    """Pins the default field values of ``LlmRequest``."""

    def test_defaults_match_legacy_analyzer(self):
        """A request built from only the two prompts carries the expected defaults."""
        request = LlmRequest("s", "u")
        expected_defaults = {
            "model": "qwen-plus",
            "max_retries": 3,
            "max_tokens": 4000,
            "base_temperature": 0.3,
        }
        # Checked in the same order as the fields are listed above.
        for field_name, expected in expected_defaults.items():
            assert getattr(request, field_name) == expected
|