267 lines
11 KiB
Python
267 lines
11 KiB
Python
|
|
"""Tests für das WRITE/READ-Pattern der v3→v4-Embedding-Migration (ADR 0006, Issue #123).
|
||
|
|
|
||
|
|
Alle Tests verwenden eine gestubbte SQLite-In-Memory-DB und mocken den
|
||
|
|
OpenAI-Client — kein echter API-Aufruf findet statt.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import sqlite3
|
||
|
|
import sys
|
||
|
|
import types
|
||
|
|
from pathlib import Path
|
||
|
|
from unittest.mock import MagicMock, patch
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Helpers — gestubbte DB und Fake-Embeddings
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def _make_db(path: str) -> sqlite3.Connection:
    """Create a SQLite database holding an empty ``chunks`` table.

    The table includes a ``model`` column that defaults to
    ``'text-embedding-v3'``; an index ``idx_chunks_model`` is created on it.

    Args:
        path: Database location passed straight to ``sqlite3.connect``.

    Returns:
        The open connection. Closing it is the caller's responsibility.
    """
    create_table_sql = """
        CREATE TABLE chunks (
            id INTEGER PRIMARY KEY,
            programm_id TEXT NOT NULL,
            partei TEXT NOT NULL,
            typ TEXT NOT NULL,
            seite INTEGER,
            text TEXT NOT NULL,
            embedding BLOB NOT NULL,
            bundesland TEXT,
            model TEXT NOT NULL DEFAULT 'text-embedding-v3',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """
    create_index_sql = "CREATE INDEX idx_chunks_model ON chunks(model)"

    db = sqlite3.connect(path)
    # Table first, then the index that refers to it.
    for statement in (create_table_sql, create_index_sql):
        db.execute(statement)
    db.commit()
    return db
|
||
|
|
|
||
|
|
|
||
|
|
def _vec(seed: float, dim: int = 4) -> list[float]:
    """Return a unit-length vector whose i-th component is proportional to ``seed * (i + 1)``.

    The dimension is kept small so the tests stay fast.

    Note: every component is a multiple of ``seed`` and the result is
    normalised, so positive seeds of different magnitude give the same
    direction (up to floating-point rounding). A ``seed`` of 0 with
    ``dim > 0`` has zero length and raises ``ZeroDivisionError``.

    Args:
        seed: Scale factor applied to the raw components before normalising.
        dim: Number of components.

    Returns:
        The normalised components as a list of floats.
    """
    raw = [seed * position for position in range(1, dim + 1)]
    length = sum(component * component for component in raw) ** 0.5
    return [component / length for component in raw]
|
||
|
|
|
||
|
|
|
||
|
|
def _insert_chunk(conn, programm_id, partei, typ, text, model, seite=1, bundesland=None):
    """Insert a single row into ``chunks`` and commit immediately.

    The embedding is always ``_vec(0.9)``, serialised as JSON and encoded
    to bytes before being stored in the ``embedding`` column.

    Args:
        conn: Open connection to a database that has the ``chunks`` table.
        programm_id: Value for the ``programm_id`` column.
        partei: Value for the ``partei`` column.
        typ: Value for the ``typ`` column.
        text: Value for the ``text`` column.
        model: Value for the ``model`` column.
        seite: Value for the ``seite`` column.
        bundesland: Value for the ``bundesland`` column.
    """
    embedding_blob = json.dumps(_vec(0.9)).encode()
    insert_sql = (
        "INSERT INTO chunks (programm_id, partei, typ, seite, text, embedding, bundesland, model) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?)"
    )
    row = (programm_id, partei, typ, seite, text, embedding_blob, bundesland, model)
    conn.execute(insert_sql, row)
    conn.commit()
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Test 1: Query mit aktivem READ-Modell findet nur v4-Chunks, ignoriert v3
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_query_filters_by_read_model(tmp_path, monkeypatch):
    """With READ set to v4, ``find_relevant_chunks`` must return the v4 chunk and no v3 chunk."""
    db_file = tmp_path / "embeddings.db"
    seed_conn = _make_db(str(db_file))

    # One chunk per model, otherwise identical metadata.
    seeded = (
        ("Klimaschutz ist unsere Priorität v3", "text-embedding-v3"),
        ("Klimaschutz ist unsere Priorität v4", "text-embedding-v4"),
    )
    for chunk_text, chunk_model in seeded:
        _insert_chunk(seed_conn, "spd-nrw-2022", "SPD", "wahlprogramm",
                      chunk_text, chunk_model)
    seed_conn.close()

    # The query embedding equals the stored embedding of every seeded chunk.
    query_vec = _vec(0.9)

    def fake_create_embedding(text, model=None):
        return query_vec

    import app.embeddings as emb_mod
    overrides = {
        "EMBEDDINGS_DB": db_file,
        "EMBEDDING_MODEL_READ": "text-embedding-v4",
        "create_embedding": fake_create_embedding,
    }
    for attr_name, replacement in overrides.items():
        monkeypatch.setattr(emb_mod, attr_name, replacement)

    hits = emb_mod.find_relevant_chunks("Klimaschutz", min_similarity=0.0)
    returned_texts = [hit["text"] for hit in hits]
    assert any("v4" in t for t in returned_texts), "v4-Chunk muss im Ergebnis sein"
    assert all("v3" not in t for t in returned_texts), "v3-Chunk darf bei READ=v4 nicht zurückgegeben werden"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Test 2: index_programm schreibt in WRITE-Modell
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_index_programm_writes_to_write_model(tmp_path, monkeypatch):
    """Rows written by ``index_programm`` must carry the configured ``EMBEDDING_MODEL``."""
    import app.embeddings as emb_mod

    db_file = tmp_path / "embeddings.db"
    # Only the schema is needed up front; the table starts empty.
    _make_db(str(db_file)).close()

    pdf_root = tmp_path / "pdfs"
    pdf_root.mkdir()

    def fake_extract(path):
        # A single fake page stands in for the real PDF extraction.
        return [(1, "Gemeinwohl Solidarität Nachhaltigkeit " * 10)]

    def fake_create_embedding(text, model=None):
        return _vec(0.5)

    monkeypatch.setattr(emb_mod, "EMBEDDINGS_DB", db_file)
    monkeypatch.setattr(emb_mod, "EMBEDDING_MODEL", "text-embedding-v4")
    monkeypatch.setattr(emb_mod, "extract_text_with_pages", fake_extract)
    monkeypatch.setattr(emb_mod, "create_embedding", fake_create_embedding)

    # The PDF only has to exist on disk; its content is never parsed here
    # because the extraction function is stubbed above.
    (pdf_root / "spd-nrw-2022.pdf").write_bytes(b"%PDF-1.4 fake")

    indexed = emb_mod.index_programm("spd-nrw-2022", pdf_root)
    assert indexed > 0, "index_programm muss mindestens einen Chunk indexieren"

    check_conn = sqlite3.connect(str(db_file))
    stored = check_conn.execute(
        "SELECT model FROM chunks WHERE programm_id='spd-nrw-2022'"
    ).fetchall()
    check_conn.close()

    assert stored, "Es müssen Rows in der DB sein"
    for (stored_model,) in stored:
        assert stored_model == "text-embedding-v4", f"Gespeichertes Modell sollte text-embedding-v4 sein, ist {stored_model!r}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Test 3: READ-Wechsel — neuer Chunk nach Switch nur im neuen Modell-Raum
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_read_switch_sees_only_new_model_chunks(tmp_path, monkeypatch):
    """After switching READ to v4, only the single v4 row is returned, none of the v3 rows."""
    db_file = tmp_path / "embeddings.db"
    seed_conn = _make_db(str(db_file))

    # Three legacy v3 rows followed by one v4 row produced by a re-index.
    seeded = [
        (f"Wirtschaft und Arbeit v3 chunk {n}", "text-embedding-v3")
        for n in range(3)
    ]
    seeded.append(("Wirtschaft und Arbeit v4 chunk 0", "text-embedding-v4"))
    for chunk_text, chunk_model in seeded:
        _insert_chunk(seed_conn, "cdu-nrw-2022", "CDU", "wahlprogramm",
                      chunk_text, chunk_model)
    seed_conn.close()

    def fake_create_embedding(text, model=None):
        return _vec(0.7)

    import app.embeddings as emb_mod
    monkeypatch.setattr(emb_mod, "EMBEDDINGS_DB", db_file)
    monkeypatch.setattr(emb_mod, "EMBEDDING_MODEL_READ", "text-embedding-v4")
    monkeypatch.setattr(emb_mod, "create_embedding", fake_create_embedding)

    hits = emb_mod.find_relevant_chunks("Wirtschaft", min_similarity=0.0)
    assert len(hits) == 1
    first_hit = hits[0]
    assert "v4" in first_hit["text"]
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Test 4: Gemischte DB — Query filtert modell-korrekt
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_mixed_db_query_filtered_correctly(tmp_path, monkeypatch):
    """In a database with v3 and v4 rows for several parties, a query returns only READ-model rows."""
    db_file = tmp_path / "embeddings.db"
    seed_conn = _make_db(str(db_file))

    parties = ["SPD", "CDU", "GRÜNE"]
    model_variants = (
        ("v3", "text-embedding-v3"),
        ("v4", "text-embedding-v4"),
    )
    # Each party gets one v3 row and one v4 row under the same programme id.
    for partei in parties:
        programme = f"{partei.lower()}-prog"
        for tag, chunk_model in model_variants:
            _insert_chunk(seed_conn, programme, partei, "wahlprogramm",
                          f"{partei} Programm {tag} Text", chunk_model)
    seed_conn.close()

    def fake_create_embedding(text, model=None):
        return _vec(0.8)

    import app.embeddings as emb_mod
    monkeypatch.setattr(emb_mod, "EMBEDDINGS_DB", db_file)
    monkeypatch.setattr(emb_mod, "EMBEDDING_MODEL_READ", "text-embedding-v4")
    monkeypatch.setattr(emb_mod, "create_embedding", fake_create_embedding)

    hits = emb_mod.find_relevant_chunks("Programm", min_similarity=0.0, top_k=20)
    for hit in hits:
        # The model is not checked directly on the result here; the "v4"
        # marker embedded in the seeded text identifies the model instead.
        assert "v4" in hit["text"], f"Unerwarteter v3-Chunk: {hit['text']!r}"
    assert len(hits) == len(parties), "Je eine v4-Row pro Partei erwartet"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Test 5: Index DELETE löscht nur WRITE-Modell-Rows — v3-Rows bleiben
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_reindex_deletes_only_write_model_rows(tmp_path, monkeypatch):
    """Re-indexing via ``index_programm`` replaces the old v4 row and leaves the v3 row in place."""
    db_file = tmp_path / "embeddings.db"
    seed_conn = _make_db(str(db_file))

    pre_existing = (
        # v3 row that must survive the re-index.
        ("Alte v3 Zeile bleibt stehen", "text-embedding-v3"),
        # v4 row that the re-index is expected to replace.
        ("Alte v4 Zeile wird gelöscht", "text-embedding-v4"),
    )
    for chunk_text, chunk_model in pre_existing:
        _insert_chunk(seed_conn, "spd-nrw-2022", "SPD", "wahlprogramm",
                      chunk_text, chunk_model)
    seed_conn.close()

    import app.embeddings as emb_mod
    pdf_root = tmp_path / "pdfs"
    pdf_root.mkdir()
    (pdf_root / "spd-nrw-2022.pdf").write_bytes(b"%PDF-1.4 fake")

    def fake_extract(path):
        return [(1, "Neue v4 Zeile nach Reindex " * 10)]

    def fake_create_embedding(text, model=None):
        return _vec(0.6)

    monkeypatch.setattr(emb_mod, "EMBEDDINGS_DB", db_file)
    monkeypatch.setattr(emb_mod, "EMBEDDING_MODEL", "text-embedding-v4")
    monkeypatch.setattr(emb_mod, "extract_text_with_pages", fake_extract)
    monkeypatch.setattr(emb_mod, "create_embedding", fake_create_embedding)

    emb_mod.index_programm("spd-nrw-2022", pdf_root)

    check_conn = sqlite3.connect(str(db_file))
    surviving_v3 = check_conn.execute(
        "SELECT text FROM chunks WHERE programm_id='spd-nrw-2022' AND model='text-embedding-v3'"
    ).fetchall()
    current_v4 = check_conn.execute(
        "SELECT text FROM chunks WHERE programm_id='spd-nrw-2022' AND model='text-embedding-v4'"
    ).fetchall()
    check_conn.close()

    assert len(surviving_v3) == 1, "v3-Row muss erhalten bleiben"
    assert "Alte v3 Zeile" in surviving_v3[0][0]
    assert not any("Alte v4 Zeile" in row[0] for row in current_v4), "Alte v4-Row muss ersetzt worden sein"
    assert any("Neue v4 Zeile" in row[0] for row in current_v4), "Neue v4-Rows müssen vorhanden sein"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Test 6: Query-Embedding nutzt READ-Modell als model-Parameter
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_query_embedding_uses_read_model(tmp_path, monkeypatch):
    """``find_relevant_chunks`` must pass ``EMBEDDING_MODEL_READ`` as ``model`` to ``create_embedding``."""
    db_file = tmp_path / "embeddings.db"
    seed_conn = _make_db(str(db_file))
    _insert_chunk(seed_conn, "spd-nrw-2022", "SPD", "wahlprogramm",
                  "Solidarität v4", "text-embedding-v4")
    seed_conn.close()

    import app.embeddings as emb_mod
    monkeypatch.setattr(emb_mod, "EMBEDDINGS_DB", db_file)
    monkeypatch.setattr(emb_mod, "EMBEDDING_MODEL_READ", "text-embedding-v4")

    # Records the ``model`` argument of every create_embedding call, in order.
    seen_models = []

    def recording_create_embedding(text, model=None):
        seen_models.append(model)
        return _vec(0.9)

    monkeypatch.setattr(emb_mod, "create_embedding", recording_create_embedding)

    emb_mod.find_relevant_chunks("Solidarität", min_similarity=0.0)

    assert seen_models, "create_embedding muss aufgerufen worden sein"
    first_model = seen_models[0]
    assert first_model == "text-embedding-v4", (
        f"Query-Embedding muss mit READ-Modell erzeugt werden, war aber {first_model!r}"
    )
|