feat(#178 Folge): Thread-Auto-Splitter + Quality-Audit-Skript
- _split_into_thread_posts() splittet zu lange Bodies an Satzgrenzen in mehrere Posts ≤ max_chars (Default 280). Greedy: möglichst viele Sätze pro Post. Hashtags am Ende bleiben erhalten. - generate_draft(style='thread') ruft den Splitter auf, wenn das LLM weniger als 3 Posts oder Posts > 290 chars liefert. - 7 Unit-Tests fuer den Splitter (test_thread_splitter.py). - scripts/pm-quality-audit.sh: prueft alle PM-Drafts gegen Verbotsliste (GWÖ-Score, Matrix-Codes, Floskeln) + Wortzahl + Absatzzahl + Post-Laengen. Markdown-Report-Output. Audit von 23 Drafts: 4/23 ohne Auffaelligkeit; Hauptbefund: PMs haeufig zu kurz, Threads splitten ohne Auto-Splitter nicht zuverlaessig — Splitter behebt das. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
62636b5a78
commit
ba1f104c8e
@ -287,6 +287,42 @@ def _find_existing_draft(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _split_into_thread_posts(body: str, max_chars: int = 280) -> str:
|
||||||
|
"""Splittet einen langen Thread-Body an Satzgrenzen in mehrere Posts.
|
||||||
|
|
||||||
|
Wenn das LLM die Posts nicht selbst mit ``\\n\\n`` getrennt hat,
|
||||||
|
wird der Body zwischen Sätzen aufgeteilt — Greedy: möglichst
|
||||||
|
viele Sätze pro Post, ohne ``max_chars`` zu überschreiten.
|
||||||
|
|
||||||
|
Hashtags am Ende werden zusammen mit dem letzten Post gehalten.
|
||||||
|
"""
|
||||||
|
import re as _re
|
||||||
|
|
||||||
|
# Newlines vereinheitlichen
|
||||||
|
text = body.replace("\n\n", " ").replace("\n", " ").strip()
|
||||||
|
# An Satzgrenzen splitten (.!? gefolgt von Whitespace)
|
||||||
|
sentences = _re.split(r"(?<=[.!?])\s+(?=[A-ZÄÖÜ#])", text)
|
||||||
|
if not sentences:
|
||||||
|
return body
|
||||||
|
|
||||||
|
posts: list[str] = []
|
||||||
|
current = ""
|
||||||
|
for sent in sentences:
|
||||||
|
sent = sent.strip()
|
||||||
|
if not sent:
|
||||||
|
continue
|
||||||
|
# Wenn current + sent zu lang würde, current als Post sichern
|
||||||
|
candidate = (current + " " + sent).strip() if current else sent
|
||||||
|
if len(candidate) > max_chars and current:
|
||||||
|
posts.append(current.strip())
|
||||||
|
current = sent
|
||||||
|
else:
|
||||||
|
current = candidate
|
||||||
|
if current:
|
||||||
|
posts.append(current.strip())
|
||||||
|
return "\n\n".join(posts)
|
||||||
|
|
||||||
|
|
||||||
async def generate_draft(
|
async def generate_draft(
|
||||||
drucksache: str,
|
drucksache: str,
|
||||||
news_url: str,
|
news_url: str,
|
||||||
@ -398,6 +434,16 @@ async def generate_draft(
|
|||||||
# wahrscheinlich ein Trenn-Klumpen, kein semantischer Anfuehrer.
|
# wahrscheinlich ein Trenn-Klumpen, kein semantischer Anfuehrer.
|
||||||
import re as _re
|
import re as _re
|
||||||
body = _re.sub(r'([.!?])"([A-ZÄÖÜ])', r'\1\n\n\2', body)
|
body = _re.sub(r'([.!?])"([A-ZÄÖÜ])', r'\1\n\n\2', body)
|
||||||
|
|
||||||
|
# Thread-Auto-Splitter: wenn das Modell nur einen Block produziert
|
||||||
|
# (kein \n\n gesetzt) und body > 300 Zeichen, an Satzgrenzen splitten
|
||||||
|
# bis jeder Post ≤ 280 Zeichen. Konservativ: nur wenn `style='thread'`.
|
||||||
|
if style == "thread":
|
||||||
|
existing_posts = [p for p in body.split("\n\n") if p.strip()]
|
||||||
|
needs_split = len(existing_posts) < 3 or any(len(p) > 290 for p in existing_posts)
|
||||||
|
if needs_split:
|
||||||
|
body = _split_into_thread_posts(body)
|
||||||
|
|
||||||
if not titel or not body:
|
if not titel or not body:
|
||||||
raise ValueError("LLM-Response unvollständig (titel oder body leer)")
|
raise ValueError("LLM-Response unvollständig (titel oder body leer)")
|
||||||
|
|
||||||
|
|||||||
103
scripts/pm-quality-audit.sh
Executable file
103
scripts/pm-quality-audit.sh
Executable file
@ -0,0 +1,103 @@
|
|||||||
|
#!/bin/bash
# PM quality audit: checks all PM drafts against the forbidden-phrase
# list, length limits and Markdown consistency. Output is a Markdown
# report on stdout.
#
# Manual invocation:
#   ./scripts/pm-quality-audit.sh gwoe-antragspruefer-dev > pm-audit.md

set -euo pipefail

# Container to audit; first argument, with the dev container as default.
CONTAINER="${1:-gwoe-antragspruefer-dev}"

# Capture the name list first instead of piping into `grep -q`: with
# `pipefail`, grep exiting on the first match can kill `docker ps` with
# SIGPIPE and turn a successful match into a false "not running".
# `|| true` keeps the original best-effort behaviour: if docker itself
# fails, the list is empty and the script skips with exit 0.
running_containers="$(docker ps --format '{{.Names}}' || true)"

# -F/-x: compare the name literally and against the whole line, so regex
# metacharacters that are legal in container names (e.g. ".") cannot
# produce a false match.
if ! grep -Fxq -- "$CONTAINER" <<<"$running_containers"; then
    echo "$(date -Iseconds) SKIP — ${CONTAINER} is not running" >&2
    exit 0
fi
|
||||||
|
|
||||||
|
docker exec -i "$CONTAINER" python <<'EOF'
|
||||||
|
import sqlite3
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Forbidden patterns for press-release (PM) bodies. Each entry is a
# (regex, label) pair; the label is what the report prints as
# "Verbot: <label>" when the regex is found in a draft body.
VERBOTSLISTE_PM = [
    # The literal "GWÖ-Score" followed (after optional whitespace) by a digit.
    (r"GWÖ-Score\s*\d", "GWÖ-Score-Zahl"),
    # A number, optionally with one decimal digit, written as "<n>/10".
    (r"\b\d+(?:[.,]\d)?\s*/\s*10\b", "X/10-Score"),
    # A letter A-E directly followed by a digit 1-5 as a whole word, unless
    # it is followed by "." plus whitespace.
    (r"\b[A-E][1-5]\b(?!\.\s)", "Matrix-Code (A1-E5)"),
    # One of three value names immediately followed by the "×" sign.
    (r"Würde×|Solidarität×|Nachhaltigkeit×", "Berührungsgruppe×Wert"),
    # Stock phrases, matched as whole words (case-sensitive).
    (r"\bzukunftsweisend\b", "Floskel zukunftsweisend"),
    (r"\binnovativ\b", "Floskel innovativ"),
    (r"\brichtungsweisend\b", "Floskel richtungsweisend"),
    # "in den Bereichen Bürger<non-word chars>und Staat".
    (r"in den Bereichen Bürger\W+und Staat", "GWÖ-Berührungsgruppen-Sprache"),
]

# Thread drafts are checked against everything above plus two
# formatting-related patterns.
VERBOTSLISTE_THREAD = VERBOTSLISTE_PM + [
    # Markdown bold: "**...**" on a single line.
    (r"\*\*[^*\n]+\*\*", "Markdown-Bold (Thread sollte ohne)"),
    # A backslash followed by a backslash, "[" or "]".
    # NOTE(review): as written this only catches backslash-escaped
    # brackets, not bare "[" / "]" — confirm that this is intended.
    (r"\\[\\[\\\]]", "Eckige Klammern \\[\\]"),
]
|
||||||
|
|
||||||
|
|
||||||
|
def audit_pm(body: str) -> list:
    """Check a press-release body and return the list of findings.

    Findings are reported, in this order, for: a word count outside the
    280-420 band, every VERBOTSLISTE_PM pattern found in the body, and
    fewer than four non-blank paragraphs (split on blank lines). An empty
    list means nothing was flagged.
    """
    findings = []

    n_words = len(body.split())
    if not 280 <= n_words <= 420:
        findings.append(f"Wortzahl {n_words} (Soll 320-380)")

    findings.extend(
        f"Verbot: {label}"
        for pattern, label in VERBOTSLISTE_PM
        if re.search(pattern, body)
    )

    n_paragraphs = sum(1 for chunk in body.split("\n\n") if chunk.strip())
    if n_paragraphs < 4:
        findings.append(f"nur {n_paragraphs} Absätze (Soll 6)")

    return findings
|
||||||
|
|
||||||
|
|
||||||
|
def audit_thread(body: str) -> list:
    """Check a thread body and return the list of findings.

    The body is split on blank lines into posts. Findings are reported,
    in this order, for: a post count outside 3-5, every post longer than
    280 characters (numbered from 1), and every VERBOTSLISTE_THREAD
    pattern found anywhere in the body. An empty list means nothing was
    flagged.
    """
    posts = [chunk for chunk in body.split("\n\n") if chunk.strip()]
    n_posts = len(posts)

    findings = []
    if n_posts < 3 or n_posts > 5:
        findings.append(f"{n_posts} Posts (Soll 3-5)")

    findings.extend(
        f"Post {number}: {len(post)} chars (>280)"
        for number, post in enumerate(posts, 1)
        if len(post) > 280
    )
    findings.extend(
        f"Verbot: {label}"
        for pattern, label in VERBOTSLISTE_THREAD
        if re.search(pattern, body)
    )
    return findings
|
||||||
|
|
||||||
|
|
||||||
|
# Local imports: this part of the script needs them in addition to the
# modules imported at the top of the heredoc.
from contextlib import closing
from datetime import date

# closing() makes sure the connection is released even if the query fails.
with closing(sqlite3.connect("/app/data/gwoe-antraege.db")) as conn:
    rows = conn.execute("""
        SELECT id, drucksache, bundesland, style, titel, body, created_at
        FROM presse_drafts ORDER BY id DESC
    """).fetchall()

# Stamp the report with the day it is actually generated instead of a
# fixed date.
print("# PM-Quality-Audit — Stand", date.today().isoformat())
print()
print(f"**Total Drafts:** {len(rows)}\n")

ok = 0
flagged_pm = 0
flagged_thread = 0
print("| ID | DS | BL | Style | Titel-Länge | Body-Wörter | Status |")
print("|---|---|---|---|---|---|---|")
for rid, ds, bl, style, titel, body, _created in rows:
    # NULL columns must not abort the whole report: a missing style is
    # treated as "pm", missing text as empty (and is then flagged by the
    # audit as too short).
    style = style or "pm"
    titel = titel or ""
    body = body or ""

    word_count = len(body.split())
    is_thread = style == "thread"
    audit = audit_thread(body) if is_thread else audit_pm(body)

    if audit:
        if is_thread:
            flagged_thread += 1
        else:
            flagged_pm += 1
        # Show at most two findings per row; summarise the rest as "(+n)".
        status = "⚠ " + "; ".join(audit[:2])
        if len(audit) > 2:
            status += f" (+{len(audit)-2})"
    else:
        status = "✅"
        ok += 1

    print(f"| {rid} | {ds} | {bl} | {style} | {len(titel)} | {word_count} | {status} |")

print()
print(f"**Zusammenfassung:** {ok}/{len(rows)} ohne Auffälligkeit · "
      f"{flagged_pm} PMs flagged · {flagged_thread} Threads flagged")
|
||||||
|
EOF
|
||||||
64
tests/test_thread_splitter.py
Normal file
64
tests/test_thread_splitter.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
"""Tests fuer _split_into_thread_posts (#178 Folge)."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Import guard: remember whether the function under test could be
# imported instead of letting an ImportError fail test collection.
try:
    from app.presse_generator import _split_into_thread_posts
    _HAS_FN = True
except ImportError:
    _HAS_FN = False

# Module-level mark: every test in this file is skipped when the import
# above failed.
pytestmark = pytest.mark.skipif(not _HAS_FN, reason="presse_generator nicht importierbar")
|
||||||
|
|
||||||
|
|
||||||
|
class TestSplitIntoThreadPosts:
    """Behavioural tests for the sentence-boundary thread splitter."""

    def test_short_text_one_post(self):
        """Text that fits into one post is returned as a single post."""
        text = "Kurzer Satz. Noch einer."
        out = _split_into_thread_posts(text)
        assert out.split("\n\n") == ["Kurzer Satz. Noch einer."]

    def test_long_text_splits_at_sentences(self):
        """Several short sentences are packed into posts of at most max_chars."""
        # Six sentences of roughly 30-65 characters plus a trailing hashtag;
        # every sentence is shorter than max_chars, so the limit must hold
        # exactly, without tolerance.
        text = (
            "Mieter haben ein Recht auf sichere Versorgung. "
            "Der Antrag will das durch Strafrecht schützen. "
            "Versorgungssicherheit ist lebenswichtig für Familien. "
            "Wenn Vermieter Geld zurückhalten droht Wärme- und Wassersperre. "
            "Aktuelle Krisen verschärfen das Problem für Mieter:innen. "
            "Wir fordern eine klare Regelung. #GWO"
        )
        out = _split_into_thread_posts(text, max_chars=200)
        posts = out.split("\n\n")
        assert len(posts) >= 2
        for p in posts:
            assert len(p) <= 200

    def test_each_post_under_280(self):
        """With the default limit no post exceeds 280 characters."""
        # Eight identical sentences, well over 280 characters in total.
        text = " ".join(["Ein Satz mit etwa 60 Zeichen Länge zur Prüfung."] * 8)
        out = _split_into_thread_posts(text)
        posts = out.split("\n\n")
        assert len(posts) >= 2
        for p in posts:
            assert len(p) <= 280

    def test_handles_newlines(self):
        """Existing line breaks are merged into single spaces."""
        text = "Erster Satz.\nZweiter.\n\nDritter Satz."
        out = _split_into_thread_posts(text)
        assert out == "Erster Satz. Zweiter. Dritter Satz."

    def test_hashtags_preserved(self):
        """Trailing hashtags survive and stay at the end of the last post."""
        text = "Erster Satz mit Inhalt. Zweiter Satz mit Substanz. #GWO #Wohnrecht"
        out = _split_into_thread_posts(text)
        assert "#GWO" in out
        assert "#Wohnrecht" in out
        assert out.split("\n\n")[-1].endswith("#GWO #Wohnrecht")

    def test_empty_input(self):
        """Empty input yields an empty string."""
        assert _split_into_thread_posts("") == ""

    def test_single_long_sentence_kept(self):
        """A single sentence longer than max_chars is not cut apart."""
        text = "A" * 350
        out = _split_into_thread_posts(text, max_chars=280)
        # No sentence boundary exists, so exactly one post is expected.
        assert "\n\n" not in out
|
||||||
Loading…
Reference in New Issue
Block a user