文件预览
voice_match.py

查看 YouOS 技能包中的文件内容。
返回技能详情 · 下载技能包 · 打开来源页
文件内容
app/evaluation/voice_match.py

"""Voice-match scoring — how closely a draft resembles the user's *actual* reply.

The existing rule-based scores (keyword hit-rate, brevity, mode) measure whether
a draft is structurally acceptable. They do *not* measure the thing YouOS exists
to do: sound like **you**. Voice-match compares a candidate draft against the real
reply the user sent to the same inbound message (the "reference"), so we can tell
whether a model — a Qwen fine-tuned on your mail vs. a generic cloud model — is
actually reproducing your voice rather than just writing a plausible email.

This is the metric that makes a cross-model comparison meaningful: a frontier
cloud model will usually win the structural scores, but the question that decides
whether the privacy/cost trade is worth it is "does it sound more like me than my
own fine-tuned local model?" — and only voice-match answers that.

Design:
  * The core is **deterministic and dependency-free** (lexical overlap, length fit,
    greeting/closing match, stylometry) so it runs anywhere and in CI without a
    model download.
  * An optional **semantic** component uses an injected embedding function (e.g.
    ``app.core.embeddings.get_embedding``) when one is available, so the metric
    degrades gracefully on machines without the model loaded.

All sub-scores are in ``[0, 1]`` (higher = closer to the reference). The combined
``voice_match`` is a weighted blend of whichever components are available.
"""

from __future__ import annotations

import math
import re
from collections.abc import Callable, Sequence
from difflib import SequenceMatcher
from typing import Any

# Reuse the generation layer's greeting/closing vocabulary so "has a greeting"
# means the same thing here as it does when drafting/repairing.
from app.generation.service import _CLOSING_TOKENS, _GREETING_TOKENS

_WORD_RE = re.compile(r"[A-Za-z0-9']+")
_SENT_RE = re.compile(r"[.!?]+")
_CONTRACTION_RE = re.compile(r"\b\w+'(t|re|ll|ve|m|s|d)\b", re.IGNORECASE)


def _tokens(text: str) -> list[str]:
    """Return the lower-cased word tokens of ``text`` in order of appearance.

    Typographic apostrophes (U+2019) are folded to the ASCII apostrophe before
    matching, so a contraction yields the same single token whether it was typed
    with a straight or a "smart" apostrophe. Without the fold, the same word
    written by two different mail clients would not compare equal.
    """
    normalised = text.lower().replace("\u2019", "'")
    return _WORD_RE.findall(normalised)


def _sentences(text: str) -> list[str]:
    """Split ``text`` at runs of terminal punctuation and return the non-empty,
    whitespace-trimmed fragments in order."""
    trimmed = map(str.strip, _SENT_RE.split(text))
    # filter(None, ...) drops the empty strings left by leading/trailing splits.
    return list(filter(None, trimmed))


def _lexical_overlap(draft: str, reference: str) -> float:
    """Surface word-sequence overlap (0–1) via difflib's ratio.

    Catches reused phrasing — the user's stock openers, sign-offs and turns of
    phrase that a model trained on their mail reproduces but a generic model
    paraphrases away.

    Returns 1.0 when neither text has any tokens, 0.0 when exactly one of them
    has none, and otherwise ``SequenceMatcher.ratio()`` over the token lists.
    """
    a, b = _tokens(draft), _tokens(reference)
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    # autojunk must be off: with the default heuristic, once the second sequence
    # reaches 200 items difflib treats any item making up more than ~1% of it as
    # junk and stops matching on it. For a long reply that means the most
    # frequent words are ignored and the ratio is silently deflated.
    return SequenceMatcher(None, a, b, autojunk=False).ratio()


def _length_ratio(draft: str, reference: str) -> float:
    """Ratio of the shorter token count to the longer one (1.0 = same length).

    Two token-less texts count as identical (1.0); exactly one token-less text
    scores 0.0.
    """
    shorter, longer = sorted((len(_tokens(draft)), len(_tokens(reference))))
    if longer == 0:
        # Both counts are zero.
        return 1.0
    if shorter == 0:
        return 0.0
    return shorter / longer


def _first_line(text: str) -> str:
    return text.lstrip().split("\n", 1)[0].strip().lower()


def _tail(text: str) -> str:
    return "\n".join(text.rstrip().splitlines()[-3:]).lower()


def _has_greeting(text: str) -> bool:
    """Report whether the first line of ``text`` starts with any token from
    ``_GREETING_TOKENS``. Text with an empty first line never has a greeting."""
    opener = _first_line(text)
    if not opener:
        return False
    for token in _GREETING_TOKENS:
        if opener.startswith(token):
            return True
    return False


def _has_closing(text: str) -> bool:
    """Report whether any token from ``_CLOSING_TOKENS`` occurs (as a substring)
    in the last three lines of ``text``. An empty tail never has a closing."""
    ending = _tail(text)
    if not ending:
        return False
    for token in _CLOSING_TOKENS:
        if token in ending:
            return True
    return False


def _bool_match(a: bool, b: bool) -> float:
    """1.0 when both texts agree on a structural habit, else 0.0."""
    return 1.0 if a == b else 0.0


def _style_features(text: str) -> dict[str, float]:
    """Compute a compact stylometric fingerprint for one reply.

    The fingerprint captures surface habits rather than content: mean tokens per
    sentence, mean characters per token, contractions per token, and question
    marks / exclamation marks per sentence. Callers compare the same keys
    between a draft and its reference.
    """
    words = _tokens(text)
    word_count = len(words)
    # Denominators are clamped to 1 so token-less or sentence-less text never
    # divides by zero.
    token_denom = max(word_count, 1)
    sentence_denom = max(len(_sentences(text)), 1)

    total_chars = sum(map(len, words))
    contractions = len(_CONTRACTION_RE.findall(text))

    return {
        "avg_sentence_len": word_count / sentence_denom,
        "avg_word_len": total_chars / token_denom,
        "contraction_rate": contractions / token_denom,
        "question_rate": text.count("?") / sentence_denom,
        "exclaim_rate": text.count("!") / sentence_denom,
    }


# Per-feature scale used to normalise an absolute difference into a 0–1 penalty.
# Roughly "a difference this large means the styles are unrelated on this axis":
# ``_style_similarity`` scores each feature as ``max(0, 1 - |diff| / scale)``, so
# a gap of ``scale`` or more contributes 0. Every key here is looked up in the
# dict returned by ``_style_features``, so the two must stay in sync.
_STYLE_SCALE = {
    "avg_sentence_len": 12.0,  # tokens per sentence
    "avg_word_len": 2.5,  # characters per token
    "contraction_rate": 0.15,  # contractions per token
    "question_rate": 1.0,  # "?" characters per sentence
    "exclaim_rate": 1.0,  # "!" characters per sentence
}


def _style_similarity(draft: str, reference: str) -> float:
    """Stylometric closeness in [0, 1].

    Averages one similarity per ``_STYLE_SCALE`` feature together with two
    agreement flags (greeting present?, closing present?). Returns 0.0 outright
    when either text is blank.
    """
    if not (draft.strip() and reference.strip()):
        return 0.0

    draft_feats = _style_features(draft)
    ref_feats = _style_features(reference)

    # A gap equal to (or beyond) the feature's scale bottoms out at 0.
    parts: list[float] = [
        max(0.0, 1.0 - abs(draft_feats[name] - ref_feats[name]) / scale)
        for name, scale in _STYLE_SCALE.items()
    ]
    # Structural habits count as two extra, equally weighted components.
    parts.append(_bool_match(_has_greeting(draft), _has_greeting(reference)))
    parts.append(_bool_match(_has_closing(draft), _has_closing(reference)))
    return sum(parts) / len(parts)


def _semantic_similarity(
    draft: str,
    reference: str,
    embed_fn: Callable[[str], Sequence[float]],
    cosine_fn: Callable[[Sequence[float], Sequence[float]], float] | None,
) -> float | None:
    """Cosine similarity of embeddings — "did it say the same thing".

    Returns ``None`` (rather than raising) if embedding fails, so a missing model
    downgrades the metric to its deterministic components instead of breaking the
    whole comparison run.
    """
    if not draft.strip() or not reference.strip():
        return 0.0
    try:
        if cosine_fn is None:
            from app.core.embeddings import cosine_similarity as cosine_fn  # noqa: PLC0415
        va, vb = embed_fn(draft), embed_fn(reference)
        return max(0.0, min(1.0, float(cosine_fn(va, vb))))
    except Exception:
        return None


# Weights when the semantic component is available vs. deterministic-only. They
# sum to 1.0 within each branch so the combined score stays in [0, 1].
# ``voice_match_score`` picks the first table when it obtained a semantic score
# and the second when it did not (no ``embed_fn`` given, or embedding returned
# ``None``); the keys name the sub-scores each weight multiplies.
_WEIGHTS_WITH_SEMANTIC = {"semantic": 0.45, "style": 0.25, "lexical": 0.20, "length": 0.10}
_WEIGHTS_NO_SEMANTIC = {"style": 0.45, "lexical": 0.35, "length": 0.20}


def voice_match_score(
    draft: str,
    reference: str,
    *,
    embed_fn: Callable[[str], Sequence[float]] | None = None,
    cosine_fn: Callable[[Sequence[float], Sequence[float]], float] | None = None,
) -> dict[str, Any]:
    """Compare ``draft`` with the user's real ``reference`` reply.

    The result holds the blended ``voice_match`` plus each sub-score, all rounded
    to three decimals and in 0–1 (higher is closer), and two booleans telling
    whether the texts agree on having a greeting / a closing. Supplying
    ``embed_fn`` (e.g. ``app.core.embeddings.get_embedding``) adds the semantic
    component; when it is omitted, or embedding yields no score,
    ``semantic_similarity`` is ``None`` and only the deterministic components
    are blended.
    """
    scores: dict[str, float] = {
        "lexical": _lexical_overlap(draft, reference),
        "length": _length_ratio(draft, reference),
        "style": _style_similarity(draft, reference),
    }

    semantic = None if embed_fn is None else _semantic_similarity(draft, reference, embed_fn, cosine_fn)
    if semantic is None:
        weights = _WEIGHTS_NO_SEMANTIC
    else:
        scores["semantic"] = semantic
        weights = _WEIGHTS_WITH_SEMANTIC

    # Weighted blend over whichever components the chosen weight table names.
    combined = 0.0
    for component, weight in weights.items():
        combined += weight * scores[component]

    return {
        "voice_match": round(combined, 3),
        "semantic_similarity": None if semantic is None else round(semantic, 3),
        "lexical_overlap": round(scores["lexical"], 3),
        "style_similarity": round(scores["style"], 3),
        "length_ratio": round(scores["length"], 3),
        "greeting_match": _has_greeting(draft) == _has_greeting(reference),
        "closing_match": _has_closing(draft) == _has_closing(reference),
    }