文件内容
scripts/lib/signals.py
"""Reusable local scoring signals for v3 pipeline stages."""
from __future__ import annotations
import math
from . import dates, relevance, schema
# Editorial signal-to-noise scores. Grounding (Google Search) is 1.0 baseline;
# social platforms discounted for noise.
SOURCE_QUALITY = {
"xiaohongshu": 0.7,
"hackernews": 0.8,
"youtube": 0.85,
"reddit": 0.6,
"x": 0.68,
"polymarket": 0.5,
"instagram": 0.58,
"tiktok": 0.58,
}
def source_quality(source: str) -> float:
return SOURCE_QUALITY.get(source, 0.6)
def local_relevance(item: schema.SourceItem, ranking_query: str) -> float:
text = "\n".join(
part
for part in [item.title, item.body, item.snippet]
if part
)
hashtags = item.metadata.get("hashtags") if isinstance(item.metadata, dict) else None
score = relevance.token_overlap_relevance(ranking_query, text, hashtags=hashtags)
# High-engagement YouTube floor: official videos with millions of views
# often have titles that don't keyword-match the query (e.g., "YE - FATHER
# (feat. TRAVIS SCOTT)" doesn't match "kanye west"). The engagement signals
# say "this is important" even when text overlap is weak.
if item.source == "youtube" and item.engagement.get("views", 0) > 100_000:
score = max(score, 0.3)
# Project-mode GitHub floor: items fetched via --github-repo are explicitly
# requested by the user and relevant by construction. Without this floor,
# repos with low token diversity (e.g., "openclaw/openclaw" -> 1 unique token)
# get pruned despite being the primary search target.
labels = item.metadata.get("labels", []) if isinstance(item.metadata, dict) else []
if "project-mode" in labels:
score = max(score, 0.8)
return score
def freshness(item: schema.SourceItem, freshness_mode: str = "balanced_recent") -> int:
score = dates.recency_score(item.published_at)
if freshness_mode == "strict_recent":
return int(score)
if freshness_mode == "evergreen_ok":
return int((score * 0.6) + 40)
return int((score * 0.8) + 10)
def log1p_safe(value: float | int | None) -> float:
if value is None:
return 0.0
try:
numeric = float(value)
except (TypeError, ValueError):
return 0.0
if numeric <= 0:
return 0.0
return math.log1p(numeric)
def _top_comment_score(item: schema.SourceItem) -> float:
comments = item.metadata.get("top_comments") or []
if not comments or not isinstance(comments[0], dict):
return 0.0
return log1p_safe(comments[0].get("score"))
# Per-source engagement weights: list of (field_name, weight) tuples.
# Reddit uses a custom function because upvote_ratio and top_comment_score
# are not simple log1p fields.
ENGAGEMENT_WEIGHTS: dict[str, list[tuple[str, float]]] = {
"x": [("likes", 0.55), ("reposts", 0.25), ("replies", 0.15), ("quotes", 0.05)],
"youtube": [("views", 0.50), ("likes", 0.35), ("comments", 0.15)],
"tiktok": [("views", 0.50), ("likes", 0.30), ("comments", 0.20)],
"instagram": [("views", 0.50), ("likes", 0.30), ("comments", 0.20)],
"hackernews": [("points", 0.55), ("comments", 0.45)],
"polymarket": [("volume", 0.60), ("liquidity", 0.40)],
}
def _weighted_engagement(item: schema.SourceItem, weights: list[tuple[str, float]]) -> float | None:
values = [(log1p_safe(item.engagement.get(field)), weight) for field, weight in weights]
if not any(v for v, _ in values):
return None
return sum(v * w for v, w in values)
def _reddit_engagement(item: schema.SourceItem) -> float | None:
score = log1p_safe(item.engagement.get("score"))
comments = log1p_safe(item.engagement.get("num_comments"))
ratio = float(item.engagement.get("upvote_ratio") or 0.0)
top_comment = _top_comment_score(item)
if not any([score, comments, ratio, top_comment]):
return None
return (0.50 * score) + (0.35 * comments) + (0.05 * (ratio * 10.0)) + (0.10 * top_comment)
def _generic_engagement(item: schema.SourceItem) -> float | None:
if not item.engagement:
return None
values = [logged for v in item.engagement.values() if (logged := log1p_safe(v)) > 0]
if not values:
return None
return sum(values) / len(values)
def engagement_raw(item: schema.SourceItem) -> float | None:
if item.source == "reddit":
return _reddit_engagement(item)
weights = ENGAGEMENT_WEIGHTS.get(item.source)
if weights:
return _weighted_engagement(item, weights)
return _generic_engagement(item)
def normalize(values: list[float | None]) -> list[int | None]:
valid = [value for value in values if value is not None]
if not valid:
return [None for _ in values]
low = min(valid)
high = max(valid)
if math.isclose(low, high):
return [50 if value is not None else None for value in values]
return [
None
if value is None
else int(((value - low) / (high - low)) * 100)
for value in values
]
def annotate_stream(
items: list[schema.SourceItem],
ranking_query: str,
freshness_mode: str,
) -> list[schema.SourceItem]:
"""Attach local scoring metadata and return items sorted by local_rank_score."""
engagement_scores = normalize([engagement_raw(item) for item in items])
for item, eng_score in zip(items, engagement_scores, strict=True):
item.local_relevance = local_relevance(item, ranking_query)
item.freshness = freshness(item, freshness_mode)
item.engagement_score = eng_score
item.source_quality = source_quality(item.source)
item.local_rank_score = (
0.65 * item.local_relevance
+ 0.25 * (item.freshness / 100.0)
+ 0.10 * ((eng_score or 0) / 100.0)
)
return sorted(items, key=lambda item: item.local_rank_score or 0, reverse=True)
_SOCIAL_SOURCES = {"reddit", "x", "tiktok", "instagram"}
# Minimum view count for short-video platforms. Items below this floor
# are typically spam reposts or low-effort clips that add no unique signal.
_VIDEO_ENGAGEMENT_FLOOR_SOURCES = {"tiktok", "instagram"}
_VIDEO_ENGAGEMENT_FLOOR_VIEWS = 1000
def _passes_engagement_floor(item: schema.SourceItem, sole_source: bool) -> bool:
"""Check whether a TikTok/Instagram item meets the minimum view floor.
Items from sources not in _VIDEO_ENGAGEMENT_FLOOR_SOURCES always pass.
If the item's source is the *only* source represented in the batch
(sole_source=True), all items pass so we never return an empty result
for a whole source.
"""
if item.source not in _VIDEO_ENGAGEMENT_FLOOR_SOURCES:
return True
if sole_source:
return True
views = item.engagement.get("views", 0) if item.engagement else 0
return views >= _VIDEO_ENGAGEMENT_FLOOR_VIEWS
def prune_low_relevance(
items: list[schema.SourceItem],
minimum: float = 0.15,
) -> list[schema.SourceItem]:
"""Drop weak lexical matches when stronger evidence exists.
Social-source items with zero engagement get a stricter threshold
because zero engagement on a social platform is a strong noise signal.
TikTok and Instagram items with fewer than 1000 views are pruned
(unless they are the only source represented in the batch).
"""
sources_present = {item.source for item in items}
def passes(item: schema.SourceItem) -> bool:
rel = item.local_relevance if item.local_relevance is not None else 0.0
if rel < minimum:
return False
if item.source in _SOCIAL_SOURCES and (item.engagement_score is None or item.engagement_score == 0):
if rel < minimum * 1.5:
return False
sole_source = sources_present == {item.source}
if not _passes_engagement_floor(item, sole_source):
return False
return True
filtered = [item for item in items if passes(item)]
return filtered or items