文件预览

models.py

查看 Founder Signal 技能包中的文件内容。

文件内容

src/founder_signal/models.py

"""Typed models for Founder Signal."""

from __future__ import annotations

from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any


@dataclass(frozen=True)
class VerifiedEvidenceSnapshot:
    """Verified source text supplied from outside the runtime.

    ``reddit_url`` remains accepted for backward-compatible profile files, but new
    code should prefer ``platform`` and ``source_url``.
    """

    source_url: str
    text_snapshot: str
    platform: str = "reddit"
    verification_method: str = "agent_browser"
    verified_by: str = ""

    @property
    def reddit_url(self) -> str:
        """Return ``source_url`` under its legacy attribute name."""
        return self.source_url

    @classmethod
    def from_dict(cls, payload: dict) -> "VerifiedEvidenceSnapshot":
        """Build a snapshot from a plain mapping.

        Args:
            payload: Mapping with a required ``text_snapshot`` key and optional
                ``platform``, ``source_url`` (or legacy ``reddit_url``),
                ``verification_method`` and ``verified_by`` keys.

        Returns:
            A snapshot whose string fields are stripped of surrounding
            whitespace; ``platform`` is additionally lower-cased and falls
            back to ``"reddit"`` when missing, null or blank.

        Raises:
            KeyError: If ``text_snapshot`` is missing.
        """
        platform = str(payload.get("platform") or "reddit").strip().lower() or "reddit"
        source_url = str(payload.get("source_url") or payload.get("reddit_url") or "").strip()
        # A key that is present but null must behave like a missing key:
        # str(None) would otherwise store the literal text "None".
        method = payload.get("verification_method")
        verifier = payload.get("verified_by")
        return cls(
            platform=platform,
            source_url=source_url,
            text_snapshot=str(payload["text_snapshot"]).strip(),
            verification_method="agent_browser" if method is None else str(method).strip(),
            verified_by="" if verifier is None else str(verifier).strip(),
        )


@dataclass(frozen=True)
class PlatformSourceConfig:
    """Platform-specific discovery/evidence configuration."""

    platform: str
    enabled: bool = True
    communities: list[str] = field(default_factory=list)
    discovery_providers: list[str] = field(default_factory=list)
    seed_urls: list[str] = field(default_factory=list)
    excluded_urls: list[str] = field(default_factory=list)

    @classmethod
    def from_dict(cls, platform: str, payload: dict[str, Any]) -> "PlatformSourceConfig":
        """Build a platform config from a plain mapping.

        Args:
            platform: Platform name; stored stripped and lower-cased.
            payload: Mapping with optional ``enabled``, ``communities``,
                ``discovery_providers``, ``seed_urls`` and ``excluded_urls``
                keys. A list key that is missing or null yields an empty list.

        Returns:
            The config, with every list item converted to ``str``.
        """

        def strings(key: str) -> list[str]:
            # ``or ()`` makes an explicit null behave like a missing key
            # instead of raising TypeError when iterated.
            return [str(item) for item in payload.get(key) or ()]

        return cls(
            platform=platform.strip().lower(),
            enabled=bool(payload.get("enabled", True)),
            communities=strings("communities"),
            discovery_providers=strings("discovery_providers"),
            seed_urls=strings("seed_urls"),
            excluded_urls=strings("excluded_urls"),
        )


@dataclass(frozen=True)
class FounderSignalConfig:
    """Per-profile settings for Founder Signal.

    Reddit sources can be supplied either through the legacy flat fields
    (``subreddits``, ``seed_reddit_urls``, ``excluded_reddit_urls``) or through
    ``platforms["reddit"]``. ``__post_init__`` reconciles the two so both views
    agree once the instance is constructed.
    """

    profile_id: str
    enabled: bool
    product_name: str
    product_one_liner: str
    target_audience: str
    keywords: list[str]
    # Legacy Reddit-only fields; kept in step with platforms["reddit"].
    subreddits: list[str] = field(default_factory=list)
    seed_reddit_urls: list[str] = field(default_factory=list)
    max_candidates: int = 0
    max_action_cards: int = 0
    discovery_mode: str = "live"
    max_post_age_days: int = 7
    preferred_post_age_hours: int = 72
    min_comment_count: int = 0
    max_comment_count: int = 250
    history_ttl_days: int = 45
    discovery_terms: list[str] = field(default_factory=list)
    live_discovery_terms: list[str] = field(default_factory=list)
    research_terms: list[str] = field(default_factory=list)
    scoring_terms: list[str] = field(default_factory=list)
    negative_scoring_terms: list[str] = field(default_factory=list)
    # Legacy Reddit-only field; kept in step with platforms["reddit"].
    excluded_reddit_urls: list[str] = field(default_factory=list)
    platforms: dict[str, PlatformSourceConfig] = field(default_factory=dict)
    verified_evidence_snapshots: list[VerifiedEvidenceSnapshot] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Reconcile the legacy Reddit fields with ``platforms["reddit"]``.

        When a ``"reddit"`` platform entry exists it wins: the three legacy
        fields are overwritten with copies of its lists. Otherwise, if any
        legacy field is non-empty, a ``"reddit"`` entry is created from them.
        ``platforms`` is always replaced by a shallow copy of the given dict.
        """
        platforms = dict(self.platforms)
        reddit = platforms.get("reddit")
        # The dataclass is frozen, so fields are rewritten via object.__setattr__.
        if reddit is not None:
            object.__setattr__(self, "subreddits", list(reddit.communities))
            object.__setattr__(self, "seed_reddit_urls", list(reddit.seed_urls))
            object.__setattr__(self, "excluded_reddit_urls", list(reddit.excluded_urls))
        elif self.subreddits or self.seed_reddit_urls or self.excluded_reddit_urls:
            platforms["reddit"] = PlatformSourceConfig(
                platform="reddit",
                communities=list(self.subreddits),
                seed_urls=list(self.seed_reddit_urls),
                excluded_urls=list(self.excluded_reddit_urls),
            )
        object.__setattr__(self, "platforms", platforms)

    @classmethod
    def from_dict(cls, payload: dict) -> "FounderSignalConfig":
        """Build a config from a plain profile mapping.

        Required keys: ``profile_id``, ``product_name``, ``product_one_liner``,
        ``target_audience``, ``keywords``, ``max_candidates`` and
        ``max_action_cards``.

        Fallbacks for optional keys:

        * ``live_discovery_terms`` defaults to ``discovery_terms``.
        * ``research_terms`` defaults to ``discovery_terms``, or to
          ``scoring_terms`` when ``discovery_terms`` is empty.
        * ``discovery_mode`` defaults to ``"research"`` when the key is
          absent. This differs from the ``"live"`` default that applies when
          the class is constructed directly.

        Raises:
            KeyError: If a required key is missing.
        """
        platforms = _parse_platforms(payload)
        # Look the Reddit entry up once; an empty config stands in when the
        # parsed platforms contain no "reddit" entry.
        reddit = platforms.get("reddit")
        if reddit is None:
            reddit = PlatformSourceConfig("reddit")

        discovery_terms = [str(item) for item in payload.get("discovery_terms", [])]
        scoring_terms = [str(item) for item in payload.get("scoring_terms", [])]
        live_discovery_terms = [
            str(item) for item in payload.get("live_discovery_terms", discovery_terms)
        ]
        research_terms = [
            str(item)
            for item in payload.get("research_terms", discovery_terms or scoring_terms)
        ]
        return cls(
            profile_id=str(payload["profile_id"]).strip(),
            enabled=bool(payload.get("enabled", True)),
            product_name=str(payload["product_name"]),
            product_one_liner=str(payload["product_one_liner"]),
            target_audience=str(payload["target_audience"]),
            keywords=[str(item) for item in payload["keywords"]],
            subreddits=reddit.communities,
            seed_reddit_urls=reddit.seed_urls,
            max_candidates=int(payload["max_candidates"]),
            max_action_cards=int(payload["max_action_cards"]),
            discovery_mode=str(payload.get("discovery_mode", "research")).strip().lower(),
            max_post_age_days=int(payload.get("max_post_age_days", 7)),
            preferred_post_age_hours=int(payload.get("preferred_post_age_hours", 72)),
            min_comment_count=int(payload.get("min_comment_count", 0)),
            max_comment_count=int(payload.get("max_comment_count", 250)),
            history_ttl_days=int(payload.get("history_ttl_days", 45)),
            discovery_terms=discovery_terms,
            live_discovery_terms=live_discovery_terms,
            research_terms=research_terms,
            scoring_terms=scoring_terms,
            negative_scoring_terms=[
                str(item) for item in payload.get("negative_scoring_terms", [])
            ],
            excluded_reddit_urls=reddit.excluded_urls,
            platforms=platforms,
            verified_evidence_snapshots=[
                VerifiedEvidenceSnapshot.from_dict(item)
                for item in payload.get("verified_evidence_snapshots", [])
            ],
        )


def _parse_platforms(payload: dict[str, Any]) -> dict[str, PlatformSourceConfig]:
    """Collect the enabled platform configs described by a profile payload.

    Entries under ``payload["platforms"]`` are parsed first, keyed by their
    stripped, lower-cased name. The legacy top-level keys ``subreddits``,
    ``seed_reddit_urls`` and ``excluded_reddit_urls`` are then folded into the
    ``"reddit"`` entry, each overriding the matching list when provided. When
    nothing was parsed at all, a default ``"reddit"`` entry is added.

    A legacy key whose value is null is treated exactly like a missing key.

    Args:
        payload: The raw profile mapping.

    Returns:
        Mapping of platform name to config, with disabled platforms removed.
    """
    parsed: dict[str, PlatformSourceConfig] = {}
    raw_platforms = payload.get("platforms")
    if isinstance(raw_platforms, dict):
        for name, config_payload in raw_platforms.items():
            # Entries whose value is not a mapping are skipped.
            if isinstance(config_payload, dict):
                parsed[str(name).strip().lower()] = PlatformSourceConfig.from_dict(
                    str(name), config_payload
                )

    def legacy_list(key: str, fallback: list[str]) -> list[str]:
        # Iterating a null value would raise TypeError, so fall back to the
        # already-parsed list just as if the key were absent.
        raw = payload.get(key)
        return fallback if raw is None else [str(item) for item in raw]

    legacy_reddit_present = any(
        payload.get(key) is not None
        for key in ("subreddits", "seed_reddit_urls", "excluded_reddit_urls")
    )
    if legacy_reddit_present or not parsed:
        existing = parsed.get("reddit", PlatformSourceConfig("reddit"))
        parsed["reddit"] = PlatformSourceConfig(
            platform="reddit",
            enabled=existing.enabled,
            communities=legacy_list("subreddits", existing.communities),
            discovery_providers=existing.discovery_providers,
            seed_urls=legacy_list("seed_reddit_urls", existing.seed_urls),
            excluded_urls=legacy_list("excluded_reddit_urls", existing.excluded_urls),
        )
    return {name: config for name, config in parsed.items() if config.enabled}


@dataclass(frozen=True)
class StructuredRedditEvidence:
    """Immutable structured view of one piece of Reddit evidence.

    Carries the post title and body, the subreddit, an excerpt of the
    comments, an extraction-quality label and the raw text snapshot.
    """

    post_title: str
    post_body: str
    subreddit: str
    comments_excerpt: str
    extraction_quality: str
    raw_text_snapshot: str
    # Optional; defaults to None when no age is supplied.
    post_age_days: int | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict keyed by field name."""
        as_mapping: dict[str, Any] = asdict(self)
        return as_mapping


@dataclass(frozen=True)
class EvidenceReadResult:
    """Immutable record describing one evidence read for a candidate.

    Holds the candidate id, the source and evidence URLs, a status string and
    the paths associated with the read.

    NOTE(review): the producer of this record and the set of valid ``status``
    values are not visible in this module — confirm against the caller.
    """

    candidate_id: str
    source_url: str
    evidence_url: str
    status: str
    # Paths for the read's artifacts; presumably files written by the
    # evidence reader — TODO confirm against the code that builds this record.
    raw_html_path: Path
    text_snapshot_path: Path
    source_url_path: Path
    evidence_url_path: Path
    platform: str = "reddit"
    # Optional structured evidence and its path; both default to None.
    structured_evidence_path: Path | None = None
    structured_evidence: StructuredRedditEvidence | None = None


# Alias: the Reddit-specific name refers to the very same class. Presumably
# kept for callers that still import the older name — verify before removing.
RedditEvidenceReadResult = EvidenceReadResult