文件预览
reddit_enrich.py

查看 Last30days 技能包中的文件内容。
返回技能详情下载技能包打开来源页
文件内容
scripts/lib/reddit_enrich.py

"""Reddit thread enrichment with real engagement metrics.

Default path: reddit.com public JSON.
Legacy compatibility path: third-party API enrichment when explicitly enabled.
"""

import re
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

from . import http, dates


def extract_reddit_path(url: str) -> Optional[str]:
    """Extract the path from a Reddit URL.

    Args:
        url: Reddit URL

    Returns:
        Path component or None
    """
    parsed = urlparse(url)
    if "reddit.com" not in parsed.netloc:
        return None
    return parsed.path


class RedditRateLimitError(Exception):
    """Raised when Reddit returns HTTP 429 (rate limited)."""
    pass


def fetch_thread_data(
    url: str,
    mock_data: Optional[Dict] = None,
    timeout: int = 30,
    retries: int = 3,
) -> Optional[Dict[str, Any]]:
    """Fetch Reddit thread JSON data.

    Args:
        url: Reddit thread URL
        mock_data: Mock data for testing
        timeout: HTTP timeout per attempt in seconds
        retries: Number of retries on failure

    Returns:
        Thread data dict or None on failure

    Raises:
        RedditRateLimitError: When Reddit returns 429 (caller should bail)
    """
    if mock_data is not None:
        return mock_data

    path = extract_reddit_path(url)
    if not path:
        return None

    try:
        data = http.get_reddit_json(path, timeout=timeout, retries=retries)
        return data
    except http.HTTPError as e:
        if e.status_code == 429:
            raise RedditRateLimitError(f"Reddit rate limited (429) fetching {url}") from e
        return None


def parse_thread_data(data: Any) -> Dict[str, Any]:
    """Parse Reddit thread JSON into structured data.

    Args:
        data: Raw Reddit JSON response

    Returns:
        Dict with submission and comments data
    """
    result = {
        "submission": None,
        "comments": [],
    }

    if not isinstance(data, list) or len(data) < 1:
        return result

    # First element is submission listing
    submission_listing = data[0]
    if isinstance(submission_listing, dict):
        children = submission_listing.get("data", {}).get("children", [])
        if children:
            sub_data = children[0].get("data", {})
            result["submission"] = {
                "score": sub_data.get("score"),
                "num_comments": sub_data.get("num_comments"),
                "upvote_ratio": sub_data.get("upvote_ratio"),
                "created_utc": sub_data.get("created_utc"),
                "permalink": sub_data.get("permalink"),
                "title": sub_data.get("title"),
                "selftext": sub_data.get("selftext", "")[:500],  # Truncate
            }

    # Second element is comments listing
    if len(data) >= 2:
        comments_listing = data[1]
        if isinstance(comments_listing, dict):
            children = comments_listing.get("data", {}).get("children", [])
            for child in children:
                if child.get("kind") != "t1":  # t1 = comment
                    continue
                c_data = child.get("data", {})
                if not c_data.get("body"):
                    continue

                comment = {
                    "score": c_data.get("score", 0),
                    "created_utc": c_data.get("created_utc"),
                    "author": c_data.get("author", "[deleted]"),
                    "body": c_data.get("body", "")[:300],  # Truncate
                    "permalink": c_data.get("permalink"),
                }
                result["comments"].append(comment)

    return result


def get_top_comments(comments: List[Dict], limit: int = 10) -> List[Dict[str, Any]]:
    """Get top comments sorted by score.

    Args:
        comments: List of comment dicts
        limit: Maximum number to return

    Returns:
        Top comments sorted by score
    """
    # Filter out deleted/removed
    valid = [c for c in comments if c.get("author") not in ("[deleted]", "[removed]")]

    # Sort by score descending
    sorted_comments = sorted(valid, key=lambda c: c.get("score", 0), reverse=True)

    return sorted_comments[:limit]


def extract_comment_insights(comments: List[Dict], limit: int = 7) -> List[str]:
    """Extract key insights from top comments.

    Uses simple heuristics to identify valuable comments:
    - Has substantive text
    - Contains actionable information
    - Not just agreement/disagreement

    Args:
        comments: Top comments
        limit: Max insights to extract

    Returns:
        List of insight strings
    """
    insights = []

    for comment in comments[:limit * 2]:  # Look at more comments than we need
        body = comment.get("body", "").strip()
        if not body or len(body) < 30:
            continue

        # Skip low-value patterns
        skip_patterns = [
            r'^(this|same|agreed|exactly|yep|nope|yes|no|thanks|thank you)\.?$',
            r'^lol|lmao|haha',
            r'^\[deleted\]',
            r'^\[removed\]',
        ]
        if any(re.match(p, body.lower()) for p in skip_patterns):
            continue

        # Truncate to first meaningful sentence or ~150 chars
        insight = body[:150]
        if len(body) > 150:
            # Try to find a sentence boundary
            for i, char in enumerate(insight):
                if char in '.!?' and i > 50:
                    insight = insight[:i+1]
                    break
            else:
                insight = insight.rstrip() + "..."

        insights.append(insight)
        if len(insights) >= limit:
            break

    return insights


def enrich_reddit_item(
    item: Dict[str, Any],
    mock_thread_data: Optional[Dict] = None,
    timeout: int = 10,
    retries: int = 1,
) -> Dict[str, Any]:
    """Enrich a Reddit item with real engagement data.

    Args:
        item: Reddit item dict
        mock_thread_data: Mock data for testing
        timeout: HTTP timeout per attempt (default 10s for enrichment)
        retries: Number of retries (default 1 — fail fast for enrichment)

    Returns:
        Enriched item dict

    Raises:
        RedditRateLimitError: Propagated so caller can bail on remaining items
    """
    url = item.get("url", "")

    # Fetch thread data (RedditRateLimitError propagates to caller)
    thread_data = fetch_thread_data(url, mock_thread_data, timeout=timeout, retries=retries)
    if not thread_data:
        return item

    parsed = parse_thread_data(thread_data)
    submission = parsed.get("submission")
    comments = parsed.get("comments", [])

    # Update engagement metrics
    if submission:
        item["engagement"] = {
            "score": submission.get("score"),
            "num_comments": submission.get("num_comments"),
            "upvote_ratio": submission.get("upvote_ratio"),
        }

        # Update date from actual data
        created_utc = submission.get("created_utc")
        if created_utc:
            item["date"] = dates.timestamp_to_date(created_utc)

    # Get top comments
    top_comments = get_top_comments(comments)
    item["top_comments"] = []
    for c in top_comments:
        permalink = c.get("permalink", "")
        comment_url = f"https://reddit.com{permalink}" if permalink else ""
        item["top_comments"].append({
            "score": c.get("score", 0),
            "date": dates.timestamp_to_date(c.get("created_utc")),
            "author": c.get("author", ""),
            "excerpt": c.get("body", "")[:200],
            "url": comment_url,
        })

    # Extract insights
    item["comment_insights"] = extract_comment_insights(top_comments)

    return item


def enrich_reddit_item_sc(
    item: Dict[str, Any],
    token: str,
    timeout: int = 30,
) -> Dict[str, Any]:
    """Enrich a Reddit item using the legacy comment API.

    No rate limit risk. Uses 1 credit per call.

    Args:
        item: Reddit item dict (already has engagement from search)
        token: Legacy compatibility API key
        timeout: HTTP timeout

    Returns:
        Enriched item with top_comments and comment_insights
    """
    from . import reddit as reddit_mod

    url = item.get("url", "")
    if not url:
        return item

    raw_comments = reddit_mod.fetch_post_comments(url, token)
    if not raw_comments:
        return item

    top_comments = []
    for c in raw_comments[:10]:
        body = c.get("body", "")
        if not body or body in ("[deleted]", "[removed]"):
            continue

        score = c.get("ups") or c.get("score", 0)
        author = c.get("author", "[deleted]")
        permalink = c.get("permalink", "")
        comment_url = f"https://reddit.com{permalink}" if permalink else ""

        top_comments.append({
            "score": score,
            "date": dates.timestamp_to_date(c.get("created_utc")) if c.get("created_utc") else None,
            "author": author,
            "body": body[:300],
            "excerpt": body[:200],
            "url": comment_url,
        })

    top_comments.sort(key=lambda c: c.get("score", 0), reverse=True)

    item["top_comments"] = []
    for c in top_comments:
        item["top_comments"].append({
            "score": c.get("score", 0),
            "date": c.get("date"),
            "author": c.get("author", ""),
            "excerpt": c.get("excerpt", ""),
            "url": c.get("url", ""),
        })

    item["comment_insights"] = extract_comment_insights(top_comments)

    return item