文件预览

goodreads_rating.py

查看 Audible Goodreads Deal Scout 技能包中的文件内容。

文件内容

audible_goodreads_deal_scout/goodreads_rating.py

from __future__ import annotations

import hashlib
import html
import json
import re
import urllib.parse
import urllib.request
from datetime import UTC, datetime, timedelta
from pathlib import Path
from typing import Any, Callable
from urllib.error import HTTPError, URLError

from .shared import normalize_space, parse_float, parse_int_value, write_json_atomic


GOODREADS_RATING_TTL_SECONDS = 7 * 24 * 60 * 60
GOODREADS_USER_AGENT = "OpenClaw Audible Goodreads Deal Scout/1.0"


class GoodreadsRatingError(RuntimeError):
    pass


def goodreads_book_url(book_id: str) -> str:
    return f"https://www.goodreads.com/book/show/{urllib.parse.quote(normalize_space(book_id))}"


def fetch_goodreads_text_with_final_url(url: str) -> tuple[str, str]:
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": GOODREADS_USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            raw = response.read()
            return raw.decode("utf-8", "ignore"), str(response.geturl() or url)
    except (HTTPError, URLError) as exc:
        raise GoodreadsRatingError(f"Goodreads rating lookup failed for {url}: {exc}") from exc


def parse_goodreads_rating(html_text: str) -> dict[str, Any]:
    rating: float | None = None
    ratings_count: int | None = None
    for match in re.finditer(r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', html_text, re.I | re.S):
        try:
            payload = json.loads(html.unescape((match.group(1) or "").strip()))
        except Exception:
            continue
        candidates = payload if isinstance(payload, list) else [payload]
        for item in candidates:
            if not isinstance(item, dict):
                continue
            aggregate = item.get("aggregateRating") if isinstance(item.get("aggregateRating"), dict) else {}
            rating = rating if rating is not None else parse_float(aggregate.get("ratingValue"))
            ratings_count = ratings_count if ratings_count is not None else parse_int_value(aggregate.get("ratingCount") or aggregate.get("reviewCount"))
    if rating is None:
        for pattern in (
            r'itemprop=["\']ratingValue["\'][^>]+content=["\']([0-9.]+)["\']',
            r'"ratingValue"\s*:\s*"?([0-9.]+)"?',
            r'"average_rating"\s*:\s*"?([0-9.]+)"?',
        ):
            match = re.search(pattern, html_text, re.I)
            if match:
                rating = parse_float(match.group(1))
                break
    if ratings_count is None:
        for pattern in (
            r'"ratingCount"\s*:\s*"?([0-9,]+)"?',
            r'"ratings_count"\s*:\s*"?([0-9,]+)"?',
        ):
            match = re.search(pattern, html_text, re.I)
            if match:
                ratings_count = parse_int_value(match.group(1))
                break
    if rating is None:
        raise GoodreadsRatingError("Goodreads page did not expose an average rating.")
    return {
        "averageRating": rating,
        "ratingsCount": ratings_count,
        "source": "goodreads_public_page",
    }


def _cache_path(cache_dir: Path, book_id: str) -> Path:
    digest = hashlib.sha256(f"goodreads-rating-v1:{book_id}".encode("utf-8")).hexdigest()
    return cache_dir / f"{digest}.json"


def _read_cache(cache_dir: Path, book_id: str, *, no_cache: bool, refresh_cache: bool) -> dict[str, Any] | None:
    if no_cache or refresh_cache:
        return None
    path = _cache_path(cache_dir, book_id)
    if not path.exists():
        return None
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
        fetched_at = datetime.fromisoformat(str(payload.get("fetchedAt")))
    except Exception:
        return None
    if datetime.now(UTC) - fetched_at > timedelta(seconds=GOODREADS_RATING_TTL_SECONDS):
        return None
    payload["cacheHit"] = True
    return payload


def _write_cache(cache_dir: Path, book_id: str, payload: dict[str, Any], *, no_cache: bool) -> None:
    if no_cache:
        return
    write_json_atomic(_cache_path(cache_dir, book_id), {"fetchedAt": datetime.now(UTC).isoformat(), **payload})


def lookup_goodreads_rating(
    book_id: str,
    *,
    cache_dir: Path,
    refresh_cache: bool = False,
    no_cache: bool = False,
    fetcher: Callable[[str], tuple[str, str]] | None = None,
) -> dict[str, Any]:
    cleaned_book_id = normalize_space(book_id)
    if not cleaned_book_id:
        raise GoodreadsRatingError("Missing Goodreads book id.")
    cached = _read_cache(cache_dir, cleaned_book_id, no_cache=no_cache, refresh_cache=refresh_cache)
    if cached:
        return cached
    url = goodreads_book_url(cleaned_book_id)
    html_text, final_url = (fetcher or fetch_goodreads_text_with_final_url)(url)
    parsed = parse_goodreads_rating(html_text)
    payload = {**parsed, "url": final_url, "cacheHit": False}
    _write_cache(cache_dir, cleaned_book_id, payload, no_cache=no_cache)
    return payload