文件预览

dcard_fetch.py

查看 Dcard 技能包中的文件内容。

文件内容

scripts/dcard_fetch.py

#!/home/linuxbrew/.linuxbrew/opt/python@3.14/bin/python3.14
"""
Dcard 文章工具 — 單篇抓取 + 看板列表

Usage:
  dcard_fetch.py <post-url-or-id>
  dcard_fetch.py --forum relationship --id 261529038
  dcard_fetch.py list relationship
  dcard_fetch.py list relationship --sort popular --limit 10
  dcard_fetch.py list --all-forums
"""

import argparse
import json
import re
import sys
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List

# Absolute path of the Camoufox binary that is handed to Playwright as the
# Firefox executable. NOTE(review): hard-coded to one user's cache directory.
_CAMOUFOX = "/home/ichen/.cache/camoufox/camoufox"
# Directory containing this script; the helper `_list_posts.js` is read from it.
_JS_DIR = Path(__file__).parent


# ── browser helpers ─────────────────────────────


def _browser():
    """Start Playwright and open a blank page in a headless Camoufox browser.

    The browser context uses a 1280x720 viewport, the ``zh-TW`` locale and the
    ``Asia/Taipei`` timezone.

    Returns:
        A ``(playwright, browser, page)`` tuple. The caller owns the first two
        and is expected to call ``browser.close()`` and then
        ``playwright.stop()`` when done.

    Raises:
        Whatever patchright raises if the browser, context or page cannot be
        created. Anything that was already started is shut down first.
    """
    from patchright.sync_api import sync_playwright

    ps = sync_playwright().start()
    browser = None
    try:
        browser = ps.firefox.launch(headless=True, executable_path=_CAMOUFOX, args=["--no-sandbox"])
        context = browser.new_context(
            viewport={"width": 1280, "height": 720},
            locale="zh-TW",
            timezone_id="Asia/Taipei",
        )
        page = context.new_page()
    except BaseException:
        # Callers only receive handles on success, so a failure here would
        # otherwise leave the driver (and maybe the browser) running.
        try:
            if browser is not None:
                browser.close()
        finally:
            ps.stop()
        raise
    return ps, browser, page


# ── fetch single post ────────────────────────────


def fetch_post(forum: str | None, post_id: str) -> dict:
    """Load a single Dcard post in the browser and return its content.

    Args:
        forum: Forum slug; ``relationship`` is used when it is falsy.
        post_id: Numeric post id as a string.

    Returns:
        A dict with ``title``, ``text``, ``images``, ``url`` and ``post_id``.
        When no JSON-LD data can be extracted, the page title and the body's
        inner text are used instead; if the title is still the bare ``Dcard``
        the text is replaced by a "could not load" notice.
    """
    slug = forum or "relationship"
    url = f"https://www.dcard.tw/f/{slug}/p/{post_id}"
    playwright, browser, page = _browser()

    try:
        page.goto(url, wait_until="domcontentloaded", timeout=60000)

        # Poll up to four times, 5 s apart, until the tab title mentions
        # "Dcard" without being only the bare site name.
        for _attempt in range(4):
            page.wait_for_timeout(5000)
            tab_title = page.title()
            if not tab_title or tab_title == "Dcard" or "Dcard" not in tab_title:
                continue
            break

        result = _extract_post(page.content())
        if not result:
            fallback_title = page.title()
            body_text = page.inner_text("body")
            if fallback_title == "Dcard":
                body_text = "(無法載入文章,可能不存在或需登入)"
            result = {"title": fallback_title, "text": body_text, "images": []}

        result["url"] = url
        result["post_id"] = post_id
        return result
    finally:
        browser.close()
        playwright.stop()


def _extract_post(html: str) -> dict | None:
    import lxml.html

    try:
        root = lxml.html.fromstring(html)
        for script in root.cssselect('script[type="application/ld+json"]'):
            text = script.text_content().strip()
            if '"SocialMediaPosting"' in text:
                data = json.loads(text)
                images = [
                    img.get("src")
                    for img in root.cssselect("img[src*='megapx-assets']")
                    if "orig" in (img.get("src") or "") or "1280" in (img.get("src") or "")
                ]
                return {"title": data.get("headline", "").strip(), "text": data.get("text", "").strip(), "images": images}
    except Exception:
        pass
    return None


# ── list forum posts ─────────────────────────────


@dataclass
class PostItem:
    """One entry of a forum post listing.

    Instances are built in ``list_posts`` by unpacking the dicts returned by
    the in-page ``_list_posts.js`` script, so the field names have to match
    the keys that script produces.
    """
    title: str  # printed as the numbered headline in text output
    post_id: str  # combined with ``forum`` to build the post URL
    forum: str  # forum slug used in the post URL
    time_ago: str = ""  # printed after the clock icon when non-empty; presumably a relative timestamp — confirm against _list_posts.js
    preview: str = ""  # excerpt; text output truncates it to 80 characters
    like_count: int = 0  # not printed in text output; included in JSON output
    comment_count: int = 0  # not printed in text output; included in JSON output


def list_posts(forum: str, sort: str = "popular", limit: int = 15) -> List[PostItem]:
    """List posts of a forum by running ``_list_posts.js`` inside the page.

    Args:
        forum: Forum slug, e.g. ``relationship``.
        sort: ``"latest"`` opens the forum with ``?latest=true``; any other
            value opens the default forum page.
        limit: Maximum number of posts to return; values below 1 yield an
            empty list.

    Returns:
        Up to ``limit`` ``PostItem`` objects in the order the script produced.

    Raises:
        OSError: If ``_list_posts.js`` cannot be read.
        TypeError: If a returned entry lacks a required ``PostItem`` field.
    """
    from dataclasses import fields

    # Read the helper first so a missing file fails before the browser launch
    # and page wait. The encoding is explicit so decoding does not depend on
    # the locale.
    js_code = (_JS_DIR / "_list_posts.js").read_text(encoding="utf-8")

    url = f"https://www.dcard.tw/f/{forum}"
    if sort == "latest":
        url += "?latest=true"

    ps, browser, page = _browser()
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=60000)
        page.wait_for_timeout(8000)
        # The script may evaluate to null/undefined; treat that as no posts.
        raw = page.evaluate(js_code, forum) or []
    finally:
        browser.close()
        ps.stop()

    # Drop keys PostItem does not declare, so an extra field coming from the
    # JS side does not crash the whole listing.
    known = {field.name for field in fields(PostItem)}
    return [
        PostItem(**{key: value for key, value in entry.items() if key in known})
        for entry in raw[: max(limit, 0)]
    ]


def list_all_forums() -> dict:
    """Collect forum links found on the ``relationship`` forum page.

    Returns:
        ``{"forums": [...]}`` where each entry is ``{"name", "slug"}``. The
        in-page script keeps the first 30 distinct slugs, skips ``popular``
        and falls back to the slug when a link has no text.
    """
    collect_forums_js = """
            () => {
                const items = [];
                const seen = new Set();
                document.querySelectorAll('a[href*="/f/"]:not([href*="/p/"])').forEach(a => {
                    const m = a.href.match(/\\/f\\/([\\w-]+)/);
                    if (!m || m[1] === 'popular' || seen.has(m[1])) return;
                    seen.add(m[1]);
                    items.push({name: a.textContent.trim() || m[1], slug: m[1]});
                });
                return items.slice(0, 30);
            }
        """
    playwright, browser, page = _browser()
    try:
        page.goto("https://www.dcard.tw/f/relationship", wait_until="domcontentloaded", timeout=30000)
        page.wait_for_timeout(5000)
        return {"forums": page.evaluate(collect_forums_js)}
    finally:
        browser.close()
        playwright.stop()


# ── output formatting ────────────────────────────


def fmt_post(data: dict, fmt: str) -> str:
    """Render a fetched post as pretty-printed JSON or as plain text.

    Args:
        data: Post dict; ``title``, ``url``, ``text`` and ``images`` are read.
        fmt: ``"json"`` dumps ``data`` as-is; anything else yields text.

    Returns:
        The formatted string. Text output lists title, URL, a blank line and
        the body, followed by an image section when there are images.
    """
    if fmt != "json":
        heading = data.get("title", "")
        link = data.get("url", "")
        body = data.get("text", "")
        parts = [f"標題:{heading}", f"網址:{link}", "", body]

        pictures = data.get("images", [])
        if pictures:
            parts += ["", "— 圖片 —"] + pictures
        return "\n".join(parts)

    return json.dumps(data, ensure_ascii=False, indent=2)


def fmt_list(items: List[PostItem], forum: str, sort: str, fmt: str) -> str:
    """Render a post listing as pretty-printed JSON or as a numbered text list.

    Args:
        items: Posts to show, in display order.
        forum: Forum slug shown in the header / JSON envelope.
        sort: ``"popular"`` selects the "hot" label; anything else "latest".
        fmt: ``"json"`` for JSON output; anything else yields text.

    Returns:
        The formatted string. In text mode previews longer than 80 characters
        are cut to 80 and suffixed with an ellipsis.
    """
    if fmt == "json":
        envelope = {
            "forum": forum,
            "sort": sort,
            "count": len(items),
            "posts": [asdict(post) for post in items],
        }
        return json.dumps(envelope, ensure_ascii=False, indent=2)

    if sort == "popular":
        heading = "🔥 熱門"
    else:
        heading = "🕐 最新"

    out = [f"=== Dcard /{forum} {heading} ===", ""]
    for rank, post in enumerate(items, start=1):
        excerpt = post.preview
        if len(excerpt) > 80:
            excerpt = excerpt[:80] + "…"

        out.append(f"{rank:2d}. {post.title}")
        if post.time_ago:
            out.append(f"    ⏱ {post.time_ago}")
        if excerpt:
            out.append(f"    💬 {excerpt}")
        out.append(f"    🔗 https://www.dcard.tw/f/{post.forum}/p/{post.post_id}")
        out.append("")

    out.append(f"共 {len(items)} 篇文章")
    return "\n".join(out)


# ── CLI ──────────────────────────────────────────


def _parse_id(s: str) -> tuple:
    m = re.search(r'/f/(\w+)/p/(\d+)', s)
    if m: return m.group(1), m.group(2)
    m = re.search(r'/p/(\d+)', s)
    if m: return None, m.group(1)
    if s.isdigit(): return None, s
    return None, None


def cmd_fetch(args):
    """Handle the ``fetch`` command: resolve the post id, fetch it and print it.

    Args:
        args: Parsed argparse namespace. ``id``, ``forum``, ``url_or_id`` and
            ``format`` are read. Attributes missing from the namespace (as
            happens when no subcommand was parsed) are treated as unset, so
            the user sees the usage error below instead of an AttributeError.

    Raises:
        SystemExit: With status 1 when no post id can be determined.
    """
    pid = getattr(args, "id", None)
    forum = getattr(args, "forum", None)
    target = getattr(args, "url_or_id", None)

    # An explicit --id wins; otherwise derive id (and forum) from the positional.
    if not pid and target:
        parsed_forum, parsed_id = _parse_id(target)
        forum = forum or parsed_forum
        pid = parsed_id

    if not pid:
        print("錯誤:請提供文章 ID 或 URL", file=sys.stderr)
        sys.exit(1)

    print(fmt_post(fetch_post(forum, pid), getattr(args, "format", "text")))


def cmd_list(args):
    """Handle the ``list`` command: print forum posts or the forum overview.

    Args:
        args: Parsed argparse namespace; ``all_forums``, ``forum``, ``sort``,
            ``limit`` and ``format`` are read.

    Raises:
        SystemExit: With status 1 when neither a forum nor ``--all-forums``
            was given.
    """
    if not args.all_forums:
        if not args.forum:
            print("錯誤:請提供看板名稱(如 relationship)或使用 --all-forums", file=sys.stderr)
            sys.exit(1)
        posts = list_posts(args.forum, sort=args.sort, limit=args.limit)
        print(fmt_list(posts, args.forum, args.sort, args.format))
        return

    catalog = list_all_forums()
    if args.format == "json":
        print(json.dumps(catalog, ensure_ascii=False, indent=2))
        return

    print("=== Dcard 熱門看板 ===")
    for entry in catalog.get("forums", []):
        print(f"  {entry['name']:20s} /{entry['slug']}")


def main(argv=None):
    """Parse the command line and dispatch to ``fetch`` or ``list``.

    ``fetch`` is the default command: when the first argument is not a known
    subcommand or a top-level help flag, ``fetch`` is inserted in front of it.
    This makes ``dcard_fetch.py <url-or-id>`` and
    ``dcard_fetch.py --forum X --id N`` work; argparse would otherwise reject
    them as an invalid subcommand choice.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments,
            and by the command handlers on usage errors.
    """
    parser = argparse.ArgumentParser(description="Dcard 文章工具")
    sub = parser.add_subparsers(dest="command")

    # fetch
    pf = sub.add_parser("fetch", help="抓取單篇文章")
    pf.add_argument("url_or_id", nargs="?")
    pf.add_argument("--forum", "-f", default=None)
    pf.add_argument("--id", "-i", default=None)
    pf.add_argument("--format", "-o", choices=["text", "json"], default="text")

    # list
    pl = sub.add_parser("list", help="看板文章列表")
    pl.add_argument("forum", nargs="?", default=None)
    pl.add_argument("--sort", "-s", choices=["popular", "latest"], default="popular")
    pl.add_argument("--limit", "-l", type=int, default=15)
    pl.add_argument("--format", "-o", choices=["text", "json"], default="text")
    pl.add_argument("--all-forums", "-A", action="store_true", help="顯示熱門看板")

    argv = list(sys.argv[1:] if argv is None else argv)
    # Route anything that is not an explicit subcommand (or top-level help)
    # through the fetch sub-parser, so its options exist on the namespace.
    if not argv or argv[0] not in ("fetch", "list", "-h", "--help"):
        argv.insert(0, "fetch")

    args = parser.parse_args(argv)
    if args.command == "list":
        cmd_list(args)
    else:
        cmd_fetch(args)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()