文件预览

render_markdown_pdf.py

查看 产品需求大法-评审沙盘 技能包中的文件内容。

文件内容

scripts/render_markdown_pdf.py

#!/usr/bin/env python3
"""Render a Markdown document to PDF using Playwright."""

from __future__ import annotations

import argparse
import html
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path


# Print stylesheet embedded into the generated HTML document's <style> element.
# It targets A4 pages, prefers CJK-capable system fonts, and styles the elements
# produced by the Markdown converter in this file (h1-h3, paragraphs, lists,
# tables, blockquotes, inline/fenced code and horizontal rules).
CSS = """
@page { size: A4; margin: 18mm 15mm; }
* { box-sizing: border-box; }
body {
  color: #1f2933;
  font-family: -apple-system, BlinkMacSystemFont, "PingFang SC", "Hiragino Sans GB",
    "Microsoft YaHei", "Noto Sans CJK SC", "Arial Unicode MS", sans-serif;
  font-size: 12px;
  line-height: 1.68;
}
h1 { font-size: 24px; margin: 0 0 18px; padding-bottom: 10px; border-bottom: 2px solid #111827; }
h2 { font-size: 17px; margin: 24px 0 10px; padding-bottom: 5px; border-bottom: 1px solid #d8dee6; }
h3 { font-size: 14px; margin: 18px 0 8px; }
p { margin: 7px 0; }
ul, ol { margin: 7px 0 7px 20px; padding: 0; }
li { margin: 3px 0; }
table { width: 100%; border-collapse: collapse; margin: 10px 0 16px; table-layout: fixed; }
th, td { border: 1px solid #d8dee6; padding: 7px 8px; vertical-align: top; word-break: break-word; }
th { background: #f3f6f8; font-weight: 700; }
blockquote { margin: 10px 0; padding: 8px 12px; border-left: 4px solid #9aa7b3; background: #f7f9fb; }
code { font-family: "SFMono-Regular", Consolas, monospace; background: #f3f4f6; padding: 1px 4px; border-radius: 4px; }
pre { white-space: pre-wrap; word-break: break-word; background: #f3f4f6; padding: 10px; border-radius: 6px; }
hr { border: 0; border-top: 1px solid #d8dee6; margin: 18px 0; }
"""


def inline_markdown(text: str) -> str:
    """Convert inline Markdown in *text* to HTML.

    The text is HTML-escaped first, then three inline constructs are
    recognised: `` `code` `` -> ``<code>``, ``**bold**`` -> ``<strong>`` and
    ``*italic*`` -> ``<em>``.

    Code spans are literal: their content is set aside before the emphasis
    rules run, so asterisks inside a code span are never turned into
    ``<strong>``/``<em>`` and cannot produce mis-nested tags.

    Args:
        text: A single line (or joined paragraph) of Markdown source.

    Returns:
        An HTML fragment that is safe to embed in element content.
    """
    # NUL is not valid in HTML and is used below as the placeholder delimiter,
    # so drop any stray occurrences from the input up front.
    escaped = html.escape(text.replace("\x00", ""))

    code_spans: list[str] = []

    def stash_code(match: "re.Match[str]") -> str:
        # Replace the code span with an opaque token the emphasis regexes
        # cannot match into (it contains no asterisks or backticks).
        code_spans.append(match.group(1))
        return f"\x00{len(code_spans) - 1}\x00"

    def restore_code(match: "re.Match[str]") -> str:
        return f"<code>{code_spans[int(match.group(1))]}</code>"

    escaped = re.sub(r"`([^`]+)`", stash_code, escaped)
    # Bold must be handled before italic so "**x**" is not read as two "*".
    escaped = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", escaped)
    escaped = re.sub(r"\*([^*]+)\*", r"<em>\1</em>", escaped)
    return re.sub(r"\x00(\d+)\x00", restore_code, escaped)


def split_table_row(line: str) -> list[str]:
    """Split one Markdown table row into its trimmed cell texts.

    At most one leading and one trailing pipe are treated as the row's outer
    border, so empty edge cells (``|| b |``) are preserved instead of being
    swallowed. A backslash-escaped pipe (``\\|``) is kept as a literal ``|``
    inside the cell rather than acting as a column delimiter.

    Args:
        line: The raw source line of the row.

    Returns:
        The cell contents in column order, each stripped of surrounding
        whitespace. Always contains at least one element.
    """
    body = line.strip()
    if body.startswith("|"):
        body = body[1:]
    # A trailing "\|" is cell content, not the closing border.
    if body.endswith("|") and not body.endswith("\\|"):
        body = body[:-1]
    cells = re.split(r"(?<!\\)\|", body)
    return [cell.replace("\\|", "|").strip() for cell in cells]


def is_table_separator(line: str) -> bool:
    """Tell whether *line* is a Markdown table header/body separator row.

    Every cell of the row must consist of at least three hyphens, optionally
    wrapped in alignment colons (``---``, ``:---``, ``---:``, ``:---:``).
    """
    columns = split_table_row(line)
    if not columns:
        return False
    dash_cell = re.compile(r":?-{3,}:?")
    for column in columns:
        if dash_cell.fullmatch(column or "") is None:
            return False
    return True


def render_table(lines: list[str], start: int) -> tuple[str, int]:
    """Render the Markdown table that begins at ``lines[start]`` as HTML.

    ``lines[start]`` is the header row and ``lines[start + 1]`` is assumed to
    be the separator row (the caller checks this). Body rows are consumed from
    ``start + 2`` onwards.

    Body rows follow the header's style: if the header starts with a pipe,
    only lines that also start with a pipe continue the table; if the header
    has no leading pipe (``a | b``), any following line containing a pipe is a
    row. This keeps pipe-less tables from losing all their body rows.
    NOTE(review): in the pipe-less style, a directly following list/heading
    line that contains a pipe is also taken as a row; separate it with a blank
    line.

    Args:
        lines: All lines of the Markdown document.
        start: Index of the table's header row.

    Returns:
        A tuple of the ``<table>`` HTML and the index of the first line after
        the table.
    """
    header = split_table_row(lines[start])
    bordered = lines[start].strip().startswith("|")

    def is_row(raw: str) -> bool:
        text = raw.strip()
        return text.startswith("|") if bordered else "|" in text

    rows: list[list[str]] = []
    i = start + 2  # skip the header and separator rows
    while i < len(lines) and is_row(lines[i]):
        rows.append(split_table_row(lines[i]))
        i += 1

    html_rows = ["<table><thead><tr>"]
    html_rows.extend(f"<th>{inline_markdown(cell)}</th>" for cell in header)
    html_rows.append("</tr></thead><tbody>")
    for row in rows:
        # Normalise every row to the header's width: pad short rows with
        # empty cells and drop any surplus cells.
        padded = row + [""] * max(0, len(header) - len(row))
        html_rows.append("<tr>")
        html_rows.extend(f"<td>{inline_markdown(cell)}</td>" for cell in padded[: len(header)])
        html_rows.append("</tr>")
    html_rows.append("</tbody></table>")
    return "".join(html_rows), i


def markdown_to_html(markdown_text: str) -> str:
    """Convert a Markdown document to an HTML body fragment.

    This is a small line-oriented converter, not a full Markdown parser. It
    recognises fenced code blocks, pipe tables, ATX headings, horizontal
    rules, single-line blockquotes, flat bullet and numbered lists, and
    paragraphs. Inline formatting is delegated to ``inline_markdown``.

    A code fence that is still open at the end of the input is closed
    implicitly, so its content is rendered rather than silently dropped.

    Args:
        markdown_text: The complete Markdown source.

    Returns:
        The generated HTML blocks joined with newlines (empty string for
        empty input).
    """
    lines = markdown_text.splitlines()
    out: list[str] = []
    paragraph: list[str] = []
    in_code = False
    code_lines: list[str] = []

    def flush_paragraph() -> None:
        # Consecutive text lines form one paragraph, joined by single spaces.
        if paragraph:
            out.append(f"<p>{inline_markdown(' '.join(paragraph).strip())}</p>")
            paragraph.clear()

    def flush_code() -> None:
        # Code is emitted verbatim (HTML-escaped only, no inline Markdown).
        out.append(f"<pre><code>{html.escape(chr(10).join(code_lines))}</code></pre>")
        code_lines.clear()

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Fence lines toggle code mode; any info string after ``` is ignored.
        if stripped.startswith("```"):
            if in_code:
                flush_code()
                in_code = False
            else:
                flush_paragraph()
                in_code = True
            i += 1
            continue

        if in_code:
            # Keep the unstripped line so indentation inside code survives.
            code_lines.append(line)
            i += 1
            continue

        if not stripped:
            flush_paragraph()
            i += 1
            continue

        # A table is a line containing a pipe directly followed by a
        # separator row; render_table returns the index after the table.
        if i + 1 < len(lines) and "|" in stripped and is_table_separator(lines[i + 1].strip()):
            flush_paragraph()
            table_html, i = render_table(lines, i)
            out.append(table_html)
            continue

        heading = re.match(r"^(#{1,6})\s+(.+)$", stripped)
        if heading:
            flush_paragraph()
            # Heading levels deeper than 3 are rendered as <h3>.
            level = min(len(heading.group(1)), 3)
            out.append(f"<h{level}>{inline_markdown(heading.group(2))}</h{level}>")
            i += 1
            continue

        if re.fullmatch(r"[-*_]{3,}", stripped):
            flush_paragraph()
            out.append("<hr>")
            i += 1
            continue

        if stripped.startswith(">"):
            flush_paragraph()
            quote = stripped.lstrip(">").strip()
            out.append(f"<blockquote>{inline_markdown(quote)}</blockquote>")
            i += 1
            continue

        if re.match(r"^[-*]\s+", stripped):
            flush_paragraph()
            items = []
            # Collect the run of adjacent bullet lines into one flat list.
            while i < len(lines) and re.match(r"^[-*]\s+", lines[i].strip()):
                items.append(re.sub(r"^[-*]\s+", "", lines[i].strip()))
                i += 1
            out.append("<ul>" + "".join(f"<li>{inline_markdown(item)}</li>" for item in items) + "</ul>")
            continue

        if re.match(r"^\d+\.\s+", stripped):
            flush_paragraph()
            items = []
            # Collect the run of adjacent numbered lines into one flat list.
            while i < len(lines) and re.match(r"^\d+\.\s+", lines[i].strip()):
                items.append(re.sub(r"^\d+\.\s+", "", lines[i].strip()))
                i += 1
            out.append("<ol>" + "".join(f"<li>{inline_markdown(item)}</li>" for item in items) + "</ol>")
            continue

        paragraph.append(stripped)
        i += 1

    # An unterminated fence runs to the end of the document; emit what was
    # collected instead of discarding it.
    if in_code:
        flush_code()
    flush_paragraph()
    return "\n".join(out)


def find_node(cli_node: str | None) -> str:
    """Locate a Node.js executable.

    Candidates are checked in priority order and the first one that exists on
    disk wins: the explicit *cli_node* argument, the ``CODEX_NODE``
    environment variable, a fixed runtime path under the user's home
    directory, and finally whatever ``node`` resolves to on ``PATH``.

    Raises:
        SystemExit: If none of the candidates exists.
    """
    bundled_node = Path.home() / ".cache/codex-runtimes/codex-primary-runtime/dependencies/node/bin/node"
    search_order = (
        cli_node,
        os.environ.get("CODEX_NODE"),
        str(bundled_node),
        shutil.which("node"),
    )
    located = next(
        (option for option in search_order if option and Path(option).exists()),
        None,
    )
    if located is None:
        raise SystemExit("未找到 Node.js。请通过 --node 指定 Node.js 路径。")
    return located


def find_node_modules(cli_node_modules: str | None) -> str | None:
    """Locate a ``node_modules`` directory to expose to Node.js.

    Candidates are checked in priority order and the first existing path is
    returned: the explicit *cli_node_modules* argument, the
    ``CODEX_NODE_MODULES`` environment variable, the first entry of
    ``NODE_REPL_NODE_MODULE_DIRS``, and a fixed runtime path under the user's
    home directory.

    Returns:
        The first existing candidate, or ``None`` when there is none.
    """
    repl_dirs = os.environ.get("NODE_REPL_NODE_MODULE_DIRS", "")
    bundled_modules = Path.home() / ".cache/codex-runtimes/codex-primary-runtime/dependencies/node/node_modules"
    search_order = (
        cli_node_modules,
        os.environ.get("CODEX_NODE_MODULES"),
        repl_dirs.split(os.pathsep)[0],
        str(bundled_modules),
    )
    existing = (option for option in search_order if option and Path(option).exists())
    return next(existing, None)


def render_pdf(html_path: Path, pdf_path: Path, node: str, node_modules: str | None) -> None:
    """Print an HTML file to an A4 PDF with Playwright's Chromium.

    A throwaway CommonJS script is written to a temporary file and run with
    the given Node.js executable. The script opens *html_path* in headless
    Chromium and writes the PDF to *pdf_path*.

    Args:
        html_path: HTML file to load.
        pdf_path: Destination PDF file.
        node: Path to the Node.js executable.
        node_modules: Optional directory prepended to ``NODE_PATH`` so that
            ``require('playwright')`` can resolve.

    Raises:
        SystemExit: If the Node.js process exits with a non-zero status.
    """
    # json.dumps yields a valid JavaScript string literal for any path;
    # Python's repr() can emit escapes (e.g. \U0001xxxx) that JavaScript
    # does not understand.
    html_literal = json.dumps(str(html_path))
    pdf_literal = json.dumps(str(pdf_path))
    script = f"""
const path = require('node:path');
const {{ pathToFileURL }} = require('node:url');
const Module = require('node:module');
Module._initPaths();
const {{ chromium }} = require('playwright');

(async () => {{
  const browser = await chromium.launch({{ headless: true }});
  const page = await browser.newPage({{ viewport: {{ width: 1120, height: 1600 }} }});
  await page.goto(pathToFileURL({html_literal}).href, {{ waitUntil: 'networkidle' }});
  await page.pdf({{
    path: {pdf_literal},
    format: 'A4',
    printBackground: true,
    margin: {{ top: '18mm', right: '15mm', bottom: '18mm', left: '15mm' }}
  }});
  await browser.close();
}})().catch((error) => {{
  console.error(error && error.stack ? error.stack : String(error));
  process.exit(1);
}});
"""
    env = os.environ.copy()
    if node_modules:
        # Prepend so the requested directory wins over any inherited NODE_PATH.
        existing = env.get("NODE_PATH", "")
        env["NODE_PATH"] = node_modules if not existing else node_modules + os.pathsep + existing
    # delete=False: the file must outlive the `with` so Node.js can read it;
    # it is removed explicitly in the `finally` below.
    with tempfile.NamedTemporaryFile("w", suffix=".cjs", delete=False, encoding="utf-8") as handle:
        handle.write(script)
        script_path = handle.name
    try:
        subprocess.run([node, script_path], check=True, env=env)
    except subprocess.CalledProcessError as exc:
        raise SystemExit(
            "PDF 渲染失败。请确认 Node.js 环境中可用 playwright,或通过 --node-modules 指定包含 playwright 的 node_modules。"
        ) from exc
    finally:
        Path(script_path).unlink(missing_ok=True)


def main() -> None:
    """Command-line entry point: render a Markdown file to an A4 PDF.

    Parses the arguments, converts the Markdown to a standalone HTML document,
    prints it to PDF through ``render_pdf`` and writes the resulting PDF path
    to stdout. The intermediate HTML is a temporary file unless
    ``--keep-html`` is given, in which case it is stored next to the PDF with
    an ``.html`` suffix.

    Raises:
        SystemExit: If the input file does not exist, or if locating Node.js
            or rendering the PDF fails.
    """
    parser = argparse.ArgumentParser(description="将 Markdown 渲染为适合评审的 A4 PDF。")
    parser.add_argument("input", type=Path, help="输入 Markdown 文件")
    parser.add_argument("output", type=Path, help="输出 PDF 文件")
    parser.add_argument("--title", default=None, help="HTML/PDF 标题")
    parser.add_argument("--keep-html", action="store_true", help="保留中间 HTML 文件")
    parser.add_argument("--node", default=None, help="Node.js 可执行文件路径")
    parser.add_argument("--node-modules", default=None, help="包含 playwright 的 node_modules 路径")
    args = parser.parse_args()

    markdown_path = args.input.resolve()
    pdf_path = args.output.resolve()
    if not markdown_path.exists():
        raise SystemExit(f"输入文件不存在:{markdown_path}")

    title = args.title or markdown_path.stem
    body = markdown_to_html(markdown_path.read_text(encoding="utf-8"))
    html_doc = f"""<!doctype html>
<html lang="zh-CN">
<head>
  <meta charset="utf-8">
  <title>{html.escape(title)}</title>
  <style>{CSS}</style>
</head>
<body>
{body}
</body>
</html>
"""
    # Create the output directory first: with --keep-html the HTML file is
    # written into it as well.
    pdf_path.parent.mkdir(parents=True, exist_ok=True)

    if args.keep_html:
        html_path = pdf_path.with_suffix(".html")
    else:
        # mkstemp returns an open descriptor; close it so it is not leaked.
        # The file is written by path below.
        fd, temp_name = tempfile.mkstemp(suffix=".html")
        os.close(fd)
        html_path = Path(temp_name)

    try:
        html_path.write_text(html_doc, encoding="utf-8")
        render_pdf(html_path, pdf_path, find_node(args.node), find_node_modules(args.node_modules))
    finally:
        # The temporary HTML is removed even when writing or rendering fails.
        if not args.keep_html:
            html_path.unlink(missing_ok=True)

    print(str(pdf_path))


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()