文件预览

docx_to_md.py

查看 中国专利.Skill 技能包中的文件内容。

文件内容

tools/docx_to_md.py

#!/usr/bin/env python3
"""
将 Word(.docx)转为 Markdown,并把内嵌图片抽取到磁盘,便于 Step 2 扫描与 Agent Read。

依赖 mammoth(见仓库根目录 requirements.txt)。

用法:
  python docx_to_md.py --input design.docx --output outputs/case/design.md
  python docx_to_md.py -i a.docx -o b/out.md --media-dir b/my_images

默认图片目录:与输出 .md 同级的「{md 文件名}_media/」,Markdown 中为相对路径引用。
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path


def _require_mammoth():
    try:
        import mammoth
    except ImportError:
        print(
            "缺少依赖 mammoth。请在技能根目录执行: pip install -r requirements.txt",
            file=sys.stderr,
        )
        sys.exit(1)
    return mammoth


def _extension_for_content_type(content_type: str) -> str:
    subtype = (content_type or "").split("/")[-1].lower().strip()
    if not subtype or subtype == "octet-stream":
        return "bin"
    if subtype == "jpeg":
        return "jpg"
    return subtype[:12]


def _run(
    input_docx: Path,
    output_md: Path,
    media_dir: Path | None,
) -> int:
    mammoth = _require_mammoth()

    if not input_docx.is_file():
        print(f"输入文件不存在: {input_docx}", file=sys.stderr)
        return 2
    if input_docx.suffix.lower() != ".docx":
        print("警告: 期望 .docx(Office Open XML);旧版 .doc 不支持。", file=sys.stderr)

    output_md = output_md.resolve()
    output_md.parent.mkdir(parents=True, exist_ok=True)

    if media_dir is None:
        media_dir = output_md.parent / f"{output_md.stem}_media"
    else:
        media_dir = media_dir.resolve()
    media_dir.mkdir(parents=True, exist_ok=True)

    counter = [0]

    def save_image(image):
        counter[0] += 1
        ext = _extension_for_content_type(getattr(image, "content_type", "") or "")
        filename = f"img_{counter[0]:04d}.{ext}"
        out_path = media_dir / filename
        try:
            with image.open() as f:
                out_path.write_bytes(f.read())
        except Exception as e:
            print(f"警告: 抽取图片失败 ({filename}): {e}", file=sys.stderr)
            return {"src": "", "alt": ""}

        try:
            rel = out_path.relative_to(output_md.parent).as_posix()
        except ValueError:
            rel = out_path.as_posix()
        alt = getattr(image, "alt_text", None) or ""
        return {"src": rel, "alt": alt}

    image_converter = mammoth.images.img_element(save_image)

    with input_docx.open("rb") as docx_file:
        result = mammoth.convert_to_markdown(docx_file, convert_image=image_converter)

    for msg in result.messages:
        text = getattr(msg, "message", str(msg))
        typ = getattr(msg, "type", "message")
        print(f"mammoth [{typ}]: {text}", file=sys.stderr)

    text = (result.value or "").strip()
    header = (
        f"<!-- 由 docx_to_md.py 自 {input_docx.name} 转换,勿手改本行元信息 -->\n\n"
    )
    output_md.write_text(header + text + ("\n" if text else ""), encoding="utf-8")

    print(f"已写入: {output_md}")
    print(f"图片目录: {media_dir}")
    return 0


def main() -> int:
    p = argparse.ArgumentParser(description="Word (.docx) → Markdown + 抽取图片")
    p.add_argument("-i", "--input", required=True, type=Path, help="输入 .docx 路径")
    p.add_argument("-o", "--output", required=True, type=Path, help="输出 .md 路径")
    p.add_argument(
        "--media-dir",
        type=Path,
        default=None,
        help="图片输出目录(默认:与 .md 同级的 {md 主名}_media)",
    )
    args = p.parse_args()
    return _run(args.input, args.output, args.media_dir)


if __name__ == "__main__":
    raise SystemExit(main())