文件预览

extract_page_catalog.py

查看 123 技能包中的文件内容。

文件内容

tools/extract_page_catalog.py

from __future__ import annotations

import argparse
import json
from pathlib import Path

from pptx import Presentation


def guess_page_type(layout_name: str, texts: list[str]) -> str:
    """Guess a coarse page category from a layout name and the slide's texts.

    Matching is case-insensitive substring matching; the first rule that hits
    wins, so the order of the checks below is significant.

    Args:
        layout_name: Name of the slide layout the slide is based on.
        texts: Text snippets collected from the slide's shapes.

    Returns:
        One of ``"cover"``, ``"toc"``, ``"sub_section"``, ``"section"``,
        ``"end"``, ``"chart_or_data"``, ``"content"`` or ``"generic"``.
    """
    name = layout_name.lower()
    merged = " ".join(texts).lower()
    chart_keywords = ("饼状图", "柱状图", "折线图", "面积图", "图表", "chart", "table", "地图")

    if "cover" in name or "封面" in name:
        return "cover"
    # NOTE(review): "content" also matches names like "content page"; confirm
    # the templates reserve that word for the table of contents.
    if "目录" in name or "content" in name:
        return "toc"
    # Sub-section must be tested before section: "副章节页" contains "章节页"
    # and "sub section page" contains "section page", so the broader section
    # rule would otherwise shadow this one.
    if "副章节" in name or "sub section" in name:
        return "sub_section"
    if "章节页" in name or "section page" in name:
        return "section"
    # The end page is the only rule that mixes layout name and slide text.
    if "end page" in name or "封底" in name or "thanks" in merged:
        return "end"
    # Chart/data pages are detected from slide text only, not the layout name.
    if any(keyword in merged for keyword in chart_keywords):
        return "chart_or_data"
    if "标准内容页" in name or "standard page" in name:
        return "content"
    return "generic"


def _shape_type_label(shape) -> str:
    """Return ``str(shape.shape_type)``, or ``""`` when it is unavailable."""
    try:
        return str(shape.shape_type)
    except (AttributeError, NotImplementedError):
        # python-pptx can raise NotImplementedError from ``shape_type`` for
        # shapes it cannot classify; one such shape should not abort the
        # whole catalog, so it simply counts as neither picture nor group.
        return ""


def extract_catalog(template_path: Path, output_path: Path) -> dict:
    """Build a per-slide catalog of a .pptx template and write it as JSON.

    For every slide the catalog records the layout name, a guessed page type,
    up to five sample texts, the text-capable placeholders (sorted by
    placeholder idx) and simple shape counts.

    Args:
        template_path: Path of the .pptx file to inspect.
        output_path: Path of the JSON file to write; missing parent
            directories are created.

    Returns:
        The catalog dict that was serialized, with the keys
        ``template_file``, ``slide_count``, ``layout_count`` and ``pages``.
    """
    prs = Presentation(str(template_path))
    pages = []
    for i, slide in enumerate(prs.slides):
        texts = []
        placeholders = []
        shape_stats = {
            "total": len(slide.shapes),
            "placeholder": 0,
            "picture": 0,
            "table": 0,
            "chart": 0,
            "group": 0,
            "textbox_or_text": 0,
        }

        for shape in slide.shapes:
            st = _shape_type_label(shape)
            is_placeholder = bool(getattr(shape, "is_placeholder", False))
            has_text_frame = bool(getattr(shape, "has_text_frame", False))
            has_table = bool(getattr(shape, "has_table", False))
            has_chart = bool(getattr(shape, "has_chart", False))

            # The categories below are not mutually exclusive: a single shape
            # may increment several counters.
            if is_placeholder:
                shape_stats["placeholder"] += 1
                # Only placeholders that can hold text go into the schema.
                if has_text_frame:
                    placeholders.append(
                        {
                            "idx": shape.placeholder_format.idx,
                            "name": getattr(shape, "name", ""),
                        }
                    )
            if "PICTURE" in st:
                shape_stats["picture"] += 1
            if has_table:
                shape_stats["table"] += 1
            if has_chart:
                shape_stats["chart"] += 1
            if "GROUP" in st:
                shape_stats["group"] += 1
            if has_text_frame:
                shape_stats["textbox_or_text"] += 1
                # Flatten multi-line text onto one line and cap its length so
                # the catalog stays compact.
                txt = shape.text.strip().replace("\n", " / ")
                if txt:
                    texts.append(txt[:140])

        placeholders.sort(key=lambda item: item["idx"])
        layout_name = slide.slide_layout.name
        pages.append(
            {
                "index": i,  # zero-based position within the presentation
                "layout_name": layout_name,
                "page_type_guess": guess_page_type(layout_name, texts),
                "sample_texts": texts[:5],
                "placeholder_schema": placeholders,
                "shape_stats": shape_stats,
            }
        )

    result = {
        "template_file": template_path.name,
        "slide_count": len(prs.slides),
        "layout_count": len(prs.slide_layouts),
        "pages": pages,
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable in the file.
    output_path.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
    return result


def parse_args() -> argparse.Namespace:
    """Parse the command line and return the namespace.

    Both ``--template`` and ``--output`` are mandatory string options.
    """
    cli = argparse.ArgumentParser(description="Extract page catalog from PPT template.")
    required_options = (
        ("--template", "Path to template .pptx"),
        ("--output", "Path to output page_catalog.json"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    return cli.parse_args()


def main() -> None:
    """Command-line entry point: build the catalog and print a short summary.

    Raises:
        FileNotFoundError: If the ``--template`` path does not exist.
    """
    options = parse_args()
    source, destination = Path(options.template), Path(options.output)
    if not source.exists():
        raise FileNotFoundError(f"Template not found: {source}")

    catalog = extract_catalog(source, destination)
    print(f"Catalog generated: {destination.resolve()}")
    print(f"Slides archived: {catalog['slide_count']}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()