文件预览

validate_project_corpus.py

查看 Paper Contribution Helper 技能包中的文件内容。

文件内容

scripts/engine/validate_project_corpus.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
from collections import Counter
from pathlib import Path
from typing import Any

from common import read_json, write_json


# Location of the per-paper material folders, relative to the run directory.
MATERIAL_ROOT = Path("materials") / "_by_paper"
# File name of the per-paper status JSON. inspect_material() reports an issue
# when it is absent and reads the "reviews_missing" / "replies_missing" flags
# from it; validate() reads "source_kind" from it.
REQUIRED_OPTIONAL_STATUS = "source_status.json"
# A paper counts as having reviews when at least one of these files exists
# in its material folder.
RECOGNIZED_REVIEW_FILES = [
    "official_reviews.json",
    "official_reviews.md",
    "meta_review.json",
    "meta_review.md",
    "decision.json",
    "decision.md",
]
# A paper counts as having replies when at least one of these files exists
# in its material folder.
# NOTE(review): "public_comments" has only a .json entry while every other
# name is listed as a .json/.md pair -- confirm this is intentional.
RECOGNIZED_REPLY_FILES = [
    "author_replies.json",
    "author_replies.md",
    "reviewer_followups.json",
    "reviewer_followups.md",
    "public_comments.json",
]


def has_any(folder: Path, names: list[str]) -> bool:
    """Return True when at least one of *names* exists directly inside *folder*.

    Args:
        folder: Directory whose entries are checked.
        names: Candidate entry names, each joined onto *folder*.

    Returns:
        True as soon as one candidate path exists; False when none does
        (including when *names* is empty).
    """
    for candidate in names:
        if (folder / candidate).exists():
            return True
    return False


def inspect_material(folder: Path, run_dir: Path) -> dict[str, Any]:
    """Inspect one per-paper material folder and summarize what it contains.

    Args:
        folder: Material directory of a single paper; its name becomes the
            record's ``paper_key``.
        run_dir: Run directory used to express ``material_dir`` as a
            relative POSIX path.

    Returns:
        A record with the paper key, the relative material directory,
        presence flags for the PDF, submission, status file, reviews and
        replies, and a list of issue messages (empty when nothing is wrong).

    Raises:
        ValueError: If *folder* is not located under *run_dir*
            (raised by ``Path.relative_to``).
    """
    status_path = folder / REQUIRED_OPTIONAL_STATUS
    # The status file is read before its presence is checked, and ``.get`` is
    # called on the result below.
    # NOTE(review): presumably read_json returns a mapping even when the file
    # is missing -- confirm against common.read_json.
    status = read_json(status_path)

    pdf_present = (folder / "paper.pdf").exists()
    submission_present = (folder / "submission.json").exists()
    review_present = has_any(folder, RECOGNIZED_REVIEW_FILES)
    reply_present = has_any(folder, RECOGNIZED_REPLY_FILES)
    status_present = status_path.exists()

    # Each entry pairs a failure condition with the message reported for it.
    # Missing reviews/replies are only acceptable when the status file
    # explicitly acknowledges the gap.
    checks = [
        (not pdf_present, "missing paper.pdf"),
        (not submission_present, "missing submission.json"),
        (not status_present, "missing source_status.json"),
        (
            not review_present and not status.get("reviews_missing"),
            "reviews absent but source_status.reviews_missing is not true",
        ),
        (
            not reply_present and not status.get("replies_missing"),
            "replies absent but source_status.replies_missing is not true",
        ),
    ]
    issues: list[str] = [message for failed, message in checks if failed]

    record: dict[str, Any] = {
        "paper_key": folder.name,
        "material_dir": folder.relative_to(run_dir).as_posix(),
        "pdf_present": pdf_present,
        "submission_present": submission_present,
        "source_status_present": status_present,
        "review_present": review_present,
        "reply_present": reply_present,
        "issues": issues,
    }
    return record


def validate(run_dir: Path) -> dict[str, Any]:
    """Validate every per-paper material folder under *run_dir*.

    The sub-directories of ``run_dir / MATERIAL_ROOT`` are visited in sorted
    order. Each one is inspected with ``inspect_material`` and its
    ``source_kind`` (read from the status file, ``"unknown"`` when unset or
    empty) is tallied.

    Args:
        run_dir: Root directory of the corpus run.

    Returns:
        A report dict with the schema version, overall ``valid`` flag, the
        run directory as a POSIX string, paper and issue counts, presence
        counters, the ``source_kind`` histogram, and the per-paper records.
        ``valid`` is true only when at least one paper folder was found and
        no record carries an issue.
    """
    materials_root = run_dir / MATERIAL_ROOT
    # List the directory exactly once so the records and the source-kind
    # tally always describe the same set of folders. is_dir() (rather than
    # exists()) keeps a stray regular file at this path from making
    # iterdir() raise NotADirectoryError; it is treated as "no papers".
    folders = (
        sorted(path for path in materials_root.iterdir() if path.is_dir())
        if materials_root.is_dir()
        else []
    )

    records: list[dict[str, Any]] = []
    source_kinds: Counter[str] = Counter()
    for folder in folders:
        records.append(inspect_material(folder, run_dir))
        status = read_json(folder / REQUIRED_OPTIONAL_STATUS)
        source_kinds[str(status.get("source_kind") or "unknown")] += 1

    issue_count = sum(len(record["issues"]) for record in records)
    counters = {
        "pdf_present": sum(1 for record in records if record["pdf_present"]),
        "reviews_present": sum(1 for record in records if record["review_present"]),
        "replies_present": sum(1 for record in records if record["reply_present"]),
        "source_status_present": sum(1 for record in records if record["source_status_present"]),
    }
    return {
        "schema_version": "1.0",
        # An empty corpus is never considered valid.
        "valid": bool(records) and issue_count == 0,
        "run_dir": run_dir.as_posix(),
        "paper_count": len(records),
        "issue_count": issue_count,
        "counts": counters,
        "source_kinds": dict(source_kinds),
        "records": records,
    }


def write_markdown(report: dict[str, Any], path: Path) -> None:
    """Render *report* as a Markdown summary and write it to *path*.

    The document consists of a title, a bullet list with the run-level
    fields, and a table with one row per entry in ``report["records"]``.
    Missing parent directories of *path* are created. The file is written
    as UTF-8 without a trailing newline.

    Args:
        report: Report dict providing ``run_dir``, ``valid``,
            ``paper_count``, ``issue_count``, ``source_kinds`` and
            ``records``.
        path: Destination file.
    """
    summary = [
        "# Project Corpus Validation",
        "",
        f"- Run dir: `{report['run_dir']}`",
        f"- Valid: `{report['valid']}`",
        f"- Paper count: {report['paper_count']}",
        f"- Issue count: {report['issue_count']}",
        f"- Source kinds: {report['source_kinds']}",
        "",
    ]
    table = [
        "| Paper key | PDF | Reviews | Replies | Source status | Issues |",
        "|---|---|---|---|---|---|",
    ]
    for entry in report["records"]:
        cells = [
            f"`{entry['paper_key']}`",
            f"{entry['pdf_present']}",
            f"{entry['review_present']}",
            f"{entry['reply_present']}",
            f"{entry['source_status_present']}",
            # A falsy issue list renders as an empty cell.
            "; ".join(entry["issues"] or []),
        ]
        table.append("| " + " | ".join(cells) + " |")

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(summary + table), encoding="utf-8")


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the corpus validator.

    Returns:
        Namespace with ``run_dir`` (required ``Path``) and the optional
        ``out_json`` / ``out_md`` paths, which default to ``None``.
    """
    parser = argparse.ArgumentParser(
        description="Validate a project-level corpus run before skill initialization/evolution.",
    )
    parser.add_argument("--run-dir", type=Path, required=True)
    # Both report destinations are optional paths with no default.
    for flag in ("--out-json", "--out-md"):
        parser.add_argument(flag, type=Path, default=None)
    return parser.parse_args()


def main() -> None:
    """Validate the run directory, write both reports and print a summary.

    The JSON and Markdown reports go to ``--out-json`` / ``--out-md`` when
    given, otherwise to ``<run_dir>/validation/project_corpus_validation.*``.
    A short JSON summary is printed to stdout.

    Raises:
        SystemExit: With a message when the run directory does not exist,
            or with exit code 2 when the report is not valid.
    """
    args = parse_args()
    run_dir = args.run_dir.resolve()
    if not run_dir.exists():
        raise SystemExit(f"Run directory does not exist: {run_dir}")

    report = validate(run_dir)

    validation_dir = run_dir / "validation"
    out_json = args.out_json or (validation_dir / "project_corpus_validation.json")
    out_md = args.out_md or (validation_dir / "project_corpus_validation.md")
    write_json(out_json, report)
    write_markdown(report, out_md)

    summary = {key: report[key] for key in ("valid", "paper_count", "issue_count")}
    summary["out_json"] = str(out_json)
    summary["out_md"] = str(out_md)
    print(json.dumps(summary, ensure_ascii=False, indent=2))

    if report["valid"]:
        return
    # The reports are written even for an invalid corpus; the non-zero exit
    # code is raised only afterwards.
    raise SystemExit(2)


# Run the validator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()