文件预览

document_converter.py

查看 word-latex-formula 技能包中的文件内容。

文件内容

resources/latex_convert_project/latex_convert/document_converter.py

from __future__ import annotations

import os
import json
from pathlib import Path
import platform
import shutil
import subprocess
import sys
import tempfile


class ConversionError(RuntimeError):
    """Signals that a document conversion or export step could not be completed."""


# Numeric format codes handed to Word's COM automation on Windows: the first is
# passed as ``FileFormat`` to ``SaveAs2``, the second as ``ExportFormat`` to
# ``ExportAsFixedFormat``.
WORD_FORMAT_DOCX = 16
WORD_FORMAT_PDF = 17

# Seconds allowed for each Word automation subprocess. The value can be
# overridden with the LATEX_CONVERT_WORD_TIMEOUT environment variable, which
# must parse as an integer.
WORD_TIMEOUT_SECONDS = int(os.getenv("LATEX_CONVERT_WORD_TIMEOUT", "45"))


def ensure_docx(input_path: Path, work_dir: Path, engine: str = "auto") -> tuple[Path, str]:
    """Return a DOCX version of *input_path*, converting it when necessary.

    A ``.docx`` input is returned as-is. ``.doc`` and ``.wps`` inputs are
    converted into ``work_dir / "<stem>.docx"`` with the requested engine, or,
    for ``engine="auto"``, with Word first and LibreOffice as the fallback.

    Args:
        input_path: Source document; resolved to an absolute path first.
        work_dir: Directory that receives the converted file (created on demand).
        engine: ``"auto"``, ``"word"`` or ``"libreoffice"``.

    Returns:
        ``(docx_path, label)`` where *label* is ``"already-docx"`` or the name
        of the engine that produced the file.

    Raises:
        ConversionError: For an unsupported extension, or when every engine
            failed; the message joins the per-engine failures with ``"; "``.
    """
    input_path = input_path.resolve()
    suffix = input_path.suffix.lower()
    if suffix == ".docx":
        return input_path, "already-docx"
    if suffix not in {".doc", ".wps"}:
        raise ConversionError(f"Unsupported input extension: {suffix}")
    work_dir.mkdir(parents=True, exist_ok=True)
    output_path = work_dir / f"{input_path.stem}.docx"
    errors: list[str] = []
    engines = [engine] if engine != "auto" else ["word", "libreoffice"]
    for candidate in engines:
        try:
            if candidate == "word":
                convert_with_word(input_path, output_path)
            elif candidate == "libreoffice":
                convert_with_libreoffice(input_path, output_path)
            else:
                raise ConversionError(f"Unknown conversion engine: {candidate}")
            # An engine may return without raising yet leave no usable file behind.
            if output_path.exists() and output_path.stat().st_size > 0:
                return output_path, candidate
            errors.append(f"{candidate}: output was not created")
        except subprocess.TimeoutExpired as exc:
            # Report the limit that actually expired: the LibreOffice path runs
            # with its own timeout, not WORD_TIMEOUT_SECONDS.
            errors.append(f"{candidate}: timed out after {exc.timeout}s")
        except Exception as exc:
            # Deliberately broad: any failure of one engine must not prevent
            # the next engine from being tried.
            errors.append(f"{candidate}: {exc}")
    raise ConversionError("; ".join(errors))


def convert_with_word(input_path: Path, output_path: Path) -> None:
    """Convert *input_path* to DOCX at *output_path* using Microsoft Word.

    The work is delegated to the macOS or Windows helper, chosen from
    ``platform.system()``.

    Raises:
        ConversionError: When running on any other operating system (the
            platform helpers may raise it as well).
    """
    converters = {
        "darwin": _convert_with_word_mac,
        "windows": _convert_with_word_windows,
    }
    converter = converters.get(platform.system().lower())
    if converter is None:
        raise ConversionError("Microsoft Word automation is only implemented for macOS and Windows")
    converter(input_path, output_path)


def convert_docx_to_pdf(input_path: Path, output_path: Path, engine: str = "auto") -> tuple[Path, str]:
    """Render a DOCX file to PDF and return the PDF path with the engine used.

    With ``engine="auto"`` Word is tried first and LibreOffice second; otherwise
    only the named engine is attempted.

    Args:
        input_path: DOCX file to render; resolved to an absolute path.
        output_path: Destination PDF; resolved, and its parent directory is
            created when missing.
        engine: ``"auto"``, ``"word"`` or ``"libreoffice"``.

    Returns:
        ``(output_path, engine_name)`` for the first engine that produced a
        non-empty file.

    Raises:
        ConversionError: When the input is not a ``.docx`` file, or when every
            engine failed; the message joins the per-engine failures with
            ``"; "``.
    """
    input_path = input_path.resolve()
    output_path = output_path.resolve()
    if input_path.suffix.lower() != ".docx":
        raise ConversionError(f"PDF preview requires a DOCX input, got: {input_path.suffix}")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    errors: list[str] = []
    engines = [engine] if engine != "auto" else ["word", "libreoffice"]
    for candidate in engines:
        try:
            if candidate == "word":
                _export_pdf_with_word(input_path, output_path)
            elif candidate == "libreoffice":
                _export_pdf_with_libreoffice(input_path, output_path)
            else:
                raise ConversionError(f"Unknown preview engine: {candidate}")
            # An engine may return without raising yet leave no usable file behind.
            if output_path.exists() and output_path.stat().st_size > 0:
                return output_path, candidate
            errors.append(f"{candidate}: PDF output was not created")
        except subprocess.TimeoutExpired as exc:
            # Report the limit that actually expired: the LibreOffice export
            # runs with its own timeout, not WORD_TIMEOUT_SECONDS.
            errors.append(f"{candidate}: timed out after {exc.timeout}s")
        except Exception as exc:
            # Deliberately broad: any failure of one engine must not prevent
            # the next engine from being tried.
            errors.append(f"{candidate}: {exc}")
    raise ConversionError("; ".join(errors))


def _convert_with_word_mac(input_path: Path, output_path: Path) -> None:
    """Convert a document to DOCX with Word for macOS via a scratch directory.

    The input is copied into a temporary directory (rooted at
    ``_word_mac_work_root()``), Word saves the DOCX next to that copy, and the
    result is copied to *output_path*.

    Raises:
        ConversionError: If Word is not installed under ``/Applications`` or
            did not produce the output file.
    """
    word_app = Path("/Applications/Microsoft Word.app")
    if not word_app.exists():
        raise ConversionError("Microsoft Word.app was not found in /Applications")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix="latex-convert-word-", dir=_word_mac_work_root()) as scratch:
        staged_source = Path(scratch, input_path.name)
        staged_result = Path(scratch, output_path.name)
        shutil.copy2(input_path, staged_source)
        _run_word_mac_save_as(staged_source, staged_result)
        if staged_result.exists():
            shutil.copy2(staged_result, output_path)
        else:
            raise ConversionError("Microsoft Word did not create the DOCX")

def _export_pdf_with_word(input_path: Path, output_path: Path) -> None:
    """Export *input_path* to PDF at *output_path* using Microsoft Word.

    Picks the macOS or Windows implementation from ``platform.system()``.

    Raises:
        ConversionError: When running on any other operating system (the
            platform helpers may raise it as well).
    """
    system = platform.system().lower()
    if system == "darwin":
        exporter = _export_pdf_with_word_mac
    elif system == "windows":
        exporter = _export_pdf_with_word_windows
    else:
        raise ConversionError("Microsoft Word PDF export is only implemented for macOS and Windows")
    exporter(input_path, output_path)


def _export_pdf_with_word_mac(input_path: Path, output_path: Path) -> None:
    """Export a document to PDF with Word for macOS via a scratch directory.

    The input is copied into a temporary directory (rooted at
    ``_word_mac_work_root()``), Word writes the PDF next to that copy, and the
    result is copied to *output_path*.

    Raises:
        ConversionError: If Word is not installed under ``/Applications`` or
            did not produce the PDF.
    """
    word_app = Path("/Applications/Microsoft Word.app")
    if not word_app.exists():
        raise ConversionError("Microsoft Word.app was not found in /Applications")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix="latex-convert-word-pdf-", dir=_word_mac_work_root()) as scratch:
        staged_source = Path(scratch, input_path.name)
        staged_pdf = Path(scratch, output_path.name)
        shutil.copy2(input_path, staged_source)
        _run_word_mac_export_pdf(staged_source, staged_pdf)
        if staged_pdf.exists():
            shutil.copy2(staged_pdf, output_path)
        else:
            raise ConversionError("Microsoft Word did not create the PDF")

def _word_mac_work_root() -> str | None:
    """Pick a writable staging directory, preferring Word's sandbox container.

    Staging inside Word's own container is meant to avoid macOS file-access
    prompts. Candidates are tried in order: the directory named by the
    ``LATEX_CONVERT_WORD_WORKDIR`` environment variable (when set), then the
    ``tmp`` and ``Documents`` folders of Word's container under the user's home.

    Returns:
        The first candidate that can be created and written to, as a string,
        or ``None`` when none qualifies.
    """
    container = Path.home() / "Library/Containers/com.microsoft.Word/Data"
    roots = [container / "tmp", container / "Documents"]
    override = os.environ.get("LATEX_CONVERT_WORD_WORKDIR")
    if override:
        roots.insert(0, Path(override).expanduser())
    for root in roots:
        try:
            root.mkdir(parents=True, exist_ok=True)
            # Write and remove a marker file to prove the directory is usable.
            marker = root / ".latex_convert_probe"
            marker.write_text("ok", encoding="utf-8")
            marker.unlink(missing_ok=True)
        except OSError:
            continue
        return str(root)
    return None


def _run_word_mac_save_as(input_path: Path, output_path: Path) -> None:
    """Have Word for macOS open *input_path* and save it as *output_path*.

    An AppleScript is run through ``osascript``. The script hides Word and
    silences its alerts while working, saves the active document in Word's
    default document format, closes it without saving, and restores the
    previous visibility and alert settings on both the success and error paths.

    Raises:
        ConversionError: If ``osascript`` exits with a non-zero status.
        subprocess.TimeoutExpired: If the script runs longer than
            ``WORD_TIMEOUT_SECONDS``.
    """
    source_literal = _escape_applescript(str(input_path))
    target_literal = _escape_applescript(str(output_path))
    script = f'''
set inFile to POSIX file "{source_literal}"
set outFile to POSIX file "{target_literal}"
tell application "Microsoft Word"
    set wasVisible to visible
    set oldAlerts to display alerts
    try
        set visible to false
        set display alerts to alerts none
        open inFile
        set activeDoc to active document
        save as activeDoc file name outFile file format format document default
        close activeDoc saving no
        set display alerts to oldAlerts
        set visible to wasVisible
    on error errMsg number errNum
        try
            if (count of documents) > 0 then close active document saving no
        end try
        set display alerts to oldAlerts
        set visible to wasVisible
        error errMsg number errNum
    end try
end tell
'''
    command = ["osascript", "-e", script]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=WORD_TIMEOUT_SECONDS)
    if completed.returncode:
        raise ConversionError(_short_process_error(completed))


def _run_word_mac_export_pdf(input_path: Path, output_path: Path) -> None:
    """Have Word for macOS open *input_path* and save it as a PDF at *output_path*.

    An AppleScript is run through ``osascript``. The script hides Word and
    silences its alerts while working, saves the active document in PDF
    format, closes it without saving, and restores the previous visibility and
    alert settings on both the success and error paths.

    Raises:
        ConversionError: If ``osascript`` exits with a non-zero status.
        subprocess.TimeoutExpired: If the script runs longer than
            ``WORD_TIMEOUT_SECONDS``.
    """
    source_literal = _escape_applescript(str(input_path))
    target_literal = _escape_applescript(str(output_path))
    script = f'''
set inFile to POSIX file "{source_literal}"
set outFile to POSIX file "{target_literal}"
tell application "Microsoft Word"
    set wasVisible to visible
    set oldAlerts to display alerts
    try
        set visible to false
        set display alerts to alerts none
        open inFile
        set activeDoc to active document
        save as activeDoc file name outFile file format format PDF
        close activeDoc saving no
        set display alerts to oldAlerts
        set visible to wasVisible
    on error errMsg number errNum
        try
            if (count of documents) > 0 then close active document saving no
        end try
        set display alerts to oldAlerts
        set visible to wasVisible
        error errMsg number errNum
    end try
end tell
'''
    command = ["osascript", "-e", script]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=WORD_TIMEOUT_SECONDS)
    if completed.returncode:
        raise ConversionError(_short_process_error(completed))


def _convert_with_word_windows(input_path: Path, output_path: Path) -> None:
    """Convert a document to DOCX through Word's COM automation on Windows.

    A small script is generated and run in a child Python interpreter (which
    must have ``pythoncom``/``win32com`` available). The child starts its own
    Word instance, opens *input_path* read-only without dialogs, saves it with
    ``SaveAs2`` using ``WORD_FORMAT_DOCX``, then closes the document and quits
    Word.

    Raises:
        ConversionError: If the child process exits with a non-zero status.
        subprocess.TimeoutExpired: If the child runs longer than
            ``WORD_TIMEOUT_SECONDS``.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ascii() yields an ASCII-only *Python* string literal. JSON escaping is not
    # equivalent: it writes non-BMP characters as surrogate-pair escapes, which
    # Python would read back as two lone surrogates instead of one character.
    input_literal = ascii(str(input_path))
    output_literal = ascii(str(output_path))
    # The nested ``finally`` blocks in the child make sure Word is asked to quit
    # and COM is uninitialized even when closing the document fails.
    # NOTE(review): AutomationSecurity = 3 presumably force-disables macros;
    # confirm against the Office MsoAutomationSecurity enumeration.
    code = f"""
import pythoncom
import win32com.client

input_path = {input_literal}
output_path = {output_literal}
word = None
doc = None
pythoncom.CoInitialize()
try:
    word = win32com.client.DispatchEx('Word.Application')
    word.Visible = False
    word.DisplayAlerts = 0
    try:
        word.AutomationSecurity = 3
    except Exception:
        pass
    doc = word.Documents.Open(
        FileName=input_path,
        ConfirmConversions=False,
        ReadOnly=True,
        AddToRecentFiles=False,
        Visible=False,
        NoEncodingDialog=True,
    )
    doc.SaveAs2(FileName=output_path, FileFormat={WORD_FORMAT_DOCX}, AddToRecentFiles=False)
finally:
    try:
        if doc is not None:
            doc.Close(False)
    finally:
        try:
            if word is not None:
                word.Quit()
        finally:
            pythoncom.CoUninitialize()
"""
    result = subprocess.run(
        [sys.executable, "-c", code],
        text=True,
        capture_output=True,
        timeout=WORD_TIMEOUT_SECONDS,
    )
    if result.returncode != 0:
        raise ConversionError(_short_process_error(result))


def _export_pdf_with_word_windows(input_path: Path, output_path: Path) -> None:
    """Export a document to PDF through Word's COM automation on Windows.

    A small script is generated and run in a child Python interpreter (which
    must have ``pythoncom``/``win32com`` available). The child starts its own
    Word instance, opens *input_path* read-only without dialogs, calls
    ``ExportAsFixedFormat`` with ``WORD_FORMAT_PDF``, then closes the document
    and quits Word.

    Raises:
        ConversionError: If the child process exits with a non-zero status.
        subprocess.TimeoutExpired: If the child runs longer than
            ``WORD_TIMEOUT_SECONDS``.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ascii() yields an ASCII-only *Python* string literal. JSON escaping is not
    # equivalent: it writes non-BMP characters as surrogate-pair escapes, which
    # Python would read back as two lone surrogates instead of one character.
    input_literal = ascii(str(input_path))
    output_literal = ascii(str(output_path))
    # The nested ``finally`` blocks in the child make sure Word is asked to quit
    # and COM is uninitialized even when closing the document fails.
    # NOTE(review): AutomationSecurity = 3 presumably force-disables macros;
    # confirm against the Office MsoAutomationSecurity enumeration.
    code = f"""
import pythoncom
import win32com.client

input_path = {input_literal}
output_path = {output_literal}
word = None
doc = None
pythoncom.CoInitialize()
try:
    word = win32com.client.DispatchEx('Word.Application')
    word.Visible = False
    word.DisplayAlerts = 0
    try:
        word.AutomationSecurity = 3
    except Exception:
        pass
    doc = word.Documents.Open(
        FileName=input_path,
        ConfirmConversions=False,
        ReadOnly=True,
        AddToRecentFiles=False,
        Visible=False,
        NoEncodingDialog=True,
    )
    doc.ExportAsFixedFormat(OutputFileName=output_path, ExportFormat={WORD_FORMAT_PDF}, OpenAfterExport=False)
finally:
    try:
        if doc is not None:
            doc.Close(False)
    finally:
        try:
            if word is not None:
                word.Quit()
        finally:
            pythoncom.CoUninitialize()
"""
    result = subprocess.run(
        [sys.executable, "-c", code],
        text=True,
        capture_output=True,
        timeout=WORD_TIMEOUT_SECONDS,
    )
    if result.returncode != 0:
        raise ConversionError(_short_process_error(result))


def convert_with_libreoffice(input_path: Path, output_path: Path) -> None:
    """Convert *input_path* to DOCX at *output_path* with headless LibreOffice.

    ``soffice`` (or ``libreoffice``) is looked up on ``PATH`` and run with a
    throwaway user profile and ``HOME`` inside a temporary directory. The
    command writes ``<input stem>.docx`` into ``output_path.parent``; that file
    is renamed to *output_path* when the two names differ.

    Raises:
        ConversionError: If LibreOffice is not installed, exits with a non-zero
            status, or leaves no file at *output_path*.
        subprocess.TimeoutExpired: If the conversion takes longer than 180 s.
    """
    soffice = shutil.which("soffice") or shutil.which("libreoffice")
    if not soffice:
        raise ConversionError("LibreOffice/soffice was not found")
    with tempfile.TemporaryDirectory(prefix="latex-convert-lo-") as tmp:
        tmp_path = Path(tmp)
        # One well-formed file URI for both the environment and the command
        # line; a hand-built "file://" + path is malformed for Windows paths.
        profile_uri = (tmp_path / "profile").as_uri()
        env = os.environ.copy()
        env["HOME"] = str(tmp_path / "home")
        env["UserInstallation"] = profile_uri
        result = subprocess.run(
            [
                soffice,
                "--headless",
                f"-env:UserInstallation={profile_uri}",
                "--convert-to",
                "docx",
                "--outdir",
                str(output_path.parent),
                str(input_path),
            ],
            text=True,
            capture_output=True,
            timeout=180,
            env=env,
        )
    if result.returncode != 0:
        raise ConversionError(_short_process_error(result))
    # The output name is derived from the input stem, not from output_path.
    generated = output_path.parent / f"{input_path.stem}.docx"
    if generated != output_path and generated.exists():
        generated.replace(output_path)
    if not output_path.exists():
        # Exit status was 0 but nothing was produced: always say so, and append
        # whatever the process printed (tail only) when there is any.
        message = "LibreOffice did not create a DOCX"
        detail = (result.stderr or result.stdout or "").strip()
        raise ConversionError(f"{message}: {detail[-1000:]}" if detail else message)


def _export_pdf_with_libreoffice(input_path: Path, output_path: Path) -> None:
    """Export *input_path* to PDF at *output_path* with headless LibreOffice.

    ``soffice`` (or ``libreoffice``) is looked up on ``PATH`` and run with a
    throwaway user profile and ``HOME`` inside a temporary directory. The
    command writes ``<input stem>.pdf`` into ``output_path.parent``; that file
    is renamed to *output_path* when the two names differ.

    Raises:
        ConversionError: If LibreOffice is not installed, exits with a non-zero
            status, or leaves no file at *output_path*.
        subprocess.TimeoutExpired: If the export takes longer than 240 s.
    """
    soffice = shutil.which("soffice") or shutil.which("libreoffice")
    if not soffice:
        raise ConversionError("LibreOffice/soffice was not found")
    with tempfile.TemporaryDirectory(prefix="latex-convert-lo-pdf-") as tmp:
        tmp_path = Path(tmp)
        # One well-formed file URI for both the environment and the command
        # line; a hand-built "file://" + path is malformed for Windows paths.
        profile_uri = (tmp_path / "profile").as_uri()
        env = os.environ.copy()
        env["HOME"] = str(tmp_path / "home")
        env["UserInstallation"] = profile_uri
        result = subprocess.run(
            [
                soffice,
                "--headless",
                f"-env:UserInstallation={profile_uri}",
                "--convert-to",
                "pdf",
                "--outdir",
                str(output_path.parent),
                str(input_path),
            ],
            text=True,
            capture_output=True,
            timeout=240,
            env=env,
        )
    if result.returncode != 0:
        raise ConversionError(_short_process_error(result))
    # The output name is derived from the input stem, not from output_path.
    generated = output_path.parent / f"{input_path.stem}.pdf"
    if generated != output_path and generated.exists():
        generated.replace(output_path)
    if not output_path.exists():
        # Exit status was 0 but nothing was produced: always say so, and append
        # whatever the process printed (tail only) when there is any.
        message = "LibreOffice did not create a PDF"
        detail = (result.stderr or result.stdout or "").strip()
        raise ConversionError(f"{message}: {detail[-1000:]}" if detail else message)


def _escape_applescript(value: str) -> str:
    """Backslash-escape backslashes and double quotes in *value*.

    The result is meant to be placed between double quotes in an AppleScript
    string literal.
    """
    escapes = {"\\": "\\\\", '"': '\\"'}
    return "".join(escapes.get(char, char) for char in value)


def _short_process_error(result: subprocess.CompletedProcess[str]) -> str:
    """Summarize a finished process for use in an error message.

    Takes stderr when it is non-empty, otherwise stdout, strips surrounding
    whitespace and keeps the last 1000 characters. When nothing remains, a
    message carrying the exit status is returned instead.
    """
    captured = result.stderr or result.stdout or ""
    trimmed = captured.strip()
    if not trimmed:
        return f"process exited with {result.returncode}"
    return trimmed[-1000:]