文件预览

md_to_docx.py

查看 Finance OCR Pro 技能包中的文件内容。

文件内容

scripts/md_to_docx.py

"""
Markdown to DOCX Converter

Converts Markdown files (with HTML tables and LaTeX formulas)
into professionally styled A4 DOCX documents with multilingual support.

Handles:
    - Headings (# through ######)
    - Paragraphs with inline formatting (bold, italic, code)
    - Inline LaTeX ($...$) and block LaTeX ($$...$$)
    - HTML tables with colspan/rowspan and LaTeX in cells
    - Ordered and unordered lists
    - Fenced code blocks (```language ... ```)
    - Embedded images (when source file exists)

Requirements:
    pip install python-docx beautifulsoup4 lxml latex2mathml

Usage:
    from pathlib import Path
    markdown_to_docx(Path("input.md"), Path("output.docx"))
"""

from __future__ import annotations

import re
from pathlib import Path

from docx import Document
from docx.shared import Pt, Cm, RGBColor, Twips
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.enum.table import WD_ROW_HEIGHT_RULE
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from bs4 import BeautifulSoup, NavigableString, Tag
import latex2mathml.converter
from lxml import etree


# ═══════════════════════════════════════════════════════════════════════════════
# Configuration
# ═══════════════════════════════════════════════════════════════════════════════

# XML namespace URIs for Office Math Markup (OMML) and WordprocessingML.
OMML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Clark-notation prefixes ("{uri}") used to build qualified lxml tag and
# attribute names, e.g. ``M + "oMath"``.
M = "{" + OMML_NS + "}"
W = "{" + WORD_NS + "}"

# Font families: FONT_BODY is the default for runs and styles, FONT_CODE is
# used for inline code segments.
FONT_BODY = "Arial"
FONT_MATH = "Cambria Math"
FONT_CODE = "Consolas"

# Font sizes for body text, table cells, and inline code.
FONT_SIZE_BODY = Pt(11)
FONT_SIZE_TABLE = Pt(9.5)
FONT_SIZE_CODE = Pt(10)

# Text colours as RGB triples.
COLOR_HEADING = RGBColor(0x1F, 0x49, 0x7D)
COLOR_BODY = RGBColor(0x1A, 0x1A, 0x1A)
COLOR_MUTED = RGBColor(0x88, 0x88, 0x88)

# Table appearance: hex fill/border colours (no leading '#') and the cell
# padding written to w:tcMar with w:type="dxa".
TABLE_HEADER_BG = "D6E4F0"
TABLE_ALT_ROW_BG = "F5F8FC"
TABLE_BORDER_COLOR = "A6A6A6"
TABLE_CELL_MARGIN_DXA = 72

# Code block appearance (hex colours, no leading '#').
CODE_BG_COLOR = "F2F2F2"
CODE_BORDER_COLOR = "CCCCCC"

# Page margin applied to all four sides by _setup_page.
MARGIN_NARROW = Cm(1.8)

# Heading level -> font size in points (wrapped in Pt() by _setup_styles).
HEADING_SIZES = {1: 20, 2: 16, 3: 14, 4: 12, 5: 11, 6: 11}


# ═══════════════════════════════════════════════════════════════════════════════
# Font Utilities
# ═══════════════════════════════════════════════════════════════════════════════

def _set_rfonts(rpr_element, font_name: str):
    """Point every ``w:rFonts`` slot of a run-properties element at one font.

    The ``w:ascii``, ``w:hAnsi``, ``w:eastAsia`` and ``w:cs`` attributes are
    all set to ``font_name``. When ``rpr_element`` has no ``w:rFonts`` child
    yet, one is created and inserted as its first child.
    """
    fonts_el = rpr_element.find(qn('w:rFonts'))
    if fonts_el is None:
        fonts_el = OxmlElement('w:rFonts')
        rpr_element.insert(0, fonts_el)

    font_slots = ('w:ascii', 'w:hAnsi', 'w:eastAsia', 'w:cs')
    for slot in font_slots:
        fonts_el.set(qn(slot), font_name)


def set_run_fonts(run, font_name: str = FONT_BODY, size=FONT_SIZE_BODY,
                  bold: bool = False, italic: bool = False,
                  color: RGBColor | None = None):
    """Apply typeface, size, weight, slant and (optionally) colour to a run.

    After the regular python-docx font properties are set, the run's
    ``w:rFonts`` element is rewritten via ``_set_rfonts`` so that all four
    font slots name ``font_name``. ``color`` is applied only when truthy.
    """
    font = run.font
    font.name = font_name
    font.size = size
    run.bold = bold
    run.italic = italic
    if color:
        font.color.rgb = color

    run_properties = run._r.get_or_add_rPr()
    _set_rfonts(run_properties, font_name)


# ═══════════════════════════════════════════════════════════════════════════════
# Preprocessing
# ═══════════════════════════════════════════════════════════════════════════════

# A line consisting solely of a ``Page_Order_<digits>`` marker (case-insensitive,
# matched per line). Not referenced in the visible part of this module.
_PAGE_ORDER_RE = re.compile(r'^Page_Order_\d+$', re.IGNORECASE | re.MULTILINE)
# ``Page Number <digits> :`` labels, plus one directly following newline if present.
_PAGE_NUMBER_RE = re.compile(r'Page\s+Number\s+\d+\s*:\s*\n?', re.IGNORECASE)
# A ``---`` separator on its own line, including the newlines around it.
_PAGE_SEP_RE = re.compile(r'\n\s*---\s*\n')
# Runs of three or more consecutive newlines.
_MULTI_BLANK_RE = re.compile(r'\n{3,}')


def preprocess_content(content: str) -> str:
    """Strip OCR page labels and ``---`` separators, then tidy blank lines.

    The substitutions run in a fixed order: page-number labels are deleted,
    separator lines become a blank line, and finally any run of three or more
    newlines is reduced to two. Leading and trailing whitespace is trimmed.

    ``Page_Order_N`` sentinels are left in place so the document builder can
    turn them into page breaks.
    """
    substitutions = (
        (_PAGE_NUMBER_RE, ''),
        (_PAGE_SEP_RE, '\n\n'),
        (_MULTI_BLANK_RE, '\n\n'),
    )
    cleaned = content
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()


# ═══════════════════════════════════════════════════════════════════════════════
# MathML to OMML Conversion
# ═══════════════════════════════════════════════════════════════════════════════

# Operator characters that _nary_base_operator recognises as the base of an
# n-ary construct (sums, products, integrals, big intersections/unions).
_NARY_OPERATORS = frozenset({"∑", "∏", "∐", "∫", "∬", "∭", "∮", "⋂", "⋃"})
# Named functions; _append_identifier_run never italicises an identifier
# found in this set.
_COMMON_FUNCTIONS = frozenset({
    "arccos", "arcsin", "arctan", "arg", "cos", "cosh", "cot", "coth",
    "csc", "deg", "det", "dim", "exp", "gcd", "hom", "inf", "ker",
    "lg", "lim", "liminf", "limsup", "ln", "log", "max", "min", "mod",
    "Pr", "sec", "sin", "sinh", "sup", "tan", "tanh",
})


def _omml_run(text: str, *, italic: bool = False) -> etree._Element:
    """Build a standalone ``m:r`` element whose ``m:t`` child holds ``text``.

    With ``italic=True`` an ``m:rPr/m:sty`` child with value ``"i"`` precedes
    the text. The text node always carries ``xml:space="preserve"``; a falsy
    ``text`` is stored as the empty string.
    """
    run_el = etree.Element(M + "r")
    if italic:
        properties = etree.SubElement(run_el, M + "rPr")
        etree.SubElement(properties, M + "sty").set(M + "val", "i")

    text_el = etree.SubElement(run_el, M + "t")
    text_el.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
    text_el.text = text or ""
    return run_el


def _tag_local(element) -> str:
    """Return an element's tag without its ``{namespace}`` prefix.

    Tags that contain no ``}`` are returned unchanged.
    """
    pieces = element.tag.split('}')
    if len(pieces) > 1:
        return pieces[1]
    return element.tag


def _simple_text(element) -> str:
    """Return the stripped text of a childless element.

    Elements that have any child element yield the empty string, as do
    elements without text.
    """
    has_children = len(element) > 0
    if has_children:
        return ""
    raw = element.text
    return raw.strip() if raw else ""


def _is_fence_mo(element, form: str | None = None) -> bool:
    """Tell whether ``element`` is a non-empty ``<mo fence="true">`` operator.

    When ``form`` is given, the element's ``form`` attribute must also equal
    it (the callers in this module pass ``'prefix'`` or ``'postfix'``).
    """
    is_fence = _tag_local(element) == 'mo' and element.get('fence') == 'true'
    if not is_fence:
        return False
    form_matches = form is None or element.get('form') == form
    return form_matches and bool((element.text or "").strip())


def _append_delimiter(parent_omml, begin: str, end: str, enclosed_children: list) -> None:
    """Add an ``m:d`` delimiter to ``parent_omml`` and fill it with content.

    ``begin`` and ``end`` become the ``m:begChr`` / ``m:endChr`` values; the
    MathML nodes in ``enclosed_children`` are converted into the single
    ``m:e`` argument of the delimiter.
    """
    delimiter = etree.SubElement(parent_omml, M + "d")
    properties = etree.SubElement(delimiter, M + "dPr")
    for prop_name, character in (("begChr", begin), ("endChr", end)):
        etree.SubElement(properties, M + prop_name).set(M + "val", character)

    content = etree.SubElement(delimiter, M + "e")
    _convert_child_sequence(enclosed_children, content)


def _append_identifier_run(parent_omml, text: str) -> None:
    """Emit ``text`` as one math run, or nothing when ``text`` is empty.

    The run is flagged italic only for a single alphabetic character that is
    not listed in ``_COMMON_FUNCTIONS``; longer identifiers stay in one
    unflagged run so their letters are not spaced apart.
    """
    if text:
        is_single_letter = len(text) == 1 and text.isalpha()
        use_italic = is_single_letter and text not in _COMMON_FUNCTIONS
        parent_omml.append(_omml_run(text, italic=use_italic))


def _append_nary(parent_omml, operator: str, sub_el=None, sup_el=None,
                 body_children: list | None = None) -> None:
    """Append an OMML n-ary operator with optional lower/upper limits.

    Args:
        parent_omml: OMML element that receives the new ``m:nary`` child.
        operator: The operator character written to ``m:chr``.
        sub_el: MathML element for the lower limit, or ``None``.
        sup_el: MathML element for the upper limit, or ``None``.
        body_children: MathML elements forming the operand, converted into
            the ``m:e`` argument. May be ``None`` or empty.

    All integral signs place their limits beside the operator (``subSup``);
    every other operator stacks them (``undOvr``). A limit that is absent is
    marked hidden so the empty ``m:sub`` / ``m:sup`` slot is not displayed.
    """
    integral_signs = {"∫", "∬", "∭", "∮"}

    nary = etree.SubElement(parent_omml, M + "nary")
    naryPr = etree.SubElement(nary, M + "naryPr")
    ch = etree.SubElement(naryPr, M + "chr")
    ch.set(M + "val", operator)
    limLoc = etree.SubElement(naryPr, M + "limLoc")
    limLoc.set(M + "val", "subSup" if operator in integral_signs else "undOvr")
    # subHide / supHide follow limLoc inside naryPr; the m:sub and m:sup
    # elements themselves are still emitted below.
    if sub_el is None:
        etree.SubElement(naryPr, M + "subHide").set(M + "val", "1")
    if sup_el is None:
        etree.SubElement(naryPr, M + "supHide").set(M + "val", "1")

    sub = etree.SubElement(nary, M + "sub")
    sup = etree.SubElement(nary, M + "sup")
    e = etree.SubElement(nary, M + "e")
    if sub_el is not None:
        _mml_to_omml(sub_el, sub)
    if sup_el is not None:
        _mml_to_omml(sup_el, sup)
    if body_children:
        _convert_child_sequence(body_children, e)


def _append_script(parent_omml, tag: str, base_text: str, sub_el=None, sup_el=None) -> None:
    """Add a scripted term whose base is plain text rather than an element.

    ``tag`` selects the OMML shape: ``'msup'`` gives ``m:sSup`` (only
    ``sup_el`` is used), ``'msub'`` gives ``m:sSub`` (only ``sub_el`` is
    used), and anything else gives ``m:sSubSup`` with both scripts. The base
    is emitted through ``_append_identifier_run``.
    """
    wants_sub = tag != 'msup'
    wants_sup = tag != 'msub'
    if wants_sub and wants_sup:
        omml_name = "sSubSup"
    elif wants_sup:
        omml_name = "sSup"
    else:
        omml_name = "sSub"

    script = etree.SubElement(parent_omml, M + omml_name)
    etree.SubElement(script, M + omml_name + "Pr")
    base = etree.SubElement(script, M + "e")
    # Slots are created in schema order (sub before sup) before any content.
    sub_slot = etree.SubElement(script, M + "sub") if wants_sub else None
    sup_slot = etree.SubElement(script, M + "sup") if wants_sup else None

    _append_identifier_run(base, base_text)
    if sub_slot is not None and sub_el is not None:
        _mml_to_omml(sub_el, sub_slot)
    if sup_slot is not None and sup_el is not None:
        _mml_to_omml(sup_el, sup_slot)


def _nary_base_operator(element) -> str | None:
    """Find the n-ary operator character a MathML base element stands for.

    The element's own text is checked first; if it is not an n-ary operator
    and the element wraps exactly one child, the search descends into that
    child. ``None`` is returned when no operator is found.
    """
    node = element
    while True:
        symbol = _simple_text(node)
        if symbol in _NARY_OPERATORS:
            return symbol
        if len(node) != 1:
            return None
        node = list(node)[0]


def _scripted_nary_parts(element) -> tuple[str, object | None, object | None] | None:
    """Split a scripted n-ary MathML node into ``(operator, sub, sup)``.

    Returns ``None`` unless ``element`` is an ``msub`` / ``msup`` /
    ``msubsup`` whose first child resolves to an n-ary operator. A script
    that the tag does not carry, or that is missing, is reported as ``None``.
    """
    tag = _tag_local(element)
    kids = list(element)
    if not kids or tag not in {'msub', 'msup', 'msubsup'}:
        return None

    operator = _nary_base_operator(kids[0])
    if not operator:
        return None

    lower = upper = None
    if tag == 'msup':
        if len(kids) > 1:
            upper = kids[1]
    else:
        if len(kids) >= 2:
            lower = kids[1]
        if tag == 'msubsup' and len(kids) > 2:
            upper = kids[2]
    return operator, lower, upper


def _append_mathml_tail(element, parent_omml) -> None:
    """Emit the text trailing ``element`` as a run, if any remains after stripping."""
    trailing = (element.tail or "").strip()
    if trailing:
        parent_omml.append(_omml_run(trailing))


def _collect_nary_body(children: list, start: int) -> tuple[list, int]:
    """Collect the operand following a scripted n-ary operator.

    Scanning begins at ``children[start]`` and gathers siblings until a
    binary/relational operator (or comma) is met outside any bracket pair.
    The very first sibling is always taken, even if it is such an operator.

    Args:
        children: Sibling MathML elements containing the operator.
        start: Index of the first candidate operand element.

    Returns:
        ``(body, next_index)`` where ``body`` is the list of collected
        elements and ``next_index`` is the index of the first element that
        was not consumed (``len(children)`` when everything was taken).
    """
    terminators = frozenset({'·', '×', '+', '-', '−', '=', '<', '>', '≤', '≥', '≈', ','})
    openers = frozenset({'(', '[', '{'})
    closers = frozenset({')', ']', '}'})

    body: list = []
    depth = 0  # nesting level of plain bracket <mo> characters seen so far
    i = start

    while i < len(children):
        child = children[i]
        is_operator = _tag_local(child) == 'mo'
        text = (child.text or "").strip()

        # Stop before a top-level operator, but only once the body is non-empty.
        if body and depth == 0 and is_operator and text in terminators:
            break

        body.append(child)

        if is_operator:
            if text in openers:
                depth += 1
            elif text in closers and depth > 0:
                depth -= 1

        i += 1

    return body, i


def _convert_child_sequence(children: list, parent_omml) -> None:
    """Convert a sequence of sibling MathML elements into ``parent_omml``.

    Three cases are handled, tried in this order for each position:

    1. A scripted n-ary operator followed by at least one sibling becomes an
       ``m:nary`` whose operand is the run of siblings returned by
       ``_collect_nary_body``.
    2. A run of adjacent non-empty ``<mi>`` elements is merged into a single
       identifier. If the run is directly followed by a script element whose
       base is also a non-empty ``<mi>``, that base is merged in as well and
       the whole text becomes the base of the script.
    3. Anything else is converted on its own via ``_mml_to_omml``.
    """
    i = 0
    while i < len(children):
        child = children[i]
        nary_parts = _scripted_nary_parts(child)
        if nary_parts:
            body, next_i = _collect_nary_body(children, i + 1)
            # With no following sibling there is no operand; fall through so
            # the element is converted as an ordinary script below.
            if body:
                op, sub_el, sup_el = nary_parts
                _append_nary(parent_omml, op, sub_el=sub_el, sup_el=sup_el, body_children=body)
                i = next_i
                continue

        if _tag_local(child) == 'mi' and _simple_text(child):
            # Gather this <mi> and every directly following non-empty <mi>.
            parts = [_simple_text(child)]
            j = i + 1
            while j < len(children) and _tag_local(children[j]) == 'mi' and _simple_text(children[j]):
                parts.append(_simple_text(children[j]))
                j += 1
            if j < len(children) and _tag_local(children[j]) in {'msub', 'msup', 'msubsup'}:
                script_children = list(children[j])
                if script_children and _tag_local(script_children[0]) == 'mi' and _simple_text(script_children[0]):
                    # The script's own base joins the collected identifier text.
                    base_text = ''.join(parts) + _simple_text(script_children[0])
                    sub_el = None
                    sup_el = None
                    script_tag = _tag_local(children[j])
                    if script_tag in {'msub', 'msubsup'} and len(script_children) >= 2:
                        sub_el = script_children[1]
                    # The superscript is child 1 of <msup> but child 2 of <msubsup>.
                    if script_tag == 'msup' and len(script_children) >= 2:
                        sup_el = script_children[1]
                    elif script_tag == 'msubsup' and len(script_children) >= 3:
                        sup_el = script_children[2]
                    _append_script(parent_omml, script_tag, base_text, sub_el=sub_el, sup_el=sup_el)
                    _append_mathml_tail(children[j], parent_omml)
                    i = j + 1
                    continue
            # No mergeable script follows: emit the identifiers as one run.
            # Only the tail of the last merged <mi> is emitted.
            _append_identifier_run(parent_omml, ''.join(parts))
            _append_mathml_tail(children[j - 1], parent_omml)
            i = j
            continue

        _mml_to_omml(child, parent_omml)
        _append_mathml_tail(child, parent_omml)
        i += 1


def _convert_children(mml_el, parent_omml):
    """Convert an element's leading text and all of its child elements.

    Non-blank text directly inside ``mml_el`` is emitted first as a run; the
    children are then handled as a sequence by ``_convert_child_sequence``.
    """
    leading_text = (mml_el.text or "").strip()
    if leading_text:
        parent_omml.append(_omml_run(leading_text))
    _convert_child_sequence(list(mml_el), parent_omml)


# Wrapper elements whose children are converted straight into the parent.
_MML_PASSTHROUGH_TAGS = frozenset(('mstyle', 'mpadded', 'mphantom'))
# Token elements whose stripped text is emitted as a single run.
_MML_TEXT_TAGS = frozenset(('mn', 'mo', 'mtext'))


def _mml_to_omml(mml_el, parent_omml):
    """Recursively convert a single MathML element to its OMML equivalent.

    The converted output is appended to ``parent_omml``; nothing is returned.
    Dispatch is by the element's local tag name. Tags without a dedicated
    branch are treated as transparent containers: their text and children are
    converted in place.

    Args:
        mml_el: The MathML element to convert.
        parent_omml: The OMML element that receives the converted content.
    """
    tag = _tag_local(mml_el)
    text = (mml_el.text or "").strip()
    children = list(mml_el)

    def convert_operand(index: int, target) -> None:
        """Convert the ``index``-th child into ``target`` when it exists."""
        if index < len(children):
            _mml_to_omml(children[index], target)

    if tag == 'math':
        _convert_children(mml_el, parent_omml)

    elif tag == 'semantics':
        # Only the first non-annotation child is converted.
        for child in children:
            if not _tag_local(child).startswith('annotation'):
                _mml_to_omml(child, parent_omml)
                break

    elif tag == 'mrow':
        if len(children) >= 2 and _is_fence_mo(children[0], 'prefix') and _is_fence_mo(children[-1], 'postfix'):
            # A matched pair of fence operators: wrap the interior in a delimiter.
            _append_delimiter(
                parent_omml,
                (children[0].text or "").strip(),
                (children[-1].text or "").strip(),
                children[1:-1],
            )
        elif children and _is_fence_mo(children[0], 'prefix') and any(_tag_local(child) == 'mtable' for child in children[1:]):
            # An opening fence followed by a table gets no closing character.
            _append_delimiter(parent_omml, (children[0].text or "").strip(), "", children[1:])
        else:
            _convert_children(mml_el, parent_omml)

    elif tag == 'mi':
        _append_identifier_run(parent_omml, text)

    elif tag in _MML_TEXT_TAGS:
        parent_omml.append(_omml_run(text))

    elif tag == 'mspace':
        parent_omml.append(_omml_run(" "))

    elif tag == 'mfrac':
        f = etree.SubElement(parent_omml, M + "f")
        etree.SubElement(f, M + "fPr")
        num = etree.SubElement(f, M + "num")
        den = etree.SubElement(f, M + "den")
        convert_operand(0, num)
        convert_operand(1, den)

    elif tag == 'msup':
        ss = etree.SubElement(parent_omml, M + "sSup")
        etree.SubElement(ss, M + "sSupPr")
        e = etree.SubElement(ss, M + "e")
        sup = etree.SubElement(ss, M + "sup")
        convert_operand(0, e)
        convert_operand(1, sup)

    elif tag == 'msub':
        ss = etree.SubElement(parent_omml, M + "sSub")
        etree.SubElement(ss, M + "sSubPr")
        e = etree.SubElement(ss, M + "e")
        sub = etree.SubElement(ss, M + "sub")
        convert_operand(0, e)
        convert_operand(1, sub)

    elif tag == 'msubsup':
        ss = etree.SubElement(parent_omml, M + "sSubSup")
        etree.SubElement(ss, M + "sSubSupPr")
        e = etree.SubElement(ss, M + "e")
        sub = etree.SubElement(ss, M + "sub")
        sup = etree.SubElement(ss, M + "sup")
        convert_operand(0, e)
        convert_operand(1, sub)
        convert_operand(2, sup)

    elif tag == 'msqrt':
        rad = etree.SubElement(parent_omml, M + "rad")
        radPr = etree.SubElement(rad, M + "radPr")
        dh = etree.SubElement(radPr, M + "degHide")
        dh.set(M + "val", "1")
        # The degree element is still emitted, but left empty and hidden.
        etree.SubElement(rad, M + "deg")
        e = etree.SubElement(rad, M + "e")
        _convert_children(mml_el, e)

    elif tag == 'mroot':
        rad = etree.SubElement(parent_omml, M + "rad")
        etree.SubElement(rad, M + "radPr")
        deg = etree.SubElement(rad, M + "deg")
        e = etree.SubElement(rad, M + "e")
        # MathML order is (base, index); OMML order is (deg, e).
        convert_operand(0, e)
        convert_operand(1, deg)

    elif tag == 'mfenced':
        d = etree.SubElement(parent_omml, M + "d")
        dPr = etree.SubElement(d, M + "dPr")
        beg = etree.SubElement(dPr, M + "begChr")
        beg.set(M + "val", mml_el.get('open', '('))
        end = etree.SubElement(dPr, M + "endChr")
        end.set(M + "val", mml_el.get('close', ')'))
        e = etree.SubElement(d, M + "e")
        _convert_children(mml_el, e)

    elif tag == 'mtable':
        m_el = etree.SubElement(parent_omml, M + "m")
        etree.SubElement(m_el, M + "mPr")
        for child in children:
            if _tag_local(child) == 'mtr':
                mr = etree.SubElement(m_el, M + "mr")
                for cell in child:
                    if _tag_local(cell) == 'mtd':
                        e = etree.SubElement(mr, M + "e")
                        _convert_children(cell, e)

    elif tag == 'mover':
        acc = etree.SubElement(parent_omml, M + "acc")
        accPr = etree.SubElement(acc, M + "accPr")
        if len(children) >= 2:
            ch = etree.SubElement(accPr, M + "chr")
            # Strip surrounding whitespace so it cannot leak into the accent
            # character; fall back to "^" when no text is available.
            ch.set(M + "val", (children[1].text or "").strip() or "^")
        e = etree.SubElement(acc, M + "e")
        convert_operand(0, e)

    elif tag == 'munder':
        ll = etree.SubElement(parent_omml, M + "limLow")
        etree.SubElement(ll, M + "limLowPr")
        e = etree.SubElement(ll, M + "e")
        lim = etree.SubElement(ll, M + "lim")
        convert_operand(0, e)
        convert_operand(1, lim)

    elif tag == 'munderover':
        nary = etree.SubElement(parent_omml, M + "nary")
        naryPr = etree.SubElement(nary, M + "naryPr")
        if children:
            first = children[0]
            # Use the base's own text, or that of its first child when the
            # base is a wrapper; whitespace-only text counts as missing.
            op = (first.text or "").strip()
            if not op and len(first) > 0:
                op = (first[0].text or "").strip()
            if op:
                ch = etree.SubElement(naryPr, M + "chr")
                ch.set(M + "val", op)
        limLoc = etree.SubElement(naryPr, M + "limLoc")
        limLoc.set(M + "val", "undOvr")
        sub = etree.SubElement(nary, M + "sub")
        sup = etree.SubElement(nary, M + "sup")
        # The operand element is emitted empty: the following siblings are
        # not visible from inside this element.
        etree.SubElement(nary, M + "e")
        convert_operand(1, sub)
        convert_operand(2, sup)

    elif tag in _MML_PASSTHROUGH_TAGS:
        _convert_children(mml_el, parent_omml)

    else:
        # Unknown element: _convert_children already emits the element's own
        # text, so it must not be emitted here as well.
        _convert_children(mml_el, parent_omml)


def latex_to_omml(latex_str: str) -> etree._Element | None:
    """Translate LaTeX source into an ``m:oMath`` element.

    The LaTeX is converted to MathML with ``latex2mathml``, namespace
    declarations are stripped from the markup, and the parsed tree is walked
    by ``_mml_to_omml``. Blank input returns ``None``. Any exception raised
    along the way is reported on stdout and also results in ``None``.
    """
    try:
        latex_str = latex_str.strip()
        if latex_str:
            mathml = latex2mathml.converter.convert(latex_str)
            # Drop xmlns declarations so tags parse without a namespace.
            mathml = re.sub(r'\s+xmlns(:[a-z]+)?="[^"]*"', '', mathml)
            mathml_root = etree.fromstring(mathml.encode('utf-8'))
            omath = etree.Element(M + "oMath")
            _mml_to_omml(mathml_root, omath)
            return omath
        return None
    except Exception as exc:
        ellipsis = '...' if len(latex_str) > 80 else ''
        print(f"  [LaTeX warning] {exc} — expression: {latex_str[:80]}{ellipsis}")
        return None


# ═══════════════════════════════════════════════════════════════════════════════
# Inline Content Parsing
# ═══════════════════════════════════════════════════════════════════════════════

# Inline markup tokens, tried in this order at each position:
#   $$...$$ (may span lines), $...$ (single line), ***bold italic***,
#   **bold**, *italic*, and `code` (single line).
_INLINE_RE = re.compile(
    r'(\$\$[\s\S]*?\$\$)'
    r'|(\$(?!\$)[^\$\n]+?\$)'
    r'|(\*\*\*(.+?)\*\*\*)'
    r'|(\*\*(.+?)\*\*)'
    r'|(\*(?!\*)(.+?)(?<!\*)\*)'
    r'|(`[^`\n]+?`)'
)
# An ASCII letter followed by letters/digits at the very end of a string;
# used to find the word a leading ``_``/``^`` script should attach to.
_TRAILING_MATH_BASE_RE = re.compile(r'([A-Za-z][A-Za-z0-9]*)$')


def _attach_leading_script_to_previous_text(parts: list[dict], latex: str) -> str:
    """Merge a script-only formula with the word just before it.

    For input such as ``VaR$_{t-1}$`` the formula starts with ``_`` or ``^``
    and the previous segment is text ending in an alphanumeric word. That
    word is cut off the text segment (which is removed from ``parts`` if it
    becomes empty) and prefixed to the returned LaTeX. In every other case
    ``latex`` is returned unchanged and ``parts`` is not modified.
    """
    if not parts or not latex.startswith(('_', '^')):
        return latex

    previous = parts[-1]
    if previous.get('type') != 'text':
        return latex

    found = _TRAILING_MATH_BASE_RE.search(previous.get('content', ''))
    if found is None:
        return latex

    remaining = previous['content'][:found.start(1)]
    previous['content'] = remaining
    if not remaining:
        parts.pop()
    return found.group(1) + latex


def parse_inline_content(text: str) -> list[dict]:
    """
    Break a string into typed inline segments for rendering.

    Each segment is a dict with ``type`` and ``content`` keys. Segment types:
    text, bold, italic, bold_italic, code, latex_inline, latex_block.
    Formulas whose content is blank produce no segment. If nothing at all was
    produced for a non-empty input, the input is returned as one text segment.
    """
    if not text:
        return []

    segments: list[dict] = []
    cursor = 0

    def emit(kind: str, content: str) -> None:
        segments.append({'type': kind, 'content': content})

    for match in _INLINE_RE.finditer(text):
        begin, finish = match.span()
        if begin > cursor:
            emit('text', text[cursor:begin])

        token = match.group(0)
        if token.startswith('$$') and token.endswith('$$'):
            formula = token[2:-2].strip()
            if formula:
                emit('latex_block', formula)
        elif token.startswith('$') and token.endswith('$'):
            formula = token[1:-1].strip()
            if formula:
                # A formula starting with _ or ^ may absorb the preceding word.
                emit('latex_inline', _attach_leading_script_to_previous_text(segments, formula))
        elif token.startswith('***') and token.endswith('***'):
            emit('bold_italic', token[3:-3])
        elif token.startswith('**') and token.endswith('**'):
            emit('bold', token[2:-2])
        elif token.startswith('*') and token.endswith('*') and not token.startswith('**'):
            emit('italic', token[1:-1])
        elif token.startswith('`') and token.endswith('`'):
            emit('code', token[1:-1])

        cursor = finish

    if cursor < len(text):
        emit('text', text[cursor:])

    if not segments:
        emit('text', text)

    return segments


# ═══════════════════════════════════════════════════════════════════════════════
# HTML Table Parsing
# ═══════════════════════════════════════════════════════════════════════════════

def _cell_text(cell_tag) -> str:
    """Flatten a BeautifulSoup table cell into plain text.

    Nodes are visited in document order. Each string node has its whitespace
    runs collapsed to a single space, every ``<br>`` contributes a newline,
    and the concatenated result is stripped at both ends.
    """
    pieces: list[str] = []
    pending = [cell_tag]

    # Explicit stack; children are pushed reversed so they pop in document order.
    while pending:
        node = pending.pop()
        if isinstance(node, NavigableString):
            collapsed = re.sub(r'\s+', ' ', str(node))
            if collapsed:
                pieces.append(collapsed)
        elif isinstance(node, Tag):
            if node.name == 'br':
                pieces.append('\n')
            pending.extend(reversed(list(node.children)))

    return ''.join(pieces).strip()


def parse_html_table(table_html: str) -> tuple[list[dict], int, int]:
    """
    Parse an HTML table into cell data with merge information.

    Cells are placed on a grid row by row; a cell skips any column already
    occupied by a ``rowspan`` from an earlier row. ``colspan`` / ``rowspan``
    values that are missing, non-numeric, or smaller than 1 are treated as 1
    so a malformed attribute cannot abort the conversion or corrupt the grid.

    Args:
        table_html: HTML markup containing a ``<table>`` element.

    Returns:
        ``(cells_data, num_rows, num_cols)``. ``num_rows`` is the number of
        ``<tr>`` elements and ``num_cols`` the widest row after placement.
        Each cell dict has: row, col, rowspan, colspan, content (list of
        inline segments), is_header (bool). ``([], 0, 0)`` is returned when
        no table or no rows are found.
    """
    def span_of(cell, attr: str) -> int:
        """Read a span attribute as an int, defaulting to 1 when unusable."""
        try:
            value = int(str(cell.get(attr, 1)).strip())
        except (TypeError, ValueError):
            return 1
        return value if value >= 1 else 1

    soup = BeautifulSoup(table_html, 'lxml')
    table = soup.find('table')
    if table is None:
        return [], 0, 0

    rows = table.find_all('tr')
    if not rows:
        return [], 0, 0

    occupied: set[tuple[int, int]] = set()
    cells_data: list[dict] = []
    max_cols = 0

    for ri, row in enumerate(rows):
        ci = 0
        for cell in row.find_all(['td', 'th']):
            # Skip columns claimed by a rowspan from a previous row.
            while (ri, ci) in occupied:
                ci += 1
            cs = span_of(cell, 'colspan')
            rs = span_of(cell, 'rowspan')
            cells_data.append({
                'row': ri,
                'col': ci,
                'rowspan': rs,
                'colspan': cs,
                'content': parse_inline_content(_cell_text(cell)),
                'is_header': cell.name == 'th',
            })
            occupied.update(
                (r, c)
                for r in range(ri, ri + rs)
                for c in range(ci, ci + cs)
            )
            ci += cs
        max_cols = max(max_cols, ci)

    return cells_data, len(rows), max_cols


# ═══════════════════════════════════════════════════════════════════════════════
# Table Rendering in DOCX
# ═══════════════════════════════════════════════════════════════════════════════

def _set_table_borders(table, color: str = TABLE_BORDER_COLOR, size: str = "4"):
    """Give the table single-line borders on its outer edges and inner grid.

    Any existing ``w:tblBorders`` element is removed first, then a new one
    describing top/left/bottom/right/insideH/insideV is appended to the
    table properties (which are created if the table has none).
    """
    tbl = table._tbl
    tblPr = tbl.tblPr
    if tblPr is None:
        tblPr = OxmlElement('w:tblPr')
        tbl.insert(0, tblPr)

    for stale in tblPr.findall(qn('w:tblBorders')):
        tblPr.remove(stale)

    edge_attributes = (
        ('w:val', 'single'),
        ('w:sz', size),
        ('w:space', '0'),
        ('w:color', color),
    )
    borders = OxmlElement('w:tblBorders')
    for edge in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
        edge_el = OxmlElement(f'w:{edge}')
        for attr_name, attr_value in edge_attributes:
            edge_el.set(qn(attr_name), attr_value)
        borders.append(edge_el)
    tblPr.append(borders)


def _set_cell_shading(cell, hex_color: str):
    """Fill a table cell's background with ``hex_color``.

    Existing ``w:shd`` elements on the cell are removed before the new one
    (``val="clear"``, ``color="auto"``, ``fill=hex_color``) is appended.
    """
    cell_properties = cell._tc.get_or_add_tcPr()
    for stale in cell_properties.findall(qn('w:shd')):
        cell_properties.remove(stale)

    shading = OxmlElement('w:shd')
    for attr_name, attr_value in (('w:val', 'clear'),
                                  ('w:color', 'auto'),
                                  ('w:fill', hex_color)):
        shading.set(qn(attr_name), attr_value)
    cell_properties.append(shading)


def _set_cell_margins(cell, dxa: int = TABLE_CELL_MARGIN_DXA):
    """Set uniform internal margins (padding) for a table cell.

    Args:
        cell: The python-docx table cell to modify.
        dxa: Margin written to each of the four sides with ``w:type="dxa"``.

    Any existing ``w:tcMar`` element is removed first, so calling this more
    than once on the same cell replaces the margins instead of stacking
    duplicate elements (the same approach ``_set_cell_shading`` takes).
    """
    tcPr = cell._tc.get_or_add_tcPr()
    for old in tcPr.findall(qn('w:tcMar')):
        tcPr.remove(old)

    mar = OxmlElement('w:tcMar')
    for side in ('top', 'left', 'bottom', 'right'):
        el = OxmlElement(f'w:{side}')
        el.set(qn('w:w'), str(dxa))
        el.set(qn('w:type'), 'dxa')
        mar.append(el)
    tcPr.append(mar)


def _set_cell_vertical_alignment(cell, val: str = "center"):
    """Vertically align cell content (top, center, bottom).

    Args:
        cell: The python-docx table cell to modify.
        val: Value written to the ``w:val`` attribute of ``w:vAlign``.

    Any existing ``w:vAlign`` element is removed first, so repeated calls
    replace the setting rather than appending duplicate elements.
    """
    tcPr = cell._tc.get_or_add_tcPr()
    for old in tcPr.findall(qn('w:vAlign')):
        tcPr.remove(old)

    va = OxmlElement('w:vAlign')
    va.set(qn('w:val'), val)
    tcPr.append(va)


def _render_segments(para, segments: list[dict], *,
                     font_size=FONT_SIZE_BODY, bold: bool = False,
                     color: RGBColor | None = None):
    """Write inline segments into ``para`` as formatted runs and formulas.

    Text-like segments become runs styled by ``set_run_fonts``; ``bold`` is
    applied to plain ``text`` segments only, and ``code`` segments always use
    ``FONT_CODE`` at ``FONT_SIZE_CODE``. LaTeX segments are converted to OMML
    and appended to the paragraph; when conversion fails the source is
    written as italic ``$...$`` text. Unknown segment types are ignored.
    """
    # Keyword arguments for set_run_fonts, per run-producing segment type.
    run_styles = {
        'text': {'size': font_size, 'bold': bold},
        'bold': {'size': font_size, 'bold': True},
        'italic': {'size': font_size, 'italic': True},
        'bold_italic': {'size': font_size, 'bold': True, 'italic': True},
        'code': {'font_name': FONT_CODE, 'size': FONT_SIZE_CODE},
    }

    for segment in segments:
        kind = segment['type']

        style = run_styles.get(kind)
        if style is not None:
            set_run_fonts(para.add_run(segment['content']), color=color, **style)
            continue

        if kind not in ('latex_inline', 'latex_block'):
            continue

        formula = latex_to_omml(segment['content'])
        if formula is None:
            fallback = para.add_run(f"${segment['content']}$")
            set_run_fonts(fallback, size=font_size, italic=True, color=color)
        else:
            para._p.append(formula)


def create_docx_table(doc: Document, cells_data: list[dict],
                      num_rows: int, num_cols: int) -> None:
    """Build a professionally styled DOCX table from parsed HTML cell data.

    Args:
        doc: Document the table is appended to.
        cells_data: Cell dicts as produced by ``parse_html_table`` (keys
            row, col, rowspan, colspan, content, is_header).
        num_rows: Number of table rows to create.
        num_cols: Number of table columns to create.

    The table is full width (5000 fiftieths of a percent), centred, uses a
    fixed layout and uniform borders. Header cells and odd-indexed rows are
    shaded. Nothing is added when either dimension is zero. Cells outside
    the grid are skipped, and a merge that fails is reported on stdout and
    otherwise ignored so the rest of the table is still rendered.
    """
    if num_rows == 0 or num_cols == 0:
        return

    table = doc.add_table(rows=num_rows, cols=num_cols)
    table.style = 'Table Grid'

    tblPr = table._tbl.tblPr
    if tblPr is None:
        tblPr = OxmlElement('w:tblPr')
        table._tbl.insert(0, tblPr)

    def table_property(tag: str):
        """Return the existing ``tag`` child of tblPr, creating it if absent."""
        element = tblPr.find(qn(tag))
        if element is None:
            element = OxmlElement(tag)
            tblPr.append(element)
        return element

    # Reuse an existing w:tblW rather than appending a second one next to it.
    tblW = table_property('w:tblW')
    tblW.set(qn('w:type'), 'pct')
    tblW.set(qn('w:w'), '5000')
    jc = table_property('w:jc')
    jc.set(qn('w:val'), 'center')

    _set_table_borders(table)

    tblLayout = table_property('w:tblLayout')
    tblLayout.set(qn('w:type'), 'fixed')

    # Grid positions already rendered or swallowed by an earlier merge.
    processed: set = set()

    for cd in cells_data:
        ri, ci = cd['row'], cd['col']
        if (ri, ci) in processed:
            continue

        try:
            cell = table.cell(ri, ci)
        except IndexError:
            continue

        rs, cs = cd['rowspan'], cd['colspan']
        if rs > 1 or cs > 1:
            # Clamp the merge range to the table bounds.
            er = min(ri + rs - 1, num_rows - 1)
            ec = min(ci + cs - 1, num_cols - 1)
            try:
                cell.merge(table.cell(er, ec))
            except Exception as exc:
                # Best effort: keep the cell unmerged, but say so.
                print(f"  [Table warning] could not merge cell ({ri}, {ci}): {exc}")
            else:
                for r in range(ri, er + 1):
                    for c in range(ci, ec + 1):
                        processed.add((r, c))

        processed.add((ri, ci))

        para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
        para.clear()
        _render_segments(para, cd['content'],
                         font_size=FONT_SIZE_TABLE, bold=cd['is_header'])
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

        if cd['is_header']:
            _set_cell_shading(cell, TABLE_HEADER_BG)
        elif ri % 2 == 1:
            _set_cell_shading(cell, TABLE_ALT_ROW_BG)

        _set_cell_margins(cell)
        _set_cell_vertical_alignment(cell)

    for row in table.rows:
        row.height = Twips(360)
        row.height_rule = WD_ROW_HEIGHT_RULE.AT_LEAST


# ═══════════════════════════════════════════════════════════════════════════════
# Document Styling & Page Layout
# ═══════════════════════════════════════════════════════════════════════════════

def _setup_styles(doc: Document) -> None:
    """Restyle the document's Normal and Heading 1-6 styles.

    Normal gets the body font, size, colour and spacing. Each heading style
    that exists in the document gets the body font, the size listed in
    ``HEADING_SIZES``, the heading colour, and is bold for levels 1-5.
    """
    body_style = doc.styles['Normal']
    body_font = body_style.font
    body_font.name = FONT_BODY
    body_font.size = FONT_SIZE_BODY
    body_font.color.rgb = COLOR_BODY
    body_format = body_style.paragraph_format
    body_format.space_after = Pt(8)
    body_format.line_spacing = 1.15
    _set_rfonts(body_style.element.get_or_add_rPr(), FONT_BODY)

    for level, points in HEADING_SIZES.items():
        style_name = f'Heading {level}'
        if style_name in doc.styles:
            heading = doc.styles[style_name]
            heading_font = heading.font
            heading_font.name = FONT_BODY
            heading_font.size = Pt(points)
            heading_font.bold = (level <= 5)
            heading_font.color.rgb = COLOR_HEADING
            heading_format = heading.paragraph_format
            heading_format.space_before = Pt(14)
            heading_format.space_after = Pt(8)
            heading_format.line_spacing = 1.2
            _set_rfonts(heading.element.get_or_add_rPr(), FONT_BODY)


def _setup_page(doc: Document) -> None:
    """Give every section A4 dimensions (21.0 × 29.7 cm) and MARGIN_NARROW margins."""
    a4_width = Cm(21.0)
    a4_height = Cm(29.7)
    for section in doc.sections:
        section.page_width = a4_width
        section.page_height = a4_height
        # Same margin on all four edges.
        for edge in ('left', 'right', 'top', 'bottom'):
            setattr(section, f'{edge}_margin', MARGIN_NARROW)


# ═══════════════════════════════════════════════════════════════════════════════
# Markdown Element Parsing
# ═══════════════════════════════════════════════════════════════════════════════

# Line-level patterns. Each is matched against a single line that has already
# been stripped of surrounding whitespace.
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')          # "#" .. "######" + text
_UL_RE = re.compile(r'^[\-\*\+□☐]\s+(.+)$')             # bullet or checkbox glyph + text
_OL_RE = re.compile(r'^\d+[\.\)]\s+(.+)$')              # "1." or "1)" + text
_IMG_RE = re.compile(r'^!\[([^\]]*)\]\(([^\)]+)\)')     # ![alt](src)
_FENCE_RE = re.compile(r'^(`{3,}|~{3,})\s*(.*)')        # fence run + optional info string
_PAGE_BREAK_RE = re.compile(r'^Page_Order_\d+$', re.IGNORECASE)


def _match_fence_open(stripped: str):
    """
    Return the fence match if *stripped* opens a fenced code block, else None.

    Follows the CommonMark rule that the info string of a backtick fence may
    not itself contain backticks, so a one-line inline span such as
    "```code```" is not mistaken for the start of a block.
    """
    m = _FENCE_RE.match(stripped)
    if m and m.group(1)[0] == '`' and '`' in m.group(2):
        return None
    return m


def parse_markdown_elements(content: str) -> list[dict]:
    """
    Parse preprocessed markdown text into a flat list of typed elements.

    Element types: page_break, heading, paragraph, table, latex_block,
                   unordered_list, ordered_list, image, code_block.

    Every element is a dict with a 'type' key plus:
        heading         'level' (1-6), 'content'
        paragraph       'content' (consecutive lines joined with one space)
        table           'content' (raw HTML from <table through </table>)
        latex_block     'content' (LaTeX without the $$ delimiters)
        unordered_list  'items' (list of str)
        ordered_list    'items' (list of str)
        image           'alt', 'src'
        code_block      'content', 'language' (lower-cased info string)
        page_break      no extra keys

    Block kinds are tested in a fixed order for each non-blank line: page
    marker, HTML table, code fence, heading, $$ block, unordered list,
    ordered list, image, and finally paragraph.

    Args:
        content: Markdown text; lines are split on '\\n'.

    Returns:
        The elements in document order.
    """
    elements: list[dict] = []
    lines = content.split('\n')
    i, n = 0, len(lines)

    while i < n:
        line = lines[i]
        stripped = line.strip()

        if not stripped:
            i += 1
            continue

        # ── Page break sentinel from OCR page markers ──
        if _PAGE_BREAK_RE.match(stripped):
            elements.append({'type': 'page_break'})
            i += 1
            continue

        # ── HTML table ──
        # Track nesting depth so an inner <table> does not end the block early.
        if '<table' in line.lower():
            buf = [line]
            depth = line.lower().count('<table') - line.lower().count('</table>')
            i += 1
            while i < n and depth > 0:
                depth += lines[i].lower().count('<table')
                depth -= lines[i].lower().count('</table>')
                buf.append(lines[i])
                i += 1
            elements.append({'type': 'table', 'content': '\n'.join(buf)})
            continue

        # ── Fenced code block (``` or ~~~) ──
        fence_m = _match_fence_open(stripped)
        if fence_m:
            fence_char = fence_m.group(1)[0]
            fence_len = len(fence_m.group(1))
            lang = fence_m.group(2).strip().lower()
            buf = []
            i += 1
            while i < n:
                close_m = _FENCE_RE.match(lines[i].strip())
                # A closing fence uses the same character, is at least as long
                # as the opener, and carries no info string; a line such as
                # "```python" inside the block is therefore kept as code.
                if (close_m
                        and close_m.group(1)[0] == fence_char
                        and len(close_m.group(1)) >= fence_len
                        and not close_m.group(2).strip()):
                    i += 1
                    break
                buf.append(lines[i])
                i += 1
            # An unclosed fence runs to the end of the input.
            code_content = '\n'.join(buf)
            elements.append({'type': 'code_block', 'content': code_content, 'language': lang})
            continue

        # ── Heading ──
        hm = _HEADING_RE.match(stripped)
        if hm:
            elements.append({
                'type': 'heading',
                'level': len(hm.group(1)),
                'content': hm.group(2).strip(),
            })
            i += 1
            continue

        # ── Block LaTeX $$...$$ ──
        if stripped.startswith('$$'):
            # Whole equation on one line: $$ ... $$
            if stripped.endswith('$$') and len(stripped) > 4:
                elements.append({'type': 'latex_block', 'content': stripped[2:-2].strip()})
                i += 1
                continue

            # Closed on the opening line but followed by trailing text
            # (e.g. "$$ a $$ (1)"): emit the equation and keep the tail as
            # text instead of scanning ahead and losing the lines in between.
            close_at = stripped.find('$$', 2)
            if close_at != -1:
                latex = stripped[2:close_at].strip()
                tail = stripped[close_at + 2:].strip()
                if latex:
                    elements.append({'type': 'latex_block', 'content': latex})
                if tail:
                    elements.append({'type': 'paragraph', 'content': tail})
                i += 1
                continue

            # Multi-line equation: collect up to the first line ending in $$.
            start = i
            buf = [line]
            i += 1
            closed = False
            while i < n:
                buf.append(lines[i])
                i += 1
                if buf[-1].strip().endswith('$$'):
                    closed = True
                    break

            if not closed:
                # Unterminated block: keep the opening line as plain text and
                # resume right after it, so the rest of the document is parsed
                # normally rather than discarded.
                elements.append({'type': 'paragraph', 'content': stripped})
                i = start + 1
                continue

            m = re.search(r'\$\$([\s\S]*?)\$\$', '\n'.join(buf))
            if m:
                elements.append({'type': 'latex_block', 'content': m.group(1).strip()})
            continue

        # ── Unordered list ──
        um = _UL_RE.match(stripped)
        if um:
            items = []
            while i < n:
                raw_line = lines[i]
                ls = raw_line.strip()
                m = _UL_RE.match(ls)
                if m:
                    items.append(m.group(1))
                    i += 1
                elif ls and (raw_line.startswith('  ') or raw_line.startswith('\t')):
                    # Indented non-bullet line: continuation of the last item.
                    if items:
                        items[-1] += ' ' + ls
                    i += 1
                else:
                    break
            if items:
                elements.append({'type': 'unordered_list', 'items': items})
            continue

        # ── Ordered list ──
        om = _OL_RE.match(stripped)
        if om:
            items = []
            while i < n:
                ls = lines[i].strip()
                m = _OL_RE.match(ls)
                if not m:
                    break
                items.append(m.group(1))
                i += 1
            if items:
                elements.append({'type': 'ordered_list', 'items': items})
            continue

        # ── Image ──
        im = _IMG_RE.match(stripped)
        if im:
            elements.append({'type': 'image', 'alt': im.group(1), 'src': im.group(2)})
            i += 1
            continue

        # ── Paragraph (may span consecutive non-blank lines) ──
        # The first line is always consumed, so the outer loop always advances.
        buf = [line]
        i += 1
        while i < n:
            ns = lines[i].strip()
            # Stop at a blank line or at anything that starts another block.
            if (not ns
                    or ns.startswith('#')
                    or ns.startswith('$$')
                    or _UL_RE.match(ns)
                    or _OL_RE.match(ns)
                    or '<table' in lines[i].lower()
                    or _IMG_RE.match(ns)
                    or _match_fence_open(ns)
                    or _PAGE_BREAK_RE.match(ns)):
                break
            buf.append(lines[i])
            i += 1
        para_text = ' '.join(l.strip() for l in buf if l.strip())
        if para_text:
            elements.append({'type': 'paragraph', 'content': para_text})

    return elements


# ═══════════════════════════════════════════════════════════════════════════════
# Document Building Helpers
# ═══════════════════════════════════════════════════════════════════════════════

def _add_paragraph(doc: Document, text: str) -> None:
    """Append a new paragraph to *doc* and render *text*'s inline segments into it."""
    target = doc.add_paragraph()
    segments = parse_inline_content(text)
    _render_segments(target, segments)


def _add_heading(doc: Document, text: str, level: int) -> None:
    """
    Append a heading whose text may contain inline (non-plain-text) segments.

    Plain-text headings go through doc.add_heading. Headings with any other
    segment type are built as a paragraph carrying the 'Heading N' style and
    rendered segment by segment. The style level is capped at 6.
    """
    parts = parse_inline_content(text)
    style_level = min(level, 6)
    size = Pt(HEADING_SIZES.get(level, 11))
    is_bold = level <= 5

    if all(part['type'] == 'text' for part in parts):
        # Plain text only: let python-docx build the heading, then restyle its runs.
        heading = doc.add_heading(text, level=style_level)
        for run in heading.runs:
            set_run_fonts(run, size=size, bold=is_bold, color=COLOR_HEADING)
    else:
        # Mixed content: render the segments into a heading-styled paragraph.
        para = doc.add_paragraph()
        para.style = f'Heading {style_level}'
        _render_segments(para, parts,
                         font_size=size, bold=is_bold, color=COLOR_HEADING)


def _add_block_equation(doc: Document, latex_content: str) -> None:
    """
    Append a centered display equation built from *latex_content*.

    When latex_to_omml returns None, the raw LaTeX wrapped in $$...$$ is
    written as italic text in the math font instead.
    """
    para = doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    spacing = para.paragraph_format
    spacing.space_before = Pt(6)
    spacing.space_after = Pt(6)

    omml = latex_to_omml(latex_content)
    if omml is None:
        # Conversion failed: show the source so the formula is not lost.
        fallback = para.add_run(f"$${latex_content}$$")
        set_run_fonts(fallback, font_name=FONT_MATH, italic=True)
        return

    # Wrap the equation in <m:oMathPara> with centered justification.
    wrapper = etree.Element(M + "oMathPara")
    wrapper_props = etree.SubElement(wrapper, M + "oMathParaPr")
    etree.SubElement(wrapper_props, M + "jc").set(M + "val", "center")
    wrapper.append(omml)
    para._p.append(wrapper)


def _add_list(doc: Document, items: list[str], ordered: bool = False) -> None:
    """
    Append one list paragraph per entry in *items*.

    Uses the 'List Number' style when *ordered* is true, otherwise
    'List Bullet'. Each entry is rendered through the inline parser.
    """
    if ordered:
        list_style = 'List Number'
    else:
        list_style = 'List Bullet'

    for entry in items:
        _render_segments(doc.add_paragraph(style=list_style),
                         parse_inline_content(entry))


def _add_image(doc: Document, md_dir: Path, alt: str, src: str) -> None:
    """
    Embed an image if the file exists, otherwise insert a placeholder.

    Exactly one centered paragraph is added to *doc* in every case: it holds
    either the picture or an italic "[Image: ...]" placeholder. A picture
    that cannot be embedded no longer leaves an empty paragraph behind.

    Args:
        doc:    Target document.
        md_dir: Directory that relative *src* paths are resolved against.
        alt:    Alt text; used as the placeholder label when non-empty.
        src:    Image path from the markdown; used as the placeholder label
                when *alt* is empty.
    """
    src_p = Path(src)
    # An absolute src is used as-is; a relative one is resolved against md_dir.
    candidate = src_p if src_p.is_absolute() else md_dir / src

    para = doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    if candidate.is_file():
        run = para.add_run()
        try:
            # 21.0 cm page width minus 1.8 cm on each side.
            # NOTE(review): presumably mirrors MARGIN_NARROW — confirm.
            run.add_picture(str(candidate), width=Cm(21.0 - 2 * 1.8))
            return
        except Exception:
            # Deliberate best effort: any unreadable or unsupported image
            # falls back to the placeholder. Remove the run created for the
            # picture so the paragraph holds only the placeholder text.
            para._p.remove(run._r)

    run = para.add_run(f"[Image: {alt or src}]")
    set_run_fonts(run, size=Pt(10), italic=True, color=COLOR_MUTED)


# <w:pPr> is an ordered sequence in the WordprocessingML schema (CT_PPr).
# These are the children that must come after <w:shd>; <w:pBdr> sits
# immediately before <w:shd>.
_PPR_AFTER_SHD = (
    'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
    'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
    'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
    'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
    'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
    'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
    'w:pPrChange',
)

# Child order required inside <w:pBdr> (CT_PBdr).
_PBDR_SIDE_ORDER = ('top', 'left', 'bottom', 'right', 'between', 'bar')


def _set_paragraph_shading(para, hex_color: str):
    """
    Apply background shading to a paragraph.

    The <w:shd> element replaces any existing one and is inserted at its
    schema position inside <w:pPr>, not appended after elements such as
    <w:spacing> or <w:jc> that python-docx may already have created.

    Args:
        para:      python-docx paragraph to shade.
        hex_color: Fill colour as a hex string, e.g. "F2F2F2".
    """
    pPr = para._p.get_or_add_pPr()
    # Repeated calls must not stack several <w:shd> children.
    for stale in pPr.findall(qn('w:shd')):
        pPr.remove(stale)

    shd = OxmlElement('w:shd')
    shd.set(qn('w:val'), 'clear')
    shd.set(qn('w:color'), 'auto')
    shd.set(qn('w:fill'), hex_color)
    # Inserts before the first listed successor present, or appends if none.
    pPr.insert_element_before(shd, *_PPR_AFTER_SHD)


def _set_paragraph_borders(para, hex_color: str = CODE_BORDER_COLOR,
                           sides: tuple[str, ...] = ('top', 'left', 'bottom', 'right')):
    """
    Apply a thin border to specified sides of a paragraph.

    Each side gets a single line with w:sz="4" and w:space="4". The <w:pBdr>
    element replaces any existing one, is inserted at its schema position
    inside <w:pPr>, and its sides are written in schema order regardless of
    the order given in *sides*.

    Args:
        para:      python-docx paragraph to border.
        hex_color: Border colour as a hex string.
        sides:     Side names; duplicates are ignored.
    """
    pPr = para._p.get_or_add_pPr()
    for stale in pPr.findall(qn('w:pBdr')):
        pPr.remove(stale)

    def _schema_rank(side: str) -> int:
        # Names outside the schema list sort last, keeping their given order.
        if side in _PBDR_SIDE_ORDER:
            return _PBDR_SIDE_ORDER.index(side)
        return len(_PBDR_SIDE_ORDER)

    pBdr = OxmlElement('w:pBdr')
    for side in sorted(dict.fromkeys(sides), key=_schema_rank):
        b = OxmlElement(f'w:{side}')
        b.set(qn('w:val'), 'single')
        b.set(qn('w:sz'), '4')
        b.set(qn('w:space'), '4')
        b.set(qn('w:color'), hex_color)
        pBdr.append(b)
    # <w:pBdr> precedes <w:shd> and everything that follows <w:shd>.
    pPr.insert_element_before(pBdr, 'w:shd', *_PPR_AFTER_SHD)


def _add_code_block(doc: Document, code: str, language: str = "") -> None:
    """
    Render a fenced code block as one shaded, bordered paragraph per line.

    Every line gets left and right borders; the first line also gets the top
    border and the last line the bottom border, so the paragraphs read as a
    single box. *language* is accepted but not used here.
    """
    rows = code.split('\n')
    last_idx = len(rows) - 1

    for idx, text in enumerate(rows):
        is_first = idx == 0
        is_last = idx == last_idx

        para = doc.add_paragraph()
        fmt = para.paragraph_format
        # No gap between lines; 4 pt of breathing room around the whole box.
        fmt.space_before = Pt(4) if is_first else Pt(0)
        fmt.space_after = Pt(4) if is_last else Pt(0)
        fmt.line_spacing = 1.0

        _set_paragraph_shading(para, CODE_BG_COLOR)

        edges = ('left', 'right')
        if is_first:
            edges += ('top',)
        if is_last:
            edges += ('bottom',)
        _set_paragraph_borders(para, sides=edges)

        # An empty line is written as a single space.
        run = para.add_run(text or " ")
        set_run_fonts(run, font_name=FONT_CODE, size=FONT_SIZE_CODE)


# ═══════════════════════════════════════════════════════════════════════════════
# Main Conversion
# ═══════════════════════════════════════════════════════════════════════════════

def markdown_to_docx(markdown_path: Path, output_path: Path) -> None:
    """
    Convert a Markdown file to a professionally styled DOCX document.

    The Markdown may contain:
      - Headers (# through ######)
      - Regular paragraphs with inline LaTeX / bold / code
      - Block LaTeX equations ($$...$$)
      - HTML tables with colspan / rowspan and LaTeX in cells
      - Ordered and unordered lists
      - Fenced code blocks (```language ... ```)
      - Image references ![alt](path)

    The text is passed through preprocess_content before parsing. Page
    markers that survive as 'page_break' elements start a new page, except
    for the first one encountered, which is skipped.

    The output directory is created if missing, and a one-line confirmation
    is printed after saving.

    Args:
        markdown_path: Path to the source .md file (Path or str).
        output_path:   Path for the output .docx file (Path or str).
    """
    # Coerce both arguments up front so a plain string output path works and
    # does not fail on `.parent` only after the whole conversion has run.
    src_path = Path(markdown_path)
    dst_path = Path(output_path)

    content = src_path.read_text(encoding='utf-8')
    content = preprocess_content(content)

    doc = Document()
    _setup_styles(doc)
    _setup_page(doc)

    # Relative image references are resolved against the markdown's directory.
    md_dir = src_path.parent
    elements = parse_markdown_elements(content)

    seen_page_break = False
    for elem in elements:
        t = elem['type']

        if t == 'page_break':
            # The first marker only arms the flag; every later marker
            # inserts an explicit page break.
            if seen_page_break:
                para = doc.add_paragraph()
                para.add_run().add_break(WD_BREAK.PAGE)
            seen_page_break = True
            continue

        if t == 'heading':
            _add_heading(doc, elem['content'], elem['level'])

        elif t == 'paragraph':
            _add_paragraph(doc, elem['content'])

        elif t == 'table':
            cells, nr, nc = parse_html_table(elem['content'])
            if nr > 0 and nc > 0:
                create_docx_table(doc, cells, nr, nc)
                # Empty paragraph after the table as a spacer.
                doc.add_paragraph()

        elif t == 'latex_block':
            _add_block_equation(doc, elem['content'])

        elif t == 'unordered_list':
            _add_list(doc, elem['items'], ordered=False)

        elif t == 'ordered_list':
            _add_list(doc, elem['items'], ordered=True)

        elif t == 'image':
            _add_image(doc, md_dir, elem.get('alt', ''), elem.get('src', ''))

        elif t == 'code_block':
            _add_code_block(doc, elem['content'], elem.get('language', ''))

    dst_path.parent.mkdir(parents=True, exist_ok=True)
    doc.save(str(dst_path))

    print(f"Converted: {markdown_path} → {output_path}")


# ═══════════════════════════════════════════════════════════════════════════════
# CLI Entry Point
# ═══════════════════════════════════════════════════════════════════════════════

if __name__ == '__main__':
    import argparse

    # Command line: <input.md> [output.docx]
    cli = argparse.ArgumentParser(
        description="Convert Markdown (with HTML tables, LaTeX) to DOCX.",
    )
    cli.add_argument("input", type=Path, help="Path to the source .md file.")
    cli.add_argument(
        "output",
        nargs="?",
        type=Path,
        default=None,
        help="Output .docx path (default: same stem as input).",
    )
    opts = cli.parse_args()

    # Without an explicit output, write next to the input with a .docx suffix.
    target = opts.output or opts.input.with_suffix('.docx')
    markdown_to_docx(opts.input, target)