文件预览

cells_to_html.py

查看 pdf-fin-parse 技能包中的文件内容。

文件内容

scripts/cells_to_html.py

"""把 output_schema 形态的 cells[] 反向序列化成 HTML <table>.

用途:
  1. 跨页合并后整张逻辑表 → 输出 HTML,用于 X-TEDS 评测
  2. GT 标注辅助:v0.3 自动合并结果 → HTML 初稿,人工在初稿上校对
"""
from __future__ import annotations

from html import escape
from typing import Any, Dict, Iterable, List, Tuple


def _cells_by_pos(cells: Iterable[Dict[str, Any]]
                  ) -> Tuple[Dict[Tuple[int, int], Dict[str, Any]], int, int]:
    """{(row, col): anchor_cell} + (n_rows, n_cols)。仅保留 anchor(rowspan/colspan 用属性表达)。"""
    by_pos: Dict[Tuple[int, int], Dict[str, Any]] = {}
    n_rows = 0
    n_cols = 0
    for c in cells:
        r = c.get("row")
        col = c.get("col")
        if r is None or col is None:
            continue
        by_pos[(r, col)] = c
        n_rows = max(n_rows, r + 1)
        n_cols = max(n_cols, col + c.get("colspan", 1))
    return by_pos, n_rows, n_cols


def cells_to_html(cells: List[Dict[str, Any]], *,
                   header_rows: int = 0,
                   prefer_value: bool = False) -> str:
    """Render ``cells`` as an HTML ``<table>`` string.

    Args:
        cells: Cell dicts. Each is read for ``row``, ``col``, ``rowspan``,
            ``colspan``, ``text`` and (optionally) ``value``.
        header_rows: The first N rows are emitted inside ``<thead>`` using
            ``<th>``; the remaining rows go inside ``<tbody>`` using ``<td>``.
        prefer_value: When True, a cell whose ``value`` is not ``None`` is
            rendered from ``value`` (numbers get thousands separators);
            otherwise the cell's ``text`` is used. The original author
            recommends False for X-TEDS evaluation so output matches the
            ground-truth text form.

    Returns:
        The table markup as one string with no whitespace between tags.
        ``"<table></table>"`` is returned when no cell has a usable position.
    """

    def _span(cell: Dict[str, Any], key: str) -> int:
        # One normalisation shared by the occupancy pass and the attribute
        # pass, so both always agree. Missing/None/<=0 all mean "no span".
        raw = cell.get(key)
        return 1 if raw is None else max(1, int(raw))

    def _cell_text(cell: Dict[str, Any]) -> str:
        value = cell.get("value")
        if prefer_value and value is not None:
            # 1234.0 is shown as "1,234" rather than "1,234.0".
            if isinstance(value, float) and value.is_integer():
                value = int(value)
            return f"{value:,}" if isinstance(value, (int, float)) else str(value)
        text = cell.get("text")
        # html.escape only accepts str; a None or non-string text used to raise.
        return "" if text is None else str(text)

    by_pos, n_rows, n_cols = _cells_by_pos(cells)
    if n_rows == 0:
        return "<table></table>"

    # Grid positions covered by some anchor's span (the anchor itself
    # excluded); these must not produce a tag of their own.
    covered = set()
    for (r, c), cell in by_pos.items():
        rs = _span(cell, "rowspan")
        cs = _span(cell, "colspan")
        covered.update(
            (r + dr, c + dc)
            for dr in range(rs)
            for dc in range(cs)
            if dr or dc
        )

    has_head = header_rows > 0
    # Index of the first body row; a non-positive header_rows means row 0.
    body_start = max(header_rows, 0)

    out: List[str] = ["<table>"]
    if has_head:
        out.append("<thead>")

    for r in range(n_rows):
        if r == body_start:
            if has_head:
                out.append("</thead>")
            out.append("<tbody>")

        tag = "th" if r < header_rows else "td"
        out.append("<tr>")
        for c in range(n_cols):
            if (r, c) in covered:
                continue
            cell = by_pos.get((r, c))
            if cell is None:
                # Hole in the grid: keep column alignment with an empty cell.
                out.append(f"<{tag}></{tag}>")
                continue
            rs = _span(cell, "rowspan")
            cs = _span(cell, "colspan")
            attr_str = ""
            if rs > 1:
                attr_str += f' rowspan="{rs}"'
            if cs > 1:
                attr_str += f' colspan="{cs}"'
            out.append(f"<{tag}{attr_str}>{escape(_cell_text(cell))}</{tag}>")
        out.append("</tr>")

    # If the body section was never reached, every row was a header row and
    # <thead> is still open; otherwise <tbody> is the open section.
    out.append("</tbody>" if n_rows > body_start else "</thead>")
    out.append("</table>")
    return "".join(out)