文件预览

chinese_typeset.py

查看 lobster-novel 技能包中的文件内容。

文件内容

core/chinese_typeset.py

#!/usr/bin/env python3
"""
lobster-novel: Chinese typesetting engine (inspired by 马良写作 排版清洗).
Rules:
1. 全角英文/数字 → 半角
2. 半角标点(中文语境)→ 全角
3. 引号统一为中文引号(「」『』"")
4. 段落间距标准化
5. 中文与英文之间的空格规范化
6. 重复标点清理
"""
import re
from pathlib import Path

# ═══════════════════════════════════════════════════════════════
# Rule definitions
# ═══════════════════════════════════════════════════════════════

class ChineseTypeset:
    """Chinese text typesetting rules engine."""

    @staticmethod
    def fullwidth_to_halfwidth(text: str) -> str:
        """Convert fullwidth ASCII/numbers to halfwidth."""
        result = []
        for ch in text:
            code = ord(ch)
            # Fullwidth letters A-Z (FF21-FF3A), a-z (FF41-FF5A)
            if 0xFF21 <= code <= 0xFF3A:
                result.append(chr(code - 0xFEE0))
            elif 0xFF41 <= code <= 0xFF5A:
                result.append(chr(code - 0xFEE0))
            # Fullwidth digits 0-9 (FF10-FF19)
            elif 0xFF10 <= code <= 0xFF19:
                result.append(chr(code - 0xFEE0))
            # Fullwidth @ (FF20), other common symbols
            elif code == 0xFF20:  # @
                result.append("@")
            else:
                result.append(ch)
        return "".join(result)

    @staticmethod
    def halfwidth_to_fullwidth_punct(text: str) -> str:
        """Convert halfwidth punctuation to fullwidth in Chinese context."""
        # Only convert when adjacent to Chinese chars
        def _replace_in_chinese(m):
            return chr(ord(m.group(0)) + 0xFEE0)

        for punct in [",", ".", ":", ";", "!", "?"]:
            # Before Chinese char
            text = re.sub(rf'(?<=[\u4e00-\u9fff]){re.escape(punct)}(?=[\u4e00-\u9fff])',
                          _replace_in_chinese, text)
            # After Chinese char, before space/non-letter
            text = re.sub(rf'(?<=[\u4e00-\u9fff]){re.escape(punct)}(?=\s|$)',
                          _replace_in_chinese, text)
        return text

    @staticmethod
    def normalize_quotes(text: str) -> str:
        """Normalize various quote styles to standard Chinese quotes."""
        # Curly/smart quotes to straight
        text = text.replace("\u201c", "\"").replace("\u201d", "\"")
        text = text.replace("\u2018", "'").replace("\u2019", "'")
        # Left/right single quotes
        text = text.replace("\u201c", "\"").replace("\u201d", "\"")

        # French angle quotes to Chinese book title marks
        text = text.replace("\u00ab", "\u300a").replace("\u00bb", "\u300b")

        # English quotes in Chinese context → use fullwidth quotes if preferred
        # (leave as-is, too aggressive to auto-convert)

        return text

    @staticmethod
    def normalize_spacing(text: str) -> str:
        """Normalize spacing between Chinese and English/numeric text."""
        # Add space between Chinese and English
        text = re.sub(r'([\u4e00-\u9fff])([a-zA-Z])', r'\1 \2', text)
        text = re.sub(r'([a-zA-Z])([\u4e00-\u9fff])', r'\1 \2', text)
        # Add space between Chinese and numbers
        text = re.sub(r'([\u4e00-\u9fff])(\d)', r'\1 \2', text)
        text = re.sub(r'(\d)([\u4e00-\u9fff])', r'\1 \2', text)
        # Remove double spaces
        text = re.sub(r'  +', ' ', text)
        return text

    @staticmethod
    def clean_duplicate_punct(text: str) -> str:
        """Clean up duplicated punctuation marks."""
        # !!!! or ???  →  single + paren
        text = re.sub(r'[!?!?]{3,}', lambda m: m.group(0)[:2] + "...", text)
        # …… repeated → normalize to exactly 2
        text = re.sub(r'。{3,}', '……', text)
        text = re.sub(r'\.{3,}', '……', text)
        # ……连续出现(如…………)→  normalize
        text = re.sub(r'。{6,}', '……', text)
        # Repeated 。。
        text = re.sub(r'。。{2,}', '。', text)
        # Repeated ,,
        text = re.sub(r',,{2,}', ',', text)
        return text

    @staticmethod
    def normalize_paragraphs(text: str) -> str:
        """Normalize paragraph spacing and indentation."""
        lines = text.split("\n")
        # Remove leading/trailing spaces per line
        lines = [l.strip() for l in lines]
        # Collapse 3+ consecutive blank lines to 2
        result = []
        blank_count = 0
        for l in lines:
            if not l:
                blank_count += 1
                if blank_count <= 2:
                    result.append("")
            else:
                blank_count = 0
                result.append(l)
        return "\n".join(result)

    @staticmethod
    def clean_all(text: str) -> str:
        """Run all typesetting rules in sequence."""
        text = ChineseTypeset.fullwidth_to_halfwidth(text)
        text = ChineseTypeset.halfwidth_to_fullwidth_punct(text)
        text = ChineseTypeset.normalize_quotes(text)
        text = ChineseTypeset.normalize_spacing(text)
        text = ChineseTypeset.clean_duplicate_punct(text)
        text = ChineseTypeset.normalize_paragraphs(text)
        return text.strip()


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Chinese typesetting engine")
    parser.add_argument("file", nargs="?", help="input file (stdin if omitted)")
    parser.add_argument("--output", "-o", help="output file (stdout if omitted)")
    args = parser.parse_args()

    if args.file:
        text = Path(args.file).read_text(encoding="utf-8")
    else:
        import sys
        text = sys.stdin.read()

    result = ChineseTypeset.clean_all(text)

    if args.output:
        Path(args.output).write_text(result, encoding="utf-8")
        print(f"Saved: {args.output} ({len(text)} → {len(result)} chars)")
    else:
        print(result)