文件预览

book_writer.py

查看 Book Writer 技能包中的文件内容。

文件内容

scripts/book_writer.py

#!/usr/bin/env python3
"""
智能写书主程序 - 根据提示词生成书籍大纲并逐级扩写内容
"""

import os
import sys
import json
import yaml
import re
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
import logging

# 尝试导入依赖,如果失败则提供替代方案
try:
    import openai
    from openai import OpenAI
    HAS_OPENAI = True
except ImportError:
    HAS_OPENAI = False

try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False

# 导入内容优化器
try:
    from .content_optimizer import ContentOptimizer
    HAS_CONTENT_OPTIMIZER = True
except ImportError:
    HAS_CONTENT_OPTIMIZER = False

# 配置日志
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/book_writer.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

@dataclass
class BookOutline:
    """书籍大纲数据结构"""
    title: str
    subtitle: Optional[str]
    chapters: List[Dict[str, Any]]
    metadata: Dict[str, Any]

@dataclass
class ChapterContent:
    """章节内容数据结构"""
    chapter_number: int
    title: str
    sections: List[Dict[str, Any]]
    content: str
    formulas: List[str]
    figures: List[Dict[str, str]]
    tables: List[Dict[str, Any]]
    code_snippets: List[Dict[str, str]]
    references: List[str]

@dataclass
class Book:
    """整本书的数据结构"""
    title: str
    outline: BookOutline
    chapters: List[ChapterContent]
    metadata: Dict[str, Any]

class BookWriter:
    """智能写书主类"""

    def __init__(self, config_path: str = "config.yaml"):
        """
        初始化写书器

        Args:
            config_path: 配置文件路径
        """
        self.config = self._load_config(config_path)
        self.client = None
        self.content_optimizer = None
        
        # 初始化OpenAI客户端
        self._initialize_openai_client()
        
        # 初始化内容优化器
        self._initialize_content_optimizer()

        # 创建输出目录
        self.output_dir = self.config.get("storage", {}).get("output_dir", "generated_books")
        self.temp_dir = self.config.get("storage", {}).get("temp_dir", "temp_files")
        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
        Path(self.temp_dir).mkdir(parents=True, exist_ok=True)

        logger.info(f"写书器初始化完成,输出目录: {self.output_dir}")

    def _load_config(self, config_path: str) -> Dict:
        """加载配置文件"""
        if not os.path.exists(config_path):
            logger.warning(f"配置文件 {config_path} 不存在,使用默认配置")
            return {}

        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
            logger.info(f"配置文件加载成功: {config_path}")
            return config or {}
        except Exception as e:
            logger.error(f"加载配置文件失败: {e}")
            return {}

    def _initialize_openai_client(self):
        """初始化OpenAI客户端"""
        if not HAS_OPENAI:
            logger.warning("OpenAI库未安装,将使用模拟模式")
            return

        openai_config = self.config.get("openai", {})
        api_key = os.environ.get("OPENAI_API_KEY") or openai_config.get("api_key")

        if not api_key:
            logger.warning("OpenAI API密钥未设置,将使用模拟模式")
            return

        try:
            self.client = OpenAI(api_key=api_key)
            logger.info("OpenAI客户端初始化成功")
        except Exception as e:
            logger.error(f"OpenAI客户端初始化失败: {e}")

    def _initialize_content_optimizer(self):
        """初始化内容优化器"""
        if HAS_CONTENT_OPTIMIZER:
            try:
                self.content_optimizer = ContentOptimizer()
                logger.info("内容优化器初始化成功")
            except Exception as e:
                logger.error(f"内容优化器初始化失败: {e}")
                self.content_optimizer = None

    def generate_outline(self, prompt: str, max_chapters: Optional[int] = None) -> BookOutline:
        """
        根据提示词生成书籍大纲

        Args:
            prompt: 书籍主题提示词
            max_chapters: 最大章节数

        Returns:
            BookOutline: 生成的书籍大纲
        """
        logger.info(f"开始生成书籍大纲: {prompt}")

        # 使用默认值或配置值
        if max_chapters is None:
            max_chapters = self.config.get("defaults", {}).get("max_chapters", 10)

        # 构造请求
        outline_prompt = f"""
        请为以下主题生成一本结构完整的书籍大纲:
        主题: {prompt}

        要求:
        1. 生成不超过{max_chapters}章的大纲
        2. 每章包含3-5个小节
        3. 为每章和每节提供简要描述
        4. 考虑目标读者为{self.config.get('content', {}).get('target_audience', 'general')}
        5. 内容风格应为{self.config.get('content', {}).get('writing_style', 'informative')}

        请以JSON格式返回,结构如下:
        {{
            "title": "书籍标题",
            "subtitle": "副标题(可选)",
            "chapters": [
                {{
                    "chapter_number": 1,
                    "title": "第一章标题",
                    "description": "章节简介",
                    "sections": [
                        {{
                            "section_number": 1,
                            "title": "第一节标题",
                            "description": "小节简介"
                        }}
                    ]
                }}
            ],
            "metadata": {{
                "topic": "{prompt}",
                "target_audience": "{self.config.get('content', {}).get('target_audience', 'general')}",
                "writing_style": "{self.config.get('content', {}).get('writing_style', 'informative')}"
            }}
        }}
        """

        if self.client:
            # 使用真实的OpenAI API
            try:
                response = self.client.chat.completions.create(
                    model=self.config.get("openai", {}).get("model", "gpt-4o"),
                    messages=[{"role": "user", "content": outline_prompt}],
                    temperature=self.config.get("openai", {}).get("temperature", 0.7),
                    max_tokens=self.config.get("openai", {}).get("max_tokens", 2000)
                )
                
                response_text = response.choices[0].message.content
                logger.info(f"OpenAI响应: {response_text[:200]}...")
            except Exception as e:
                logger.error(f"OpenAI API调用失败: {e}")
                response_text = self._generate_outline_mock(prompt, max_chapters)  # 使用模拟数据
        else:
            # 使用模拟模式
            response_text = self._generate_outline_mock(prompt, max_chapters)

        # 解析响应
        try:
            # 尝试提取JSON部分
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group()
                outline_data = json.loads(json_str)
                
                # 创建BookOutline对象
                outline = BookOutline(
                    title=outline_data.get("title", "未命名书籍"),
                    subtitle=outline_data.get("subtitle"),
                    chapters=outline_data.get("chapters", []),
                    metadata=outline_data.get("metadata", {})
                )
                
                logger.info(f"大纲生成成功,共{len(outline.chapters)}章")
                return outline
            else:
                logger.error("未能从响应中提取JSON数据")
                # 返回一个基本的大纲
                return self._create_basic_outline(prompt, max_chapters)
                
        except json.JSONDecodeError as e:
            logger.error(f"JSON解析失败: {e}")
            logger.error(f"响应内容: {response_text}")
            return self._create_basic_outline(prompt, max_chapters)

    def _generate_outline_mock(self, prompt: str, max_chapters: int) -> str:
        """模拟生成大纲(用于测试或无API密钥时)"""
        logger.info(f"使用模拟模式生成大纲: {prompt}")
        
        # 创建一个模拟的JSON响应
        mock_outline = {
            "title": f"{prompt} - 完整指南",
            "subtitle": f"深入理解{prompt}的核心概念与应用",
            "chapters": [],
            "metadata": {
                "topic": prompt,
                "target_audience": self.config.get('content', {}).get('target_audience', 'general'),
                "writing_style": self.config.get('content', {}).get('writing_style', 'informative')
            }
        }
        
        for i in range(1, min(max_chapters, 5) + 1):  # 限制为最多5章以保持简洁
            chapter = {
                "chapter_number": i,
                "title": f"第{i}章 {prompt}基础",
                "description": f"介绍{prompt}的基本概念和原理",
                "sections": []
            }
            
            for j in range(1, 4):  # 每章3个小节
                section_title = f"{chapter['title']}的小节{j}"
                chapter["sections"].append({
                    "section_number": j,
                    "title": section_title,
                    "description": f"探讨{section_title}的相关内容"
                })
            
            mock_outline["chapters"].append(chapter)
        
        return json.dumps(mock_outline, ensure_ascii=False, indent=2)

    def _create_basic_outline(self, prompt: str, max_chapters: int) -> BookOutline:
        """创建基本大纲(当JSON解析失败时)"""
        logger.warning("创建基本大纲结构")
        
        chapters = []
        for i in range(1, min(max_chapters, 5) + 1):
            chapters.append({
                "chapter_number": i,
                "title": f"第{i}章",
                "description": f"关于{prompt}的第{i}部分内容",
                "sections": [{
                    "section_number": 1,
                    "title": "引言",
                    "description": "本章引言"
                }, {
                    "section_number": 2,
                    "title": "主要内容",
                    "description": "本章主要内容"
                }, {
                    "section_number": 3,
                    "title": "总结",
                    "description": "本章总结"
                }]
            })
        
        return BookOutline(
            title=f"{prompt}指南",
            subtitle=f"关于{prompt}的全面介绍",
            chapters=chapters,
            metadata={
                "topic": prompt,
                "target_audience": self.config.get('content', {}).get('target_audience', 'general'),
                "writing_style": self.config.get('content', {}).get('writing_style', 'informative')
            }
        )

    def expand_chapter(self, chapter_data: Dict, chapter_index: int) -> ChapterContent:
        """
        扩写单个章节

        Args:
            chapter_data: 章节数据
            chapter_index: 章节索引

        Returns:
            ChapterContent: 扩写后的章节内容
        """
        logger.info(f"开始扩写章节: {chapter_data['title']}")

        # 构造扩写提示
        expand_prompt = f"""
        请详细扩写以下章节内容:
        章节标题: {chapter_data['title']}
        章节描述: {chapter_data['description']}
        
        该章节包含以下小节:
        {chr(10).join([f"- {sec['title']}: {sec['description']}" for sec in chapter_data['sections']])}
        
        要求:
        1. 写作长度为{self.config.get('defaults', {}).get('content_length', 'medium')}
        2. 包含适当的标题层级结构
        3. 如适用,添加相关的数学公式、代码示例、图表描述或表格
        4. 保持{self.config.get('content', {}).get('writing_style', 'informative')}的写作风格
        5. 如适用,包含相关引用和参考文献
        
        请返回内容,包含:
        - 主要内容文本
        - 数学公式列表(如果有)
        - 代码示例列表(如果有)
        - 图表描述列表(如果有)
        - 表格数据列表(如果有)
        - 参考文献列表(如果有)
        """

        if self.client:
            # 使用真实的OpenAI API
            try:
                response = self.client.chat.completions.create(
                    model=self.config.get("openai", {}).get("model", "gpt-4o"),
                    messages=[{"role": "user", "content": expand_prompt}],
                    temperature=self.config.get("openai", {}).get("temperature", 0.7),
                    max_tokens=self.config.get("openai", {}).get("max_tokens", 3000)
                )
                
                content = response.choices[0].message.content
            except Exception as e:
                logger.error(f"OpenAI API调用失败: {e}")
                content = self._expand_chapter_mock(chapter_data, chapter_index)  # 使用模拟数据
        else:
            # 使用模拟模式
            content = self._expand_chapter_mock(chapter_data, chapter_index)

        # 解析内容,提取各种元素
        chapter_content = self._parse_expanded_content(content, chapter_data, chapter_index)
        
        logger.info(f"章节扩写完成: {chapter_data['title']}")
        return chapter_content

    def _expand_chapter_mock(self, chapter_data: Dict, chapter_index: int) -> str:
        """模拟扩写章节内容"""
        logger.info(f"使用模拟模式扩写章节: {chapter_data['title']}")
        
        return f"""
# {chapter_data['title']}

## 章节概述

本章将详细介绍{chapter_data['title']}的相关内容。我们将从基本概念入手,逐步深入探讨其核心原理和实际应用。

## 主要内容

{chapter_data['description']}

在这一部分,我们将:

1. 介绍基本定义和概念
2. 探讨核心原理
3. 分析实际应用场景

## 小节内容

"""

    def _parse_expanded_content(self, content: str, chapter_data: Dict, chapter_index: int) -> ChapterContent:
        """解析扩写后的内容,提取各种元素"""
        # 提取数学公式(LaTeX格式)
        formula_pattern = r'\$\$(.*?)\$\$|\$(.*?)\$'
        formulas = re.findall(formula_pattern, content, re.DOTALL)
        formulas = [item[0] if item[0] else item[1] for item in formulas if item[0] or item[1]]

        # 提取代码块
        code_pattern = r'```(\w*)\n(.*?)```'
        code_matches = re.findall(code_pattern, content, re.DOTALL)
        code_snippets = [{"language": lang, "code": code.strip()} for lang, code in code_matches]

        # 提取图表描述(简单模式)
        figure_pattern = r'图\d+[::]\s*(.*?)(?:\n|$)'
        figures = [{"caption": cap.strip(), "description": cap.strip()} for cap in re.findall(figure_pattern, content)]

        # 提取表格(简单模式)
        table_pattern = r'表\d+[::]\s*(.*?)(?:\n|$)'
        tables = [{"title": tbl.strip(), "description": tbl.strip()} for tbl in re.findall(table_pattern, content)]

        # 提取参考文献
        ref_patterns = [
            r'\[(\d+)\].*?',
            r'(?:参考文献|References).*?\n((?:.*?\n)*?)\n\n',
            r'\[([^\]]+)\]'
        ]
        references = []
        for pat in ref_patterns:
            refs = re.findall(pat, content)
            references.extend(refs)

        return ChapterContent(
            chapter_number=chapter_index,
            title=chapter_data['title'],
            sections=chapter_data['sections'],
            content=content,
            formulas=formulas,
            figures=figures,
            tables=tables,
            code_snippets=code_snippets,
            references=list(set(references))  # 去重
        )

    def expand_book(self, outline: BookOutline, max_chapters: Optional[int] = None) -> Book:
        """
        扩写整本书

        Args:
            outline: 书籍大纲
            max_chapters: 最大扩写章节数

        Returns:
            Book: 扩写后的书籍
        """
        logger.info(f"开始扩写整本书: {outline.title}")

        if max_chapters is None:
            max_chapters = self.config.get("defaults", {}).get("max_chapters", 3)  # 默认只扩写3章

        chapters_to_expand = outline.chapters[:max_chapters]
        expanded_chapters = []

        for i, chapter_data in enumerate(chapters_to_expand):
            logger.info(f"正在扩写第{i+1}/{len(chapters_to_expand)}章")
            chapter_content = self.expand_chapter(chapter_data, i+1)
            expanded_chapters.append(chapter_content)

        book = Book(
            title=outline.title,
            outline=outline,
            chapters=expanded_chapters,
            metadata=outline.metadata
        )

        logger.info(f"书籍扩写完成,共{len(expanded_chapters)}章")
        return book

    def save_book(self, book: Book, output_path: str):
        """
        保存书籍到文件

        Args:
            book: 书籍对象
            output_path: 输出路径
        """
        output_dir = Path(self.output_dir) / output_path
        output_dir.mkdir(parents=True, exist_ok=True)

        # 保存完整书籍数据
        book_data = {
            "title": book.title,
            "outline": asdict(book.outline),
            "chapters": [asdict(chapter) for chapter in book.chapters],
            "metadata": book.metadata
        }

        with open(output_dir / "book.json", 'w', encoding='utf-8') as f:
            json.dump(book_data, f, ensure_ascii=False, indent=2)

        # 为每一章创建单独的文件
        for chapter in book.chapters:
            chapter_file = output_dir / f"chapter_{chapter.chapter_number:02d}.md"
            with open(chapter_file, 'w', encoding='utf-8') as f:
                f.write(f"# {chapter.title}\n\n")
                f.write(chapter.content)
                
                # 添加公式
                if chapter.formulas:
                    f.write("\n## 数学公式\n\n")
                    for i, formula in enumerate(chapter.formulas):
                        f.write(f"$$\n{formula}\n$$\n\n")
                
                # 添加代码
                if chapter.code_snippets:
                    f.write("\n## 代码示例\n\n")
                    for snippet in chapter.code_snippets:
                        f.write(f"```{snippet['language']}\n{snippet['code']}\n```\n\n")
                
                # 添加图表
                if chapter.figures:
                    f.write("\n## 图表\n\n")
                    for figure in chapter.figures:
                        f.write(f"图: {figure['caption']}\n\n")
                
                # 添加表格
                if chapter.tables:
                    f.write("\n## 表格\n\n")
                    for table in chapter.tables:
                        f.write(f"表: {table['title']}\n\n")

        logger.info(f"书籍已保存到: {output_dir}")

def main():
    """命令行入口点"""
    import argparse

    parser = argparse.ArgumentParser(description="智能写书工具")
    parser.add_argument("--action", choices=["outline", "expand", "full"], 
                       default="outline", help="操作类型: outline(生成大纲), expand(扩写内容), full(全流程)")
    parser.add_argument("--prompt", type=str, help="书籍主题提示词")
    parser.add_argument("--book-path", type=str, help="书籍路径(扩写时使用)")
    parser.add_argument("--chapters", type=str, default="1,2,3", help="要扩写的章节(逗号分隔)")
    parser.add_argument("--output", type=str, help="输出目录名称")
    parser.add_argument("--max-chapters", type=int, default=3, help="最大章节数")

    args = parser.parse_args()

    # 创建写书器
    writer = BookWriter()

    if args.action == "outline":
        if not args.prompt:
            print("❌ 错误: 生成大纲需要提供 --prompt 参数")
            return

        print(f"📖 正在为 '{args.prompt}' 生成书籍大纲...")
        outline = writer.generate_outline(args.prompt, args.max_chapters)
        
        print(f"✅ 大纲生成完成!共 {len(outline.chapters)} 章")
        print(f"📚 书籍标题: {outline.title}")
        
        # 保存大纲
        output_name = args.output or outline.title.replace(" ", "_").replace("/", "_")
        outline_path = Path(writer.output_dir) / output_name
        outline_path.mkdir(parents=True, exist_ok=True)
        
        outline_data = {
            "title": outline.title,
            "subtitle": outline.subtitle,
            "chapters": outline.chapters,
            "metadata": outline.metadata
        }
        
        with open(outline_path / "outline.json", 'w', encoding='utf-8') as f:
            json.dump(outline_data, f, ensure_ascii=False, indent=2)
        
        print(f"💾 大纲已保存到: {outline_path}/outline.json")

    elif args.action == "expand":
        if not args.book_path:
            print("❌ 错误: 扩写内容需要提供 --book-path 参数")
            return
            
        if not args.prompt:
            print("❌ 错误: 扩写内容需要提供 --prompt 参数(用于重新生成大纲)")
            # 我们将从现有的大纲文件加载
            outline_path = Path(writer.output_dir) / args.book_path / "outline.json"
            if not outline_path.exists():
                print(f"❌ 错误: 未找到大纲文件 {outline_path}")
                return
            with open(outline_path, 'r', encoding='utf-8') as f:
                outline_data = json.load(f)
            # 重构BookOutline对象
            outline = BookOutline(
                title=outline_data["title"],
                subtitle=outline_data["subtitle"],
                chapters=outline_data["chapters"],
                metadata=outline_data["metadata"]
            )
        else:
            # 生成新大纲
            print(f"📖 正在为 '{args.prompt}' 生成书籍大纲...")
            outline = writer.generate_outline(args.prompt, args.max_chapters)

        print(f"✍️  正在扩写书籍内容...")
        selected_chapters = [int(x.strip()) for x in args.chapters.split(",")]
        max_chap = max(selected_chapters) if selected_chapters else args.max_chapters
        book = writer.expand_book(outline, max_chap)
        
        print(f"✅ 内容扩写完成!")
        
        # 保存完整书籍
        output_name = args.output or args.book_path
        writer.save_book(book, output_name)
        
        print(f"💾 书籍已保存到: {writer.output_dir}/{output_name}")

    elif args.action == "full":
        if not args.prompt:
            print("❌ 错误: 全流程操作需要提供 --prompt 参数")
            return

        print(f"📖 正在为 '{args.prompt}' 生成书籍大纲...")
        outline = writer.generate_outline(args.prompt, args.max_chapters)
        
        print(f"✍️  正在扩写书籍内容...")
        book = writer.expand_book(outline, args.max_chapters)
        
        output_name = args.output or outline.title.replace(" ", "_").replace("/", "_")
        writer.save_book(book, output_name)
        
        print(f"🎉 书籍生成完成!")
        print(f"📚 书籍标题: {book.title}")
        print(f"💾 保存位置: {writer.output_dir}/{output_name}")

if __name__ == "__main__":
    main()