文件预览

data_agent.py

查看 lobster-novel 技能包中的文件内容。

文件内容

agents/data_agent.py

from __future__ import annotations

import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from core.contract import (
    ChapterCommit,
    ChapterCommitEvent,
    EntityDelta,
    StateDelta,
    save_commit,
)
from core.story_state import StoryState


# ---------------------------------------------------------------------------
# Known markers
# ---------------------------------------------------------------------------

_DIALOGUE_MARKERS = {"说", "道", "问", "回答"}
_ACTION_KEYWORDS = {"推", "拉", "走", "跑", "站", "蹲", "握", "举", "打开", "关闭", "拿", "放"}
_REVELATION_KEYWORDS = {"发现", "知道", "意识到", "明白", "看到"}
_TRANSITION_KEYWORDS = {"最后", "终于", "从此", "这是"}
_EMOTION_KEYWORDS = {"哭", "笑", "沉默", "颤抖", "紧", "深呼"}
_WORLD_BUILDING_KEYWORDS = {"世界", "规则", "力量", "历史", "传说"}

_KNOWN_LOCATIONS = {
    "灰港镇", "铁匠铺", "码头", "盐碱地", "荒地", "灯塔",
    "酒馆", "教堂", "广场", "城墙", "港口", "市场",
    "议事厅", "老城区", "新城区", "旅馆", "钟楼", "墓地",
    "军营", "地牢", "下水道", "神庙", "祭坛",
}
_TIME_MARKERS = {
    "清晨", "午后", "黄昏", "傍晚", "深夜", "黎明",
    "天亮", "天黑", "日出", "日落", "早晨", "中午", "晚上",
}
_LOCATION_KEYWORDS = list(_KNOWN_LOCATIONS) + [
    "森林", "山谷", "河畔", "桥头", "山洞", "矿洞",
    "城堡", "塔楼", "花园", "庭院", "大殿",
]
_HOOK_CREATE_PATTERNS = ["留下了一个", "记住了", "标记了", "预示"]
_HOOK_RESOLVE_PATTERNS = [
    "兑现", "终于", "解开了", "找到了答案",
    "消散", "解体", "消失", "缩回地下", "闭合", "关闭",
    "完成了任务", "回到了", "路通了", "母亲", "穿过光门",
]

# ---------------------------------------------------------------------------
# Name-extraction exclusions (common non-character phrases matching the regex)
# ---------------------------------------------------------------------------

_NAME_EXCLUSIONS = {
    # 中文功能词/代词/句子碎片(防注册为角色名)
    "她", "他", "它", "这", "那", "我", "你", "我们", "他们", "她们",
    "它们", "自己", "什么", "怎么", "这样", "那样", "那个", "这个",
    "不是", "就是", "但是", "可是", "只是", "还是", "或者", "而且",
    "因为", "所以", "然后", "之后", "之前", "时候", "地方", "东西",
    "一个", "一些", "一点", "那种", "这些", "那些", "那么", "这么",
    "没有", "可以", "应该", "能够", "需要", "可能", "已经", "正在",
    "看到", "听到", "感到", "觉得", "知道", "发现", "开始", "结束",
    "继续", "停下", "走出", "进入", "回到", "离开", "穿过", "打开",
    "看见", "听见", "像是", "不像", "不像", "仿佛", "好像", "似乎",
    "盖子", "传来", "传来", "告诉", "回答", "解释", "描述", "补充",
    "这个", "那个", "哪个", "这些", "那些",
    "可以", "可是", "还是", "就是", "但是", "而且", "或者",
    "因为", "所以", "如果", "虽然", "然而", "不过", "只是",
    "什么", "怎么", "这样", "那样", "这么", "那么",
    "他们", "她们", "它们", "我们", "你们", "大家",
    "没有", "还有", "已经", "知道", "觉得", "看见",
    "需要", "想要", "开始", "继续", "后来", "接着",
    "刚才", "立刻", "马上", "突然", "然后", "最后",
    "前面", "后面", "左边", "右边", "上面", "下面",
    "一眼", "一下", "第一", "第二", "首先", "其次",
    "比如", "例如", "据说", "所谓", "可谓", "便是",
    "算是", "说是", "凡是", "算是","总是","全是","倒是","就是",
}


class DataAgent:
    """Extracts structured ChapterCommit data from chapter text using rule-based heuristics."""

    def __init__(self, project_dir: str | Path) -> None:
        self.project_dir = Path(project_dir)

    # ------------------------------------------------------------------
    # Main extraction
    # ------------------------------------------------------------------

    def extract_commit(
        self,
        chapter_text: str,
        chapter_number: int,
        chapter_title: str,
        story_state: StoryState,
    ) -> ChapterCommit:
        lines = chapter_text.split("\n")
        total_lines = len(lines)

        # 1. word_count
        word_count = sum(1 for ch in chapter_text if "\u4e00" <= ch <= "\u9fff")

        # 2. events
        events = self._extract_events(lines, story_state)
        if not events:
            first_line = lines[0].strip() if lines else ""
            events = [
                ChapterCommitEvent(
                    event_type="action",
                    description=(first_line[:80] if first_line else "(空章节)"),
                    strand="quest",
                )
            ]

        # 3. state_delta
        state_delta = self._extract_state_delta(lines, story_state)

        # 4. entity_delta
        entity_delta = self._extract_entity_delta(
            chapter_text, lines, story_state
        )

        # 5. summary_text
        summary_text = chapter_text[:100].replace("\n", " ").strip()

        # 6 / 7. hooks
        hooks_created = self._extract_hooks(lines, chapter_number)
        hooks_resolved = self._extract_hooks_resolved(lines)

        # 8. strand_ratios
        strand_ratios = self._calculate_strand_ratios(events)

        commit = ChapterCommit(
            chapter_number=chapter_number,
            chapter_title=chapter_title,
            word_count=word_count,
            events=events,
            state_delta=state_delta,
            entity_delta=entity_delta,
            summary_text=summary_text,
            hooks_created=hooks_created,
            hooks_resolved=hooks_resolved,
            strand_ratios=strand_ratios,
            timestamp=datetime.now(timezone.utc).isoformat(),
        )

        # Print brief summary
        print(
            f"[DataAgent] Chapter {chapter_number} — "
            f"{word_count} chars, {len(events)} events, "
            f"{len(state_delta.character_states)} characters, "
            f"{len(entity_delta.new_characters)} new entities"
        )
        return commit

    # ------------------------------------------------------------------
    # Event extraction
    # ------------------------------------------------------------------

    @staticmethod
    def _classify_line(
        line: str, known_chars: set[str]
    ) -> tuple[str, str, list[str], str | None]:
        """Return (event_type, description, entities_mentioned, strand_override)."""
        stripped = line.strip()
        desc = stripped[:80]

        entities = [c for c in known_chars if c in stripped]
        strand = None  # None means default "quest"

        # Try each event type in priority order
        # emotional_beat (before action so "颤抖" doesn't get caught as action)
        if any(kw in stripped for kw in _EMOTION_KEYWORDS):
            return "emotional_beat", desc, entities, strand

        # dialogue
        if any(kw in stripped for kw in _DIALOGUE_MARKERS) and stripped.endswith(
            ("。", "!", "?", "”", "」", "』")
        ):
            if any(kw in stripped for kw in _WORLD_BUILDING_KEYWORDS):
                strand = "constellation"
            else:
                strand = "fire"
            return "dialogue", desc, entities, strand

        if any(kw in stripped for kw in _WORLD_BUILDING_KEYWORDS):
            strand = "constellation"

        # revelation
        if any(kw in stripped for kw in _REVELATION_KEYWORDS):
            if strand is None:
                strand = "constellation" if any(
                    kw in stripped for kw in _WORLD_BUILDING_KEYWORDS
                ) else "quest"
            return "revelation", desc, entities, strand

        # action
        if any(kw in stripped for kw in _ACTION_KEYWORDS):
            return "action", desc, entities, strand

        # transition
        if any(kw in stripped for kw in _TRANSITION_KEYWORDS):
            return "transition", desc, entities, "quest"

        # Fallback — check if it looks like a sentence (ends with punctuation)
        if stripped and stripped[-1] in "。!?":
            if any(kw in stripped for kw in _WORLD_BUILDING_KEYWORDS):
                return "action", desc, entities, "constellation"
            return "action", desc, entities, strand

        return "", desc, entities, strand

    def _extract_events(
        self, lines: list[str], story_state: StoryState
    ) -> list[ChapterCommitEvent]:
        known_chars = set()
        for cid, ch in story_state.characters.items():
            known_chars.add(cid)
            known_chars.add(ch.name)

        events: list[ChapterCommitEvent] = []
        for i, raw_line in enumerate(lines):
            line = raw_line.strip()
            if not line:
                continue

            event_type, desc, entities, strand_override = self._classify_line(
                line, known_chars
            )
            if not event_type:
                continue

            strand = strand_override or "quest"

            events.append(
                ChapterCommitEvent(
                    event_type=event_type,
                    description=desc,
                    entities_mentioned=entities,
                    strand=strand,
                )
            )
        return events

    # ------------------------------------------------------------------
    # State delta
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_state_delta(
        lines: list[str], story_state: StoryState
    ) -> StateDelta:
        character_states: dict[str, str] = {}
        location: str | None = None
        time_progression: str | None = None

        full_text = "\n".join(lines)

        # Character presence
        for cid, ch in story_state.characters.items():
            if cid in full_text or ch.name in full_text:
                character_states[cid] = "present"

        # Location — scan for known location markers, take last occurrence
        last_loc_pos = -1
        for loc in _LOCATION_KEYWORDS:
            pos = full_text.rfind(loc)
            if pos > last_loc_pos:
                last_loc_pos = pos
                location = loc

        # Time — scan for time markers, take last occurrence
        last_time_pos = -1
        for tm in _TIME_MARKERS:
            pos = full_text.rfind(tm)
            if pos > last_time_pos:
                last_time_pos = pos
                time_progression = tm

        return StateDelta(
            character_states=character_states,
            location=location,
            time_progression=time_progression,
        )

    # ------------------------------------------------------------------
    # Entity delta
    # ------------------------------------------------------------------

    def _extract_entity_delta(
        self,
        chapter_text: str,
        lines: list[str],
        story_state: StoryState,
    ) -> EntityDelta:
        known_char_ids = set(story_state.characters.keys())
        known_char_names = {ch.name for ch in story_state.characters.values()}
        all_known_chars = known_char_ids | known_char_names

        # New characters: find potential name-patterns (2-4 char proper noun
        # followed by 是/说/叫/名为 at sentence-start or after punctuation)
        new_characters: list[dict[str, Any]] = []
        new_char_matches = re.findall(
            r"(?:^|[。!?,\n])([\u4e00-\u9fff]{2,4})(?:是|说|叫|名为)",
            chapter_text,
        )
        seen_names: set[str] = set()
        for name in new_char_matches:
            name = name.strip()
            if (
                name
                and name not in all_known_chars
                and name not in seen_names
                and name not in _NAME_EXCLUSIONS
                and not any(
                    kw in name for kw in _DIALOGUE_MARKERS | _ACTION_KEYWORDS
                )
            ):
                seen_names.add(name)
                new_characters.append(
                    {"id": name, "name": name, "role": "配角", "description": ""}
                )

        # New locations: find location keywords NOT already in the known set
        new_locations: list[dict[str, Any]] = []
        found_new_locs: set[str] = set()
        for loc in _LOCATION_KEYWORDS:
            if loc in chapter_text and loc not in _KNOWN_LOCATIONS and loc not in found_new_locs:
                found_new_locs.add(loc)
                new_locations.append(
                    {"id": loc, "name": loc, "description": ""}
                )

        return EntityDelta(
            new_characters=new_characters,
            new_locations=new_locations,
        )

    # ------------------------------------------------------------------
    # Hooks
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_hooks(
        lines: list[str], chapter_number: int
    ) -> list[str]:
        hooks: list[str] = []
        for line in lines:
            stripped = line.strip()
            for pattern in _HOOK_CREATE_PATTERNS:
                if pattern in stripped:
                    if stripped not in hooks:
                        hooks.append(stripped[:120])
                    break
        return hooks

    @staticmethod
    def _extract_hooks_resolved(lines: list[str]) -> list[str]:
        resolved: list[str] = []
        for line in lines:
            stripped = line.strip()
            for pattern in _HOOK_RESOLVE_PATTERNS:
                if pattern in stripped:
                    if stripped not in resolved:
                        resolved.append(stripped[:120])
                    break
        return resolved

    # ------------------------------------------------------------------
    # Strand ratios
    # ------------------------------------------------------------------

    @staticmethod
    def _calculate_strand_ratios(
        events: list[ChapterCommitEvent],
    ) -> dict[str, float]:
        total = len(events)
        if total == 0:
            return {"quest": 0.0, "fire": 0.0, "constellation": 0.0}

        counts: dict[str, int] = {"quest": 0, "fire": 0, "constellation": 0}
        for e in events:
            s = e.strand
            if s in counts:
                counts[s] += 1

        return {
            "quest": round(counts["quest"] / total, 4),
            "fire": round(counts["fire"] / total, 4),
            "constellation": round(counts["constellation"] / total, 4),
        }

    # ------------------------------------------------------------------
    # Save & integrate
    # ------------------------------------------------------------------

    def save_and_integrate(
        self, commit: ChapterCommit, story_state: StoryState
    ) -> None:
        save_commit(commit, self.project_dir)
        story_state.apply_commit(commit)
        story_state.save(self.project_dir)
        print(
            f"[DataAgent] Chapter {commit.chapter_number} commit saved and "
            f"integrated into story state."
        )