文件预览

nlp_index.py

查看 OmniSkill 通用技能调度器 技能包中的文件内容。

文件内容

src/dispatcher/nlp_index.py

"""
词频逆文档频率向量索引 (TF-IDF NLP Index)
用于处理自然语言,匹配最合适的技能。
"""
import math
from collections import defaultdict, Counter
from typing import List, Tuple

class TFIDFIndex:
    """轻量级 TF-IDF 向量索引"""
    def __init__(self):
        self.documents = {} # 文档内容库
        self.document_tokens = {} # 分词结果库
        self.word_document_count = defaultdict(int) # 词频 (文档频次)
        self.total_documents = 0 # 总文档数

    def _tokenize(self, text: str) -> List[str]:
        """简单分词器,按字拆分并转小写"""
        return [char.lower() for char in text if char.strip()]

    def add_skill(self, skill_id: str, description: str):
        """添加技能描述到索引"""
        tokens = self._tokenize(description)
        self.documents[skill_id] = description
        self.document_tokens[skill_id] = tokens
        self.total_documents += 1

        # 记录包含该词的文档数量,用于计算逆文档频率
        unique_tokens = set(tokens)
        for token in unique_tokens:
            self.word_document_count[token] += 1

    def match(self, query: str, top_k: int = 1) -> List[Tuple[str, float]]:
        """计算余弦相似度并返回最匹配技能"""
        if self.total_documents == 0:
            return []

        query_tokens = self._tokenize(query)
        query_counter = Counter(query_tokens)

        scores = defaultdict(float)

        # 遍历所有文档,计算相似度得分
        for skill_id, tokens in self.document_tokens.items():
            if not tokens:
                continue
                
            skill_counter = Counter(tokens)
            score = 0.0

            for token, q_count in query_counter.items():
                if token in skill_counter:
                    # 词频 (Term Frequency)
                    tf = skill_counter[token] / len(tokens)
                    # 逆文档频率 (Inverse Document Frequency)
                    idf = math.log(self.total_documents / (1 + self.word_document_count[token]))
                    # 累加权重得分
                    score += q_count * (tf * idf)

            scores[skill_id] = score

        # 按得分从高到低排序
        sorted_skills = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        # 仅返回得分大于0的技能
        return [(skill_id, score) for skill_id, score in sorted_skills[:top_k] if score > 0]