文件预览

code_quality_analyzer.py

查看 Code Analysis Skills 技能包中的文件内容。

文件内容

src/analyzers/code_quality_analyzer.py

"""
Code Quality Analyzer - Analyzes code quality signals from Git history.

Metrics include:
  - Bug fix commit ratio
  - Hotfix/revert frequency
  - Large commit ratio (potential code smell)
  - Test file modification ratio
  - Average cyclomatic complexity (via radon for Python files)
  - Documentation update ratio
"""

import logging
import os
import re
import statistics
import tempfile
from collections import defaultdict
from typing import Dict

from src.analyzers.base_analyzer import BaseAnalyzer

logger = logging.getLogger(__name__)

# Patterns for identifying bug fix / hotfix commits. A commit message counts
# as a bug fix when ANY of these patterns matches. Each pattern is anchored on
# word boundaries so that unrelated words such as "prefix" or "fixture" do not
# match, and each accepts the usual inflections of its keyword.
BUG_FIX_PATTERNS = [
    # "fix", "fixes", "fixed", "fixing" and the compound "bugfix(es)".
    # The compound needs explicit handling: there is no word boundary inside
    # "bugfix", so neither a bare \bfix\b nor a bare \bbug\b would match it.
    re.compile(r"\b(?:bug)?fix(?:es|ed|ing)?\b", re.IGNORECASE),
    # "bug" / "bugs"
    re.compile(r"\bbugs?\b", re.IGNORECASE),
    re.compile(r"\bpatch\b", re.IGNORECASE),
    # "hotfix" / "hotfixes"
    re.compile(r"\bhotfix(?:es)?\b", re.IGNORECASE),
    # "resolve" / "resolves" / "resolved"
    re.compile(r"\bresolve[sd]?\b", re.IGNORECASE),
]

# Matches "revert", "reverts", "reverted" and "reverting" as whole words.
REVERT_PATTERN = re.compile(r"\brevert(?:s|ed|ing)?\b", re.IGNORECASE)

# A commit whose added + deleted line count exceeds this value is counted as
# a "large" commit.
LARGE_COMMIT_THRESHOLD = 500  # lines changed


class CodeQualityAnalyzer(BaseAnalyzer):
    """Analyzes code quality signals per author from Git history.

    Commits are obtained from ``self._get_commits()`` (not defined in this
    class; presumably provided by ``BaseAnalyzer`` -- verify there). Each
    commit is expected to expose ``author.name``, ``msg`` and
    ``modified_files``; each modified file is expected to expose
    ``new_path``, ``old_path``, ``added_lines``, ``deleted_lines`` and
    ``source_code``.
    """

    # Maximum number of Python source snapshots per author that are fed to
    # the complexity analysis. Collection stops at this cap as well, so no
    # more than this many file contents are held in memory per author.
    _MAX_COMPLEXITY_SAMPLES = 50

    # Splits a path into word tokens on case changes, digits and any
    # non-alphanumeric separator, e.g. "src/FooTest.java" ->
    # ["src", "Foo", "Test", "java"].
    _PATH_TOKEN_RE = re.compile(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|\d+")

    # Lower-cased path tokens that mark a file as test code.
    _TEST_TOKENS = frozenset({
        "test", "tests", "testing", "unittest", "unittests",
        "spec", "specs", "conftest",
    })

    # File extensions and path fragments that mark a file as documentation.
    _DOC_EXTENSIONS = (".md", ".rst", ".txt", ".adoc")
    _DOC_NAME_MARKERS = (
        "readme", "changelog", "contributing", "license", "docs/",
    )

    def analyze(self) -> Dict:
        """
        Analyze code quality signals for each author.

        Returns:
            Dict keyed by author name with code quality metrics. Each value
            contains: ``total_commits``, ``bug_fix_commits``,
            ``bug_fix_ratio``, ``revert_commits``, ``revert_ratio``,
            ``large_commits``, ``large_commit_ratio``,
            ``test_modification_ratio``, ``doc_modification_ratio``,
            ``avg_commit_size``, ``median_commit_size`` and
            ``avg_python_complexity``.
        """
        author_data = defaultdict(self._new_author_stats)
        max_samples = self._MAX_COMPLEXITY_SAMPLES

        for commit in self._get_commits():
            data = author_data[commit.author.name]
            data["total_commits"] += 1

            # Guard against a missing message so the regex searches below
            # never receive None.
            msg = commit.msg or ""

            # Bug fix detection
            if any(p.search(msg) for p in BUG_FIX_PATTERNS):
                data["bug_fix_commits"] += 1

            # Revert detection
            if REVERT_PATTERN.search(msg):
                data["revert_commits"] += 1

            # Analyze modified files
            total_lines_changed = 0
            samples = data["python_files_content"]
            for mod in commit.modified_files:
                filepath = mod.new_path or mod.old_path or ""
                total_lines_changed += mod.added_lines + mod.deleted_lines
                data["total_file_modifications"] += 1

                if self._is_test_file(filepath):
                    data["test_file_modifications"] += 1

                if self._is_doc_file(filepath):
                    data["doc_file_modifications"] += 1

                # Collect Python file content for complexity analysis. Only
                # the first _MAX_COMPLEXITY_SAMPLES snapshots are ever
                # analysed, so stop collecting (and stop reading
                # ``source_code``) once the cap is reached.
                if len(samples) < max_samples and filepath.endswith(".py"):
                    source = mod.source_code
                    if source:
                        samples.append(source)

            data["commit_sizes"].append(total_lines_changed)
            if total_lines_changed > LARGE_COMMIT_THRESHOLD:
                data["large_commits"] += 1

        return {
            author: self._summarize_author(data)
            for author, data in author_data.items()
        }

    @staticmethod
    def _new_author_stats() -> Dict:
        """Return a fresh, zeroed accumulator for one author."""
        return {
            "total_commits": 0,
            "bug_fix_commits": 0,
            "revert_commits": 0,
            "large_commits": 0,
            "test_file_modifications": 0,
            "doc_file_modifications": 0,
            "total_file_modifications": 0,
            "commit_sizes": [],
            "python_files_content": [],  # for complexity analysis
        }

    def _summarize_author(self, data: Dict) -> Dict:
        """Turn one author's raw counters into the reported metrics.

        Args:
            data: Accumulator produced by ``_new_author_stats`` and filled
                in by ``analyze``. ``total_commits`` is at least 1 because
                an accumulator is only created when a commit is seen.

        Returns:
            Dict with the per-author metrics listed in ``analyze``.
        """
        total = data["total_commits"]
        total_mods = data["total_file_modifications"]
        sizes = data["commit_sizes"]

        def ratio(count, whole):
            # A commit may touch no files, so the denominator can be zero.
            return round(count / whole, 3) if whole else 0

        return {
            "total_commits": total,
            "bug_fix_commits": data["bug_fix_commits"],
            "bug_fix_ratio": ratio(data["bug_fix_commits"], total),
            "revert_commits": data["revert_commits"],
            "revert_ratio": ratio(data["revert_commits"], total),
            "large_commits": data["large_commits"],
            "large_commit_ratio": ratio(data["large_commits"], total),
            "test_modification_ratio": ratio(
                data["test_file_modifications"], total_mods
            ),
            "doc_modification_ratio": ratio(
                data["doc_file_modifications"], total_mods
            ),
            "avg_commit_size": (
                round(sum(sizes) / len(sizes), 1) if sizes else 0
            ),
            # True median: for an even number of commits this is the mean of
            # the two middle sizes rather than the upper one.
            "median_commit_size": statistics.median(sizes) if sizes else 0,
            "avg_python_complexity": self._compute_avg_complexity(
                data["python_files_content"]
            ),
        }

    @staticmethod
    def _is_test_file(filepath: str) -> bool:
        """Check if a file is a test file.

        The path is split into word tokens (on separators, punctuation,
        digits and camelCase boundaries) and the file is treated as a test
        when any token is a known test marker. Matching whole tokens instead
        of raw substrings keeps names like ``latest.py``, ``contest/`` or
        ``inspect.py`` from being counted as tests, while ``tests/foo.py``,
        ``test_foo.py``, ``foo_test.go``, ``foo.spec.ts``, ``FooTest.java``
        and ``__tests__/foo.js`` are all recognised.

        Args:
            filepath: Repository-relative path; may be empty.

        Returns:
            True if the path looks like test code, False otherwise.
        """
        markers = CodeQualityAnalyzer._TEST_TOKENS
        tokens = CodeQualityAnalyzer._PATH_TOKEN_RE.findall(filepath)
        return any(token.lower() in markers for token in tokens)

    @staticmethod
    def _is_doc_file(filepath: str) -> bool:
        """Check if a file is a documentation file.

        A file counts as documentation when it has one of the documentation
        extensions or when its lower-cased path contains one of the
        documentation name markers (e.g. ``readme`` or ``docs/``).

        Args:
            filepath: Repository-relative path; may be empty.

        Returns:
            True if the path looks like documentation, False otherwise.
        """
        # Normalise separators so the "docs/" marker also matches paths that
        # were rendered with backslashes.
        path_lower = filepath.lower().replace("\\", "/")
        if path_lower.endswith(CodeQualityAnalyzer._DOC_EXTENSIONS):
            return True
        return any(
            marker in path_lower
            for marker in CodeQualityAnalyzer._DOC_NAME_MARKERS
        )

    @staticmethod
    def _compute_avg_complexity(python_sources: list) -> float:
        """
        Compute average cyclomatic complexity for Python source code samples.

        Uses radon library for complexity analysis. At most the first
        ``_MAX_COMPLEXITY_SAMPLES`` samples are analysed; samples that radon
        cannot process are skipped.

        Args:
            python_sources: Python source code strings.

        Returns:
            Mean complexity over all blocks radon reports, rounded to two
            decimals, or 0.0 when there are no samples, radon is not
            installed, or no block could be analysed.
        """
        if not python_sources:
            return 0.0

        try:
            # Imported lazily: radon is optional and its absence only
            # disables this one metric.
            from radon.complexity import cc_visit
        except ImportError:
            logger.warning("radon not installed, skipping complexity analysis")
            return 0.0

        limit = CodeQualityAnalyzer._MAX_COMPLEXITY_SAMPLES
        complexities = []
        for source in python_sources[:limit]:  # Limit to avoid performance issues
            try:
                blocks = cc_visit(source)
            except Exception:
                # Best effort: historical snapshots may not parse under the
                # running interpreter. Skip the sample but leave a trace.
                logger.debug(
                    "radon could not analyze a source sample", exc_info=True
                )
                continue
            complexities.extend(block.complexity for block in blocks)

        if not complexities:
            return 0.0

        return round(sum(complexities) / len(complexities), 2)