文件预览

bench.test.js

查看 Evolver 技能包中的文件内容。

文件内容

test/bench.test.js

'use strict';

const { describe, it, beforeEach, afterEach } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const os = require('os');

// ---------------------------------------------------------------------------
// evolver-bench: quantitative benchmark for evolution effectiveness
// Measures: gene selection accuracy, failure distillation quality,
//           signal extraction recall, anti-pattern avoidance
// ---------------------------------------------------------------------------

const { selectGene, selectGeneAndCapsule } = require('../src/gep/selector');
const { extractSignals } = require('../src/gep/signals');
const {
  collectFailureDistillationData,
  analyzeFailurePatterns,
  synthesizeRepairGeneFromFailures,
  autoDistillFromFailures,
  validateSynthesizedGene,
  shouldDistillFromFailures,
  REPAIR_DISTILLED_ID_PREFIX,
  FAILURE_DISTILLER_MIN_CAPSULES,
} = require('../src/gep/skillDistiller');

// Scratch directory for the currently running test; assigned by setupTempEnv().
let tmpDir;
// Snapshot of the tracked environment variables taken by setupTempEnv(),
// keyed by variable name (value is undefined when the variable was unset).
let savedEnv = {};

/**
 * Create a fresh temp workspace and redirect the GEP-related environment
 * variables into it.
 *
 * Side effects: assigns `tmpDir` and `savedEnv`, overwrites GEP_ASSETS_DIR,
 * EVOLUTION_DIR, MEMORY_DIR and MEMORY_GRAPH_PATH in process.env, and creates
 * the three directories on disk. SKILL_DISTILLER and FAILURE_DISTILLER are
 * only snapshotted here (not modified) so teardownTempEnv() can restore them.
 */
function setupTempEnv() {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evolver-bench-'));

  const trackedKeys = [
    'GEP_ASSETS_DIR',
    'EVOLUTION_DIR',
    'MEMORY_DIR',
    'MEMORY_GRAPH_PATH',
    'SKILL_DISTILLER',
    'FAILURE_DISTILLER',
  ];
  const snapshot = {};
  for (const key of trackedKeys) {
    snapshot[key] = process.env[key];
  }
  savedEnv = snapshot;

  const evolutionDir = path.join(tmpDir, 'evolution');
  const redirected = {
    GEP_ASSETS_DIR: path.join(tmpDir, 'assets'),
    EVOLUTION_DIR: evolutionDir,
    MEMORY_DIR: path.join(tmpDir, 'memory'),
    MEMORY_GRAPH_PATH: path.join(evolutionDir, 'memory_graph.jsonl'),
  };
  Object.assign(process.env, redirected);

  const directories = [redirected.GEP_ASSETS_DIR, redirected.EVOLUTION_DIR, redirected.MEMORY_DIR];
  for (const dir of directories) {
    fs.mkdirSync(dir, { recursive: true });
  }
}

/**
 * Undo setupTempEnv(): restore every snapshotted environment variable
 * (deleting those that were originally unset) and remove the temp workspace.
 */
function teardownTempEnv() {
  for (const [name, previous] of Object.entries(savedEnv)) {
    if (previous === undefined) {
      delete process.env[name];
    } else {
      process.env[name] = previous;
    }
  }

  try {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  } catch (e) {
    // Best-effort cleanup: a removal failure must not fail the test run.
  }
}

/**
 * Serialize `data` as 2-space-indented JSON and write it to `filePath`,
 * creating any missing parent directories first.
 *
 * @param {string} filePath - Destination file path.
 * @param {*} data - Any JSON-serializable value.
 */
function writeJson(filePath, data) {
  const parentDir = path.dirname(filePath);
  fs.mkdirSync(parentDir, { recursive: true });

  const serialized = JSON.stringify(data, null, 2);
  fs.writeFileSync(filePath, serialized, 'utf8');
}

/**
 * Build a failed-capsule fixture object.
 *
 * Any falsy optional argument is replaced by its fallback: `trigger` by
 * ['error'], `failureReason` by 'constraint violation', and both
 * `learningSignals` and `violations` by a new empty array. The outcome
 * (status 'failed', score 0.2) and blast radius (3 files, 50 lines) are fixed,
 * and `created_at` is the ISO timestamp of the call.
 *
 * @param {string} id - Capsule id.
 * @param {string} gene - Id of the gene the capsule is attributed to.
 * @param {string[]} [trigger] - Trigger signals.
 * @param {string} [failureReason] - Failure description.
 * @param {string[]} [learningSignals] - Learning signals.
 * @param {string[]} [violations] - Constraint violations.
 * @returns {object} The capsule fixture.
 */
function makeFailedCapsule(id, gene, trigger, failureReason, learningSignals, violations) {
  const capsule = { type: 'Capsule', id, gene };

  capsule.trigger = trigger ? trigger : ['error'];
  capsule.outcome = { status: 'failed', score: 0.2 };
  capsule.failure_reason = failureReason ? failureReason : 'constraint violation';
  capsule.learning_signals = learningSignals ? learningSignals : [];
  capsule.constraint_violations = violations ? violations : [];
  capsule.blast_radius = { files: 3, lines: 50 };
  capsule.created_at = new Date().toISOString();

  return capsule;
}

// ---------------------------------------------------------------------------
// Bench 1: Gene Selection Accuracy
// ---------------------------------------------------------------------------
describe('bench: gene selection accuracy', () => {
  // Fixture gene pool: four genes whose signals_match lists do not overlap,
  // so every scenario below has exactly one gene matching its signals.
  const BENCH_GENES = [
    { type: 'Gene', id: 'gene_repair', category: 'repair', signals_match: ['error', 'exception', 'failed', 'unstable'], strategy: ['fix'] },
    { type: 'Gene', id: 'gene_optimize', category: 'optimize', signals_match: ['protocol', 'gep', 'prompt', 'audit'], strategy: ['optimize'] },
    { type: 'Gene', id: 'gene_innovate', category: 'innovate', signals_match: ['user_feature_request', 'capability_gap', 'stable_success_plateau'], strategy: ['build'] },
    { type: 'Gene', id: 'gene_perf', category: 'optimize', signals_match: ['perf_bottleneck', 'latency', 'throughput', 'slow'], strategy: ['speed up'] },
  ];

  // Each scenario pairs incoming signals with the id of the gene expected to
  // be selected; `label` names the scenario in failure output.
  const TEST_CASES = [
    { signals: ['error', 'exception'], expected: 'gene_repair', label: 'error signals -> repair' },
    { signals: ['protocol', 'audit'], expected: 'gene_optimize', label: 'protocol signals -> optimize' },
    { signals: ['user_feature_request', 'capability_gap'], expected: 'gene_innovate', label: 'feature request -> innovate' },
    { signals: ['perf_bottleneck', 'latency'], expected: 'gene_perf', label: 'perf signals -> perf optimize' },
    { signals: ['failed', 'unstable'], expected: 'gene_repair', label: 'failure signals -> repair' },
    { signals: ['gep', 'prompt'], expected: 'gene_optimize', label: 'prompt signals -> optimize' },
  ];

  it('achieves >= 80% selection accuracy on standard signal scenarios', () => {
    // Record every miss together with what was actually selected, so a failing
    // run reports which scenarios regressed and not only the aggregate number.
    const missed = [];

    for (const tc of TEST_CASES) {
      const result = selectGene(BENCH_GENES, tc.signals, { effectivePopulationSize: 100 });
      const hit = Boolean(result.selected) && result.selected.id === tc.expected;
      if (!hit) {
        const got = result.selected ? result.selected.id : 'none';
        missed.push(`${tc.label} (got ${got})`);
      }
    }

    const total = TEST_CASES.length;
    const accuracy = (total - missed.length) / total;
    assert.ok(
      accuracy >= 0.8,
      `Gene selection accuracy ${(accuracy * 100).toFixed(1)}% < 80% threshold; missed: ${missed.join('; ')}`
    );
  });

  it('never selects a banned gene', () => {
    const banned = new Set(['gene_repair']);

    // Call the selector 20 times with identical inputs; each call that yields
    // a selection must yield one outside the banned set.
    for (let attempt = 0; attempt < 20; attempt++) {
      const result = selectGene(BENCH_GENES, ['error', 'exception'], {
        bannedGeneIds: banned,
        effectivePopulationSize: 100,
      });
      if (result.selected) {
        assert.ok(!banned.has(result.selected.id), `Selected banned gene: ${result.selected.id}`);
      }
    }
  });
});

// ---------------------------------------------------------------------------
// Bench 2: Signal Extraction Recall
// ---------------------------------------------------------------------------
describe('bench: signal extraction recall', () => {
  // Build an extractSignals() input whose four text channels are empty unless
  // overridden by the caller.
  const buildInput = (overrides) => ({
    recentSessionTranscript: '',
    todayLog: '',
    memorySnippet: '',
    userSnippet: '',
    ...overrides,
  });

  it('extracts error signals from log-like input', () => {
    const input = buildInput({
      todayLog: '[error] Module X failed to load\n[error] Database connection timeout\nException in thread main',
    });
    const signals = extractSignals(input);
    assert.ok(signals.includes('log_error'), 'Should detect log_error from [error] lines');
  });

  it('extracts feature request signals', () => {
    const input = buildInput({
      userSnippet: 'I want a dark mode toggle in the settings panel',
    });
    const signals = extractSignals(input);

    const featureMarkers = ['user_feature_request', 'user_improvement_suggestion'];
    const hasFeatureSignal = signals.some((signal) =>
      featureMarkers.some((marker) => signal.includes(marker)));
    assert.ok(hasFeatureSignal, 'Should detect feature request from user input');
  });

  it('detects stagnation signals', () => {
    // Six successful events that changed nothing (zero files, zero lines).
    const recentEvents = Array.from({ length: 6 }, () => ({
      type: 'EvolutionEvent',
      outcome: { status: 'success', score: 0.85 },
      blast_radius: { files: 0, lines: 0 },
      signals: [],
    }));
    const signals = extractSignals(buildInput({ recentEvents }));

    const stagnationMarkers = ['empty_cycle', 'stagnation', 'steady_state'];
    const hasStagnation = signals.some((signal) =>
      stagnationMarkers.some((marker) => signal.includes(marker)));

    // An empty signal list is also accepted here, as the assertion message explains.
    assert.ok(hasStagnation || signals.length === 0, 'Stagnation detection assessed (may not trigger with minimal events)');
  });
});

// ---------------------------------------------------------------------------
// Bench 3: Failure Distillation Quality
// ---------------------------------------------------------------------------
describe('bench: failure distillation quality', () => {
  beforeEach(setupTempEnv);
  afterEach(teardownTempEnv);

  // Resolved at call time: GEP_ASSETS_DIR is only redirected to the temp
  // workspace once the beforeEach hook has run.
  const assetPath = (fileName) => path.join(process.env.GEP_ASSETS_DIR, fileName);

  // Write the given capsules as the failed-capsule store.
  const storeFailures = (failures) => {
    writeJson(assetPath('failed_capsules.json'), { version: 1, failed_capsules: failures });
  };

  // Write a gene store containing no genes.
  const storeEmptyGenePool = () => {
    writeJson(assetPath('genes.json'), { version: 1, genes: [] });
  };

  // Build `count` failed capsules for 'gene_repair' with ids f0..f<count-1>.
  // The list arguments are copied so each capsule owns its own arrays.
  const repeatedFailures = (count, trigger, failureReason, learningSignals, violations) =>
    Array.from({ length: count }, (_, i) =>
      makeFailedCapsule(
        'f' + i, 'gene_repair', [...trigger],
        failureReason,
        [...learningSignals],
        [...violations]
      ));

  // Six identical blast-radius crash failures, shared by two tests below.
  const sixCrashFailures = () => repeatedFailures(
    6,
    ['error', 'crash'],
    'blast_radius_exceeded',
    ['problem:reliability'],
    ['blast_radius_exceeded']
  );

  it('collects and groups failed capsules correctly', () => {
    storeFailures([
      makeFailedCapsule('f1', 'gene_repair', ['error', 'crash'], 'blast_radius_exceeded', ['problem:reliability'], ['blast_radius_exceeded']),
      makeFailedCapsule('f2', 'gene_repair', ['error', 'timeout'], 'blast_radius_exceeded', ['problem:reliability'], ['blast_radius_exceeded']),
      makeFailedCapsule('f3', 'gene_optimize', ['protocol'], 'validation_failed', ['problem:protocol'], ['validation_cmd_failed']),
    ]);

    const data = collectFailureDistillationData();
    assert.equal(data.failedCapsules.length, 3);
    const groupCount = Object.keys(data.grouped).length;
    assert.ok(groupCount >= 2);
  });

  it('identifies high-frequency failure patterns', () => {
    storeFailures(repeatedFailures(
      5,
      ['error', 'memory_leak'],
      'blast_radius_exceeded: too many files changed',
      ['problem:reliability', 'risk:validation'],
      ['blast_radius_exceeded', 'max_files_exceeded']
    ));

    const data = collectFailureDistillationData();
    const analysis = analyzeFailurePatterns(data);
    assert.ok(analysis.high_frequency_failures.length >= 1, 'Should detect high-frequency failure pattern');
    const [topPattern] = analysis.high_frequency_failures;
    assert.ok(topPattern.count >= 2);
  });

  it('synthesizes a repair gene from failure patterns', () => {
    storeFailures(sixCrashFailures());

    const data = collectFailureDistillationData();
    const analysis = analyzeFailurePatterns(data);
    const gene = synthesizeRepairGeneFromFailures(data, analysis, []);

    assert.ok(gene, 'Should produce a repair gene');
    assert.equal(gene.type, 'Gene');
    assert.equal(gene.category, 'repair');

    const hasAcceptedPrefix = gene.id.startsWith(REPAIR_DISTILLED_ID_PREFIX)
      || gene.id.startsWith('gene_distilled_');
    assert.ok(hasAcceptedPrefix, 'Gene id should have repair prefix');

    assert.ok(gene.strategy.length >= 4, 'Strategy should have guard steps');
    const mentionsGuard = gene.strategy.some((step) => step.includes('GUARD') || step.includes('guard'));
    assert.ok(mentionsGuard, 'Should include guard steps');
  });

  it('autoDistillFromFailures produces a gene when threshold met', () => {
    storeFailures(sixCrashFailures());
    storeEmptyGenePool();

    const result = autoDistillFromFailures();
    assert.ok(result.ok, 'autoDistillFromFailures should succeed: ' + (result.reason || ''));
    assert.ok(result.gene);
    assert.equal(result.source, 'failure_distillation');

    // Re-read the gene store from disk to confirm the new gene was written.
    const stored = JSON.parse(fs.readFileSync(assetPath('genes.json'), 'utf8'));
    const wasPersisted = stored.genes.some((g) => g.id === result.gene.id);
    assert.ok(wasPersisted, 'Gene should be persisted');
  });

  it('returns insufficient_failures when below threshold', () => {
    storeFailures([
      makeFailedCapsule('f1', 'gene_repair', ['error'], 'blast_radius_exceeded', [], []),
    ]);

    const result = autoDistillFromFailures();
    assert.equal(result.ok, false);
    assert.equal(result.reason, 'insufficient_failures');
  });

  it('idempotent skip on repeated calls with same data', () => {
    storeFailures(repeatedFailures(6, ['error'], 'blast_radius', ['problem:reliability'], ['blast_radius']));
    storeEmptyGenePool();

    const first = autoDistillFromFailures();
    assert.ok(first.ok);

    // Same failure data, second run: expected to be skipped.
    const second = autoDistillFromFailures();
    assert.equal(second.ok, false);
    assert.equal(second.reason, 'idempotent_skip');
  });
});

// ---------------------------------------------------------------------------
// Bench 4: Anti-pattern Avoidance
// ---------------------------------------------------------------------------
describe('bench: anti-pattern avoidance', () => {
  it('genes with anti-patterns score lower than clean genes', () => {
    // Both candidates match the same single signal and share a category; they
    // differ in their recorded history (anti_patterns vs. learning_history).
    const hardAntiPattern = () => ({ mode: 'hard', learning_signals: ['problem:reliability'] });
    const cleanSuccess = () => ({ outcome: 'success', mode: 'none' });

    const riskyGene = {
      type: 'Gene',
      id: 'gene_risky',
      category: 'repair',
      signals_match: ['error'],
      anti_patterns: [hardAntiPattern(), hardAntiPattern()],
      strategy: ['fix'],
    };
    const safeGene = {
      type: 'Gene',
      id: 'gene_safe',
      category: 'repair',
      signals_match: ['error'],
      learning_history: [cleanSuccess(), cleanSuccess()],
      strategy: ['fix safely'],
    };

    const candidates = [riskyGene, safeGene];
    const { selected } = selectGene(candidates, ['error'], { effectivePopulationSize: 100 });

    assert.ok(selected);
    assert.equal(selected.id, 'gene_safe', 'Should prefer gene without anti-patterns');
  });
});

// ---------------------------------------------------------------------------
// Bench 5: shouldDistillFromFailures gate
// ---------------------------------------------------------------------------
describe('bench: shouldDistillFromFailures gate', () => {
  beforeEach(setupTempEnv);
  afterEach(teardownTempEnv);

  // Write the given capsules as the failed-capsule store. The path is resolved
  // at call time because GEP_ASSETS_DIR is set by the beforeEach hook.
  const storeFailures = (failures) => {
    const target = path.join(process.env.GEP_ASSETS_DIR, 'failed_capsules.json');
    writeJson(target, { version: 1, failed_capsules: failures });
  };

  it('returns false when FAILURE_DISTILLER=false', () => {
    process.env.FAILURE_DISTILLER = 'false';
    const verdict = shouldDistillFromFailures();
    assert.equal(verdict, false);
    delete process.env.FAILURE_DISTILLER;
  });

  it('returns false when not enough failures', () => {
    storeFailures([makeFailedCapsule('f1', 'g', ['e'], 'reason', [], [])]);
    const verdict = shouldDistillFromFailures();
    assert.equal(verdict, false);
  });

  it('returns true when enough failures and no recent distillation', () => {
    // One capsule more than the imported minimum.
    const failures = [];
    let index = 0;
    while (index < FAILURE_DISTILLER_MIN_CAPSULES + 1) {
      failures.push(makeFailedCapsule(`f${index}`, 'gene_repair', ['error'], 'reason', [], []));
      index += 1;
    }
    storeFailures(failures);

    const verdict = shouldDistillFromFailures();
    assert.equal(verdict, true);
  });
});