文件预览

cases.json

查看 Coding Pronoun Prompt Resolver 技能包中的文件内容。

文件内容

evals/cases.json

[
  {
    "id": "high-context-single-file",
    "description": "Single pronoun with strong single-file context",
    "prompt": "fix it",
    "pronouns": ["it"],
    "context": "User previously said: 'the login form in src/auth/LoginForm.tsx has a bug where it doesn't validate email format'. Assistant edited src/auth/LoginForm.tsx and added email validation regex.",
    "context_reliability": {"last_edited_file": 0.85, "conversation_topic": 0.7},
    "expected": {
      "pronoun": "it",
      "should_resolve": true,
      "expected_referent_contains": ["LoginForm", "email", "validation"],
      "min_confidence": 0.8,
      "expected_idiomatic": false,
      "expected_tier": "self-check"
    }
  },
  {
    "id": "high-context-multi-file",
    "description": "Pronoun with context spanning multiple files",
    "prompt": "update them",
    "pronouns": ["them"],
    "context": "User asked to 'add created_at and updated_at timestamps to the User model and the Order model'. Assistant edited models/user.py and models/order.py adding timestamp fields.",
    "context_reliability": {"last_edited_file": 0.8, "conversation_topic": 0.75},
    "expected": {
      "pronoun": "them",
      "should_resolve": true,
      "expected_referent_contains": ["timestamp", "User", "Order"],
      "min_confidence": 0.7,
      "expected_idiomatic": false,
      "expected_tier": "self-check"
    }
  },
  {
    "id": "medium-context-ambiguous",
    "description": "Ambiguous pronoun whose context mentions two candidate referents -- should resolve to the API endpoint (last tool call signal)",
    "prompt": "delete it",
    "pronouns": ["it"],
    "context": "User discussed both 'the deprecated API endpoint at /api/v1/users' and 'the stale cache in redis'. Assistant ran tests on both. Last tool call was a curl to /api/v1/users.",
    "context_reliability": {"last_tool_call": 0.6, "conversation_topic": 0.5},
    "expected": {
      "pronoun": "it",
      "should_resolve": true,
      "expected_referent_contains": ["api"],
      "min_confidence": 0.5,
      "expected_idiomatic": false,
      "expected_tier_flexible": true
    }
  },
  {
    "id": "low-context-bare-pronoun",
    "description": "Pronoun with zero useful context -- LLM may mark it as idiomatic or assign it low confidence; either is acceptable",
    "prompt": "fix it",
    "pronouns": ["it"],
    "context": "(No prior conversation context available. Resolve based on the message alone.)",
    "context_reliability": {},
    "expected": {
      "pronoun": "it",
      "should_resolve": false,
      "expected_referent_contains": [],
      "min_confidence": 0.0,
      "expected_idiomatic_flexible": true,
      "expected_tier_flexible": true
    }
  },
  {
    "id": "self-contained-high-context",
    "description": "Self-contained prompt with strong matching context -- should resolve to the auth handler confidently",
    "prompt": "fix it in the auth handler",
    "pronouns": ["it"],
    "context": "User said 'there is a null pointer exception in src/auth/handler.ts at line 42 when the session cookie is missing'. Assistant read src/auth/handler.ts.",
    "context_reliability": {"last_edited_file": 0.9, "recent_symbol": 0.8},
    "expected": {
      "pronoun": "it",
      "should_resolve": true,
      "expected_referent_contains": ["auth", "handler"],
      "min_confidence": 0.8,
      "expected_idiomatic_flexible": true,
      "expected_tier": "self-check"
    }
  },
  {
    "id": "chained-pronouns",
    "description": "Multiple chained pronouns requiring left-to-right resolution (the expected block covers only the first pronoun, 'that')",
    "prompt": "take that and apply it to these",
    "pronouns": ["that", "it", "these"],
    "context": "User asked to 'extract the rate limiting logic from api/middleware.ts into a shared utility'. Assistant created lib/rate-limit.ts. Then user said 'we also need this in the websocket handler and the graphql resolver'.",
    "context_reliability": {"last_edited_file": 0.8, "conversation_topic": 0.85},
    "expected": {
      "pronoun": "that",
      "should_resolve": true,
      "expected_referent_contains": ["rate limit"],
      "min_confidence": 0.7,
      "expected_idiomatic": false,
      "expected_tier": "self-check"
    }
  },
  {
    "id": "idiomatic-erlang",
    "description": "Idiomatic phrase that should not be resolved",
    "prompt": "let it crash",
    "pronouns": ["it"],
    "context": "User is building an Erlang/Elixir supervision tree.",
    "context_reliability": {"conversation_topic": 0.8},
    "expected": {
      "pronoun": "it",
      "should_resolve": false,
      "expected_referent_contains": [],
      "min_confidence": 0.9,
      "expected_idiomatic": true,
      "expected_tier": "self-check"
    }
  },
  {
    "id": "structural-with-context",
    "description": "Standalone affirmation -- may resolve to the proposal or be treated as idiomatic",
    "prompt": "that's correct",
    "pronouns": ["that"],
    "context": "Assistant proposed refactoring the payment processing pipeline to use an event-driven architecture with a message queue.",
    "context_reliability": {"conversation_topic": 0.75},
    "expected": {
      "pronoun": "that",
      "should_resolve": false,
      "expected_referent_contains": ["refactor", "payment"],
      "min_confidence": 0.7,
      "expected_idiomatic_flexible": true,
      "expected_tier": "self-check"
    }
  },
  {
    "id": "high-context-recent-symbol",
    "description": "Pronoun referring to a recently discussed code symbol",
    "prompt": "rename it",
    "pronouns": ["it"],
    "context": "User said 'the function processUserData in utils/transform.ts has a misleading name since it now handles both user and order data'. Assistant read the function.",
    "context_reliability": {"recent_symbol": 0.9, "last_edited_file": 0.6},
    "expected": {
      "pronoun": "it",
      "should_resolve": true,
      "expected_referent_contains": ["processUserData", "utils/transform"],
      "min_confidence": 0.85,
      "expected_idiomatic": false,
      "expected_tier": "self-check"
    }
  },
  {
    "id": "structural-with-adjacent-instruction",
    "description": "Structural pronoun followed by clear instruction -- should be idiomatic or resolve to the proposal",
    "prompt": "that's correct, now refactor the parser",
    "pronouns": ["that"],
    "context": "Assistant proposed a new approach to error handling.",
    "context_reliability": {"conversation_topic": 0.7},
    "expected": {
      "pronoun": "that",
      "should_resolve": false,
      "expected_referent_contains": ["error handling"],
      "min_confidence": 0.7,
      "expected_idiomatic_flexible": true,
      "expected_tier": "self-check"
    }
  }
]