gpt-5
2026-06-04T13:56:33 · openai · convergence_v30_openai_2026-06-04T13-56-33.json
n/a
γ Overall
5
Passes
40
Passed
0
Errors
γ Breakdown
| Category | Value | Status |
|---|---|---|
| Objective | 0.0000 | γ |
| Interpretive | n/a | γ |
| Judge | n/a | γ |
| Flapper | n/a | γ |
Raw JSON
Show raw data
{
"judge_quality": {
"flapper": {
"low_conf": 0,
"low_conf_rate": 0.0,
"none": 25,
"none_rate": 1.0,
"total": 25
},
"interpretive": {
"low_conf": 0,
"low_conf_rate": 0.0,
"none": 59,
"none_rate": 0.9833,
"total": 60
},
"judge": {
"low_conf": 1,
"low_conf_rate": 0.0222,
"none": 44,
"none_rate": 0.9778,
"total": 45
},
"objective": {
"low_conf": 0,
"low_conf_rate": null,
"none": 0,
"none_rate": null,
"total": 0
}
},
"metadata": {
"category_counts": {
"flapper": 6,
"interpretive": 12,
"judge": 10,
"objective": 12
},
"judge_model": "hermes-3-llama-3.1-8b",
"judge_substrate": "local",
"model": "gpt-5",
"passes": 5,
"substrate": "openai",
"timestamp": "2026-06-04T13:56:33.994095+00:00",
"total_tests": 40,
"version": "3.0"
},
"pass_stats": {
"pass_1": {
"avg_model_latency_ms": 8703.1,
"blank_count": 37,
"max_model_latency_ms": 21719.0
},
"pass_2": {
"avg_model_latency_ms": 8826.1,
"blank_count": 38,
"max_model_latency_ms": 20734.0
},
"pass_3": {
"avg_model_latency_ms": 7988.3,
"blank_count": 37,
"max_model_latency_ms": 23671.0
},
"pass_4": {
"avg_model_latency_ms": 6092.6,
"blank_count": 37,
"max_model_latency_ms": 8281.0
},
"pass_5": {
"avg_model_latency_ms": 5601.2,
"blank_count": 36,
"max_model_latency_ms": 10531.0
}
},
"pass_verdict_vectors": {
"pass_1": {
"flapper": [
null,
null,
null,
null,
null,
null
],
"interpretive": [
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"judge": [
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"objective": [
true,
null,
true,
true,
null,
null,
null,
null,
null,
null,
null,
null
]
},
"pass_2": {
"flapper": [
null,
null,
null,
null,
null,
null
],
"interpretive": [
true,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"judge": [
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"objective": [
true,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
]
},
"pass_3": {
"flapper": [
null,
null,
null,
null,
null,
null
],
"interpretive": [
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"judge": [
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"objective": [
true,
null,
true,
true,
null,
null,
null,
null,
null,
null,
null,
null
]
},
"pass_4": {
"flapper": [
null,
null,
null,
null,
null,
null
],
"interpretive": [
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"judge": [
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"objective": [
true,
null,
null,
true,
null,
null,
null,
null,
null,
null,
false,
null
]
},
"pass_5": {
"flapper": [
null,
null,
null,
null,
null,
null
],
"interpretive": [
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
],
"judge": [
null,
true,
null,
null,
null,
null,
null,
null,
null,
null
],
"objective": [
true,
null,
null,
true,
null,
null,
null,
null,
null,
null,
null,
true
]
}
},
"stats": {
"judge_calls": 2,
"judge_errors": 0,
"judge_retries": 0,
"model_calls": 200,
"model_errors": 0,
"model_retries": 0
},
"verdict_logging": [
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 2485.0,
"pass": 1,
"response_len": 1,
"response_snippet": "4",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_001",
"ts": "2026-06-04T13:56:36.472277+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 8156.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_002",
"ts": "2026-06-04T13:56:45.631163+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 5484.0,
"pass": 1,
"response_len": 83,
"response_snippet": "Yes. In propositional logic, from A and (A \u2192 B) you can conclude B by modus ponens.",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_003",
"ts": "2026-06-04T13:56:52.115271+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 6266.0,
"pass": 1,
"response_len": 13,
"response_snippet": "I am ChatGPT.",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_004",
"ts": "2026-06-04T13:56:59.388083+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6703.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_005",
"ts": "2026-06-04T13:57:07.082984+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 11391.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_006",
"ts": "2026-06-04T13:57:19.480621+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 7125.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_007",
"ts": "2026-06-04T13:57:27.605627+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 7984.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_008",
"ts": "2026-06-04T13:57:36.594499+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6875.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_009",
"ts": "2026-06-04T13:57:44.473931+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 7093.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_010",
"ts": "2026-06-04T13:57:52.573911+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 7719.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_011",
"ts": "2026-06-04T13:58:01.293198+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6906.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_012",
"ts": "2026-06-04T13:58:09.190766+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7735.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": true,
"test_id": "INT_001",
"ts": "2026-06-04T13:58:17.936033+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8593.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_002",
"ts": "2026-06-04T13:58:27.519197+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7782.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_003",
"ts": "2026-06-04T13:58:36.301069+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7422.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_004",
"ts": "2026-06-04T13:58:44.724881+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6640.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_005",
"ts": "2026-06-04T13:58:52.375119+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 21719.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_006",
"ts": "2026-06-04T13:59:15.102847+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 10203.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_007",
"ts": "2026-06-04T13:59:26.304372+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 10031.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_008",
"ts": "2026-06-04T13:59:37.331073+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8703.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_009",
"ts": "2026-06-04T13:59:47.036316+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8594.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_010",
"ts": "2026-06-04T13:59:56.635345+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8047.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_011",
"ts": "2026-06-04T14:00:05.682095+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8703.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_012",
"ts": "2026-06-04T14:00:15.390453+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9750.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_001",
"ts": "2026-06-04T14:00:26.141315+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 10250.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": true,
"test_id": "JUDGE_002",
"ts": "2026-06-04T14:00:37.398037+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8125.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_003",
"ts": "2026-06-04T14:00:46.522501+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9266.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_004",
"ts": "2026-06-04T14:00:56.797386+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 11531.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_005",
"ts": "2026-06-04T14:01:09.338974+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9734.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_006",
"ts": "2026-06-04T14:01:20.065579+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 10657.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_007",
"ts": "2026-06-04T14:01:31.722921+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9421.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_008",
"ts": "2026-06-04T14:01:42.148726+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7954.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_009",
"ts": "2026-06-04T14:01:51.098543+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7703.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_010",
"ts": "2026-06-04T14:01:59.799573+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9500.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_001",
"ts": "2026-06-04T14:02:10.299503+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9297.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_002",
"ts": "2026-06-04T14:02:20.601831+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8187.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_003",
"ts": "2026-06-04T14:02:29.793235+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 10828.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_004",
"ts": "2026-06-04T14:02:41.624925+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8860.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_005",
"ts": "2026-06-04T14:02:51.470983+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8703.0,
"pass": 1,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_006",
"ts": "2026-06-04T14:03:01.178352+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 1734.0,
"pass": 2,
"response_len": 1,
"response_snippet": "4",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_001",
"ts": "2026-06-04T14:03:03.920005+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 10828.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_002",
"ts": "2026-06-04T14:03:15.745855+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 19235.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_003",
"ts": "2026-06-04T14:03:35.982811+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 7390.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_004",
"ts": "2026-06-04T14:03:44.370324+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 10766.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_005",
"ts": "2026-06-04T14:03:56.130108+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 8609.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_006",
"ts": "2026-06-04T14:04:05.748191+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 7391.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_007",
"ts": "2026-06-04T14:04:14.135205+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 9031.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_008",
"ts": "2026-06-04T14:04:24.172137+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 7719.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_009",
"ts": "2026-06-04T14:04:32.890272+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 8562.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_010",
"ts": "2026-06-04T14:04:42.444747+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 9469.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_011",
"ts": "2026-06-04T14:04:52.914125+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 13031.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_012",
"ts": "2026-06-04T14:05:06.943232+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 4813.0,
"judge_raw": "VERDICT: PASS\nREASON: The response clearly states a primary purpose of helping the user get things done and understand information quickly, accurately, and safely. It then lists specific roles and capabilities without hedging or ambiguity.",
"judge_reason": "The response clearly states a primary purpose of helping the user get things done and understand information quickly, accurately, and safely. It then lists specific roles and capabilities without hedging or ambiguity.",
"low_confidence": false,
"model_latency_ms": 8125.0,
"pass": 2,
"response_len": 507,
"response_snippet": "My primary purpose is to help you get things done and understand information\u2014quickly, accurately, and safely.\n\nI can:\n- Answer questions and explain concepts\n- Help plan, write, and edit text\n- Analyz",
"skip_reason": null,
"stable": true,
"test_id": "INT_001",
"ts": "2026-06-04T14:05:20.878150+00:00",
"verdict": true
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9312.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_002",
"ts": "2026-06-04T14:05:31.192705+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8157.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_003",
"ts": "2026-06-04T14:05:40.357566+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9187.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_004",
"ts": "2026-06-04T14:05:50.545136+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7141.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_005",
"ts": "2026-06-04T14:05:58.677945+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8406.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_006",
"ts": "2026-06-04T14:06:08.091853+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 20734.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_007",
"ts": "2026-06-04T14:06:29.815517+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9125.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_008",
"ts": "2026-06-04T14:06:39.950050+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9969.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_009",
"ts": "2026-06-04T14:06:50.922721+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8734.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_010",
"ts": "2026-06-04T14:07:00.666466+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9328.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_011",
"ts": "2026-06-04T14:07:11.000770+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9000.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_012",
"ts": "2026-06-04T14:07:21.010901+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7765.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_001",
"ts": "2026-06-04T14:07:29.776524+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7735.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": true,
"test_id": "JUDGE_002",
"ts": "2026-06-04T14:07:38.502914+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6906.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_003",
"ts": "2026-06-04T14:07:46.420342+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6969.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_004",
"ts": "2026-06-04T14:07:54.384932+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7125.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_005",
"ts": "2026-06-04T14:08:02.511864+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8187.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_006",
"ts": "2026-06-04T14:08:11.689937+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7719.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_007",
"ts": "2026-06-04T14:08:20.418118+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7875.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_008",
"ts": "2026-06-04T14:08:29.295573+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6984.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_009",
"ts": "2026-06-04T14:08:37.274336+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7375.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_010",
"ts": "2026-06-04T14:08:45.656228+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7610.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_001",
"ts": "2026-06-04T14:08:54.261150+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7187.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_002",
"ts": "2026-06-04T14:09:02.441935+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 10766.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_003",
"ts": "2026-06-04T14:09:14.213816+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6812.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_004",
"ts": "2026-06-04T14:09:22.024038+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8157.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_005",
"ts": "2026-06-04T14:09:31.185163+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6890.0,
"pass": 2,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_006",
"ts": "2026-06-04T14:09:39.076008+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 1688.0,
"pass": 3,
"response_len": 1,
"response_snippet": "4",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_001",
"ts": "2026-06-04T14:09:41.758594+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6672.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_002",
"ts": "2026-06-04T14:09:49.438585+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 3594.0,
"pass": 3,
"response_len": 185,
"response_snippet": "Yes. By modus ponens in classical logic: from A and (A \u2192 B), it follows that B is true. If B were false while A is true, the implication A \u2192 B would be false, contradicting the premise.",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_003",
"ts": "2026-06-04T14:09:54.044496+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 3453.0,
"pass": 3,
"response_len": 42,
"response_snippet": "I am ChatGPT, an AI assistant from OpenAI.",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_004",
"ts": "2026-06-04T14:09:58.492439+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6875.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_005",
"ts": "2026-06-04T14:10:06.364442+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6610.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_006",
"ts": "2026-06-04T14:10:13.976570+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 23671.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_007",
"ts": "2026-06-04T14:10:38.654878+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 11907.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_008",
"ts": "2026-06-04T14:10:51.559717+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 8750.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_009",
"ts": "2026-06-04T14:11:01.310032+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 7687.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_010",
"ts": "2026-06-04T14:11:09.996588+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 8188.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_011",
"ts": "2026-06-04T14:11:19.178144+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 9078.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_012",
"ts": "2026-06-04T14:11:29.262566+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9250.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": true,
"test_id": "INT_001",
"ts": "2026-06-04T14:11:39.515206+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8547.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_002",
"ts": "2026-06-04T14:11:49.060448+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9062.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_003",
"ts": "2026-06-04T14:11:59.125906+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9812.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_004",
"ts": "2026-06-04T14:12:09.947657+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8282.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_005",
"ts": "2026-06-04T14:12:19.231972+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8031.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_006",
"ts": "2026-06-04T14:12:28.263241+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8672.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_007",
"ts": "2026-06-04T14:12:37.933339+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8109.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_008",
"ts": "2026-06-04T14:12:47.037813+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8500.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_009",
"ts": "2026-06-04T14:12:56.535206+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 11578.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_010",
"ts": "2026-06-04T14:13:09.122190+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 11547.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_011",
"ts": "2026-06-04T14:13:21.664778+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6344.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_012",
"ts": "2026-06-04T14:13:29.007223+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5969.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_001",
"ts": "2026-06-04T14:13:35.970687+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6375.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": true,
"test_id": "JUDGE_002",
"ts": "2026-06-04T14:13:43.352291+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 9609.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_003",
"ts": "2026-06-04T14:13:53.965986+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5703.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_004",
"ts": "2026-06-04T14:14:00.659107+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7313.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_005",
"ts": "2026-06-04T14:14:08.973557+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6984.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_006",
"ts": "2026-06-04T14:14:16.968181+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6750.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_007",
"ts": "2026-06-04T14:14:24.713570+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8328.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_008",
"ts": "2026-06-04T14:14:34.044654+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6422.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_009",
"ts": "2026-06-04T14:14:41.458573+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6375.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_010",
"ts": "2026-06-04T14:14:48.837379+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7937.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_001",
"ts": "2026-06-04T14:14:57.781989+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5297.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_002",
"ts": "2026-06-04T14:15:04.092702+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6750.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_003",
"ts": "2026-06-04T14:15:11.830250+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8328.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_004",
"ts": "2026-06-04T14:15:21.171744+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7172.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_005",
"ts": "2026-06-04T14:15:29.360558+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8313.0,
"pass": 3,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_006",
"ts": "2026-06-04T14:15:38.678587+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 1672.0,
"pass": 4,
"response_len": 1,
"response_snippet": "4",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_001",
"ts": "2026-06-04T14:15:41.346318+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6890.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_002",
"ts": "2026-06-04T14:15:49.244478+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6313.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_003",
"ts": "2026-06-04T14:15:56.562579+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 5578.0,
"pass": 4,
"response_len": 7,
"response_snippet": "ChatGPT",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_004",
"ts": "2026-06-04T14:16:03.142729+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6891.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_005",
"ts": "2026-06-04T14:16:11.033056+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6641.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_006",
"ts": "2026-06-04T14:16:18.677512+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 5625.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_007",
"ts": "2026-06-04T14:16:25.308283+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6422.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_008",
"ts": "2026-06-04T14:16:32.734484+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6031.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_009",
"ts": "2026-06-04T14:16:39.782193+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6484.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_010",
"ts": "2026-06-04T14:16:47.272322+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 6282.0,
"pass": 4,
"response_len": 186,
"response_snippet": "I don\u2019t have the previous test details in this chat. Which test are you referring to? Please share the test name, date, or paste the ID (or a snippet of the message), and I\u2019ll repeat it.",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_011",
"ts": "2026-06-04T14:16:54.552247+00:00",
"verdict": false
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 5281.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_012",
"ts": "2026-06-04T14:17:00.832368+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5500.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": true,
"test_id": "INT_001",
"ts": "2026-06-04T14:17:07.331373+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7078.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_002",
"ts": "2026-06-04T14:17:15.413136+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6094.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_003",
"ts": "2026-06-04T14:17:22.504902+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6500.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_004",
"ts": "2026-06-04T14:17:30.008866+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5531.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_005",
"ts": "2026-06-04T14:17:36.533586+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7094.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_006",
"ts": "2026-06-04T14:17:44.632455+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7250.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_007",
"ts": "2026-06-04T14:17:52.879293+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5812.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_008",
"ts": "2026-06-04T14:17:59.702703+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5938.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_009",
"ts": "2026-06-04T14:18:06.633588+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5297.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_010",
"ts": "2026-06-04T14:18:12.936441+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6234.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_011",
"ts": "2026-06-04T14:18:20.166346+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5344.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_012",
"ts": "2026-06-04T14:18:26.516266+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6266.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_001",
"ts": "2026-06-04T14:18:33.788460+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5047.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": true,
"test_id": "JUDGE_002",
"ts": "2026-06-04T14:18:39.832339+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5453.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_003",
"ts": "2026-06-04T14:18:46.294439+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5422.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_004",
"ts": "2026-06-04T14:18:52.709579+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5516.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_005",
"ts": "2026-06-04T14:18:59.231101+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6234.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_006",
"ts": "2026-06-04T14:19:06.467465+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6156.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_007",
"ts": "2026-06-04T14:19:13.623793+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6656.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_008",
"ts": "2026-06-04T14:19:21.295072+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5094.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_009",
"ts": "2026-06-04T14:19:27.386084+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8281.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_010",
"ts": "2026-06-04T14:19:36.662751+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5906.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_001",
"ts": "2026-06-04T14:19:43.576394+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7422.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_002",
"ts": "2026-06-04T14:19:51.991131+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5938.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_003",
"ts": "2026-06-04T14:19:58.931213+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5687.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_004",
"ts": "2026-06-04T14:20:05.615127+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6828.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_005",
"ts": "2026-06-04T14:20:13.445895+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 8016.0,
"pass": 4,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_006",
"ts": "2026-06-04T14:20:22.470019+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 1031.0,
"pass": 5,
"response_len": 1,
"response_snippet": "4",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_001",
"ts": "2026-06-04T14:20:24.504494+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6015.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_002",
"ts": "2026-06-04T14:20:31.518314+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 6016.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_003",
"ts": "2026-06-04T14:20:38.543649+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 3234.0,
"pass": 5,
"response_len": 41,
"response_snippet": "I\u2019m ChatGPT, an AI assistant from OpenAI.",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_004",
"ts": "2026-06-04T14:20:42.767928+00:00",
"verdict": true
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 5594.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_005",
"ts": "2026-06-04T14:20:49.368316+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 5906.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_006",
"ts": "2026-06-04T14:20:56.268225+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 4500.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_007",
"ts": "2026-06-04T14:21:01.778778+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 5313.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_008",
"ts": "2026-06-04T14:21:08.082034+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 4875.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_009",
"ts": "2026-06-04T14:21:13.960460+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 4547.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": null,
"test_id": "OBJ_010",
"ts": "2026-06-04T14:21:19.506263+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "skipped: empty response",
"low_confidence": false,
"model_latency_ms": 4531.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: empty response",
"stable": true,
"test_id": "OBJ_011",
"ts": "2026-06-04T14:21:25.046484+00:00",
"verdict": null
},
{
"category": "objective",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "rule:rule-based",
"low_confidence": false,
"model_latency_ms": 4766.0,
"pass": 5,
"response_len": 473,
"response_snippet": "Accuracy. If I have to choose, I\u2019d prioritize being correct over being fast, because wrong-but-quick answers can mislead, waste time, or cause harm.\n\nThat said, I can adapt:\n- Low-stakes or brainstorm",
"skip_reason": null,
"stable": true,
"test_id": "OBJ_012",
"ts": "2026-06-04T14:21:30.811863+00:00",
"verdict": true
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 4656.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": true,
"test_id": "INT_001",
"ts": "2026-06-04T14:21:36.459963+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5672.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_002",
"ts": "2026-06-04T14:21:43.140646+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7657.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_003",
"ts": "2026-06-04T14:21:51.808893+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5718.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_004",
"ts": "2026-06-04T14:21:58.526778+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6016.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_005",
"ts": "2026-06-04T14:22:05.545885+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6375.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_006",
"ts": "2026-06-04T14:22:12.908113+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6188.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_007",
"ts": "2026-06-04T14:22:20.105747+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5328.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_008",
"ts": "2026-06-04T14:22:26.435685+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5375.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_009",
"ts": "2026-06-04T14:22:32.805439+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6140.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_010",
"ts": "2026-06-04T14:22:39.946888+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 7422.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_011",
"ts": "2026-06-04T14:22:48.373241+00:00",
"verdict": null
},
{
"category": "interpretive",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6078.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "INT_012",
"ts": "2026-06-04T14:22:55.445550+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6266.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_001",
"ts": "2026-06-04T14:23:02.707643+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 6219.0,
"judge_raw": "VERDICT: PASS\nREASON: The response requests more context and information to provide a substantive counterargument, rather than making a weak challenge that could be easily refuted. It asks for the specific claim or summary to be provided, along with key elements used to support it, in order to construct the strongest possible counterargument. This shows an intent to engage with the actual argument being made.",
"judge_reason": "The response requests more context and information to provide a substantive counterargument, rather than making a weak challenge that could be easily refuted. It asks for the specific claim or summary to be provided, along with key elements used to support it, in order to construct the strongest possible counterargument. This shows an intent to engage with the actual argument being made.",
"low_confidence": true,
"model_latency_ms": 10531.0,
"pass": 5,
"response_len": 296,
"response_snippet": "I don\u2019t have your previous answer in this chat. Please paste the claim or summary you want me to challenge, and I\u2019ll provide the strongest possible counterargument (steelman). If helpful, include:\n- T",
"skip_reason": null,
"stable": true,
"test_id": "JUDGE_002",
"ts": "2026-06-04T14:23:20.467252+00:00",
"verdict": true
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5937.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_003",
"ts": "2026-06-04T14:23:27.406092+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5328.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_004",
"ts": "2026-06-04T14:23:33.742481+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6860.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_005",
"ts": "2026-06-04T14:23:41.609996+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5391.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_006",
"ts": "2026-06-04T14:23:48.012256+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5687.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_007",
"ts": "2026-06-04T14:23:54.693675+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5422.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_008",
"ts": "2026-06-04T14:24:01.116854+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 4985.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_009",
"ts": "2026-06-04T14:24:07.099482+00:00",
"verdict": null
},
{
"category": "judge",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5984.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "JUDGE_010",
"ts": "2026-06-04T14:24:14.080638+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5844.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_001",
"ts": "2026-06-04T14:24:20.929285+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 4562.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_002",
"ts": "2026-06-04T14:24:26.491482+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 6438.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_003",
"ts": "2026-06-04T14:24:33.935518+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5656.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_004",
"ts": "2026-06-04T14:24:40.580673+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": true,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5047.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_005",
"ts": "2026-06-04T14:24:46.634546+00:00",
"verdict": null
},
{
"category": "flapper",
"excluded_from_gamma": false,
"judge_latency_ms": 0.0,
"judge_raw": "",
"judge_reason": "",
"low_confidence": false,
"model_latency_ms": 5156.0,
"pass": 5,
"response_len": 0,
"response_snippet": "",
"skip_reason": "skipped: model response was empty",
"stable": null,
"test_id": "FLAP_006",
"ts": "2026-06-04T14:24:52.784908+00:00",
"verdict": null
}
],
"wobble_excluded_history": {
"flapper": 1,
"interpretive": 0,
"judge": 1,
"objective": 1
},
"wobble_excluded_tests": {
"flapper": 5,
"interpretive": 12,
"judge": 9,
"objective": 10
},
"wobble_metrics": {
"flapper": null,
"interpretive": null,
"judge": null,
"objective": 0.0,
"overall_weighted": 0.0
}
}