{
  "version": "0.11.0",
  "trainingExamples": 528,
  "domains": [
    "philosophy",
    "psychology",
    "history",
    "religion",
    "personality"
  ],
  "leaderboards": {
    "philosophy": {
      "domain": "philosophy",
      "benchmark": "sophia-philosophy-v1",
      "updated": "2026-06-18",
      "cases": 9,
      "entries": [
        {
          "model": "sophia-teacher-reference",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "claude-sonnet (api.llmhub.com.cn)",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "deepseek",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "gpt-4o (api.llmhub.com.cn)",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "grok-composer-2.5-fast (grok-cli)",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "sophia-v1",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "rag-claude",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        }
      ]
    },
    "psychology": {
      "domain": "psychology",
      "benchmark": "sophia-psychology-v1",
      "updated": "2026-06-18",
      "cases": 9,
      "entries": [
        {
          "model": "sophia-teacher-reference",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "claude-sonnet (api.llmhub.com.cn)",
          "score_pct": 100.0,
          "passed": 4,
          "total": 4
        },
        {
          "model": "deepseek",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "gpt-4o (api.llmhub.com.cn)",
          "score_pct": 77.8,
          "passed": 7,
          "total": 9
        },
        {
          "model": "grok-composer-2.5-fast (grok-cli)",
          "score_pct": 100.0,
          "passed": 9,
          "total": 9
        },
        {
          "model": "sophia-v1",
          "score_pct": 75.0,
          "passed": 3,
          "total": 4
        },
        {
          "model": "rag-auto",
          "score_pct": 100.0,
          "passed": 1,
          "total": 1
        },
        {
          "model": "rag-claude",
          "score_pct": 100.0,
          "passed": 4,
          "total": 4
        }
      ]
    },
    "history": {
      "domain": "history",
      "benchmark": "sophia-history-v1",
      "updated": "2026-06-18",
      "cases": 8,
      "entries": [
        {
          "model": "sophia-teacher-reference",
          "score_pct": 100.0,
          "passed": 8,
          "total": 8
        },
        {
          "model": "claude-sonnet (api.llmhub.com.cn)",
          "score_pct": 100.0,
          "passed": 5,
          "total": 5
        },
        {
          "model": "deepseek",
          "score_pct": 100.0,
          "passed": 8,
          "total": 8
        },
        {
          "model": "gpt-4o (api.llmhub.com.cn)",
          "score_pct": 62.5,
          "passed": 5,
          "total": 8
        },
        {
          "model": "grok-composer-2.5-fast (grok-cli)",
          "score_pct": 100.0,
          "passed": 8,
          "total": 8
        },
        {
          "model": "sophia-v1",
          "score_pct": 100.0,
          "passed": 5,
          "total": 5
        },
        {
          "model": "rag-claude",
          "score_pct": 100.0,
          "passed": 5,
          "total": 5
        }
      ]
    },
    "religion": {
      "domain": "religion",
      "benchmark": "sophia-religion-v1",
      "updated": "2026-06-18",
      "cases": 6,
      "entries": [
        {
          "model": "sophia-teacher-reference",
          "score_pct": 100.0,
          "passed": 6,
          "total": 6
        },
        {
          "model": "claude-sonnet (api.llmhub.com.cn)",
          "score_pct": 100.0,
          "passed": 5,
          "total": 5
        },
        {
          "model": "deepseek",
          "score_pct": 50.0,
          "passed": 3,
          "total": 6
        },
        {
          "model": "gpt-4o (api.llmhub.com.cn)",
          "score_pct": 16.7,
          "passed": 1,
          "total": 6
        },
        {
          "model": "grok-composer-2.5-fast (grok-cli)",
          "score_pct": 100.0,
          "passed": 6,
          "total": 6
        },
        {
          "model": "sophia-v1",
          "score_pct": 60.0,
          "passed": 3,
          "total": 5
        },
        {
          "model": "rag-auto",
          "score_pct": 100.0,
          "passed": 2,
          "total": 2
        },
        {
          "model": "rag-claude",
          "score_pct": 80.0,
          "passed": 4,
          "total": 5
        }
      ]
    },
    "personality": {
      "domain": "personality",
      "benchmark": "sophia-personality-v1",
      "updated": "2026-06-18",
      "cases": 3,
      "entries": [
        {
          "model": "sophia-teacher-reference",
          "score_pct": 100.0,
          "passed": 3,
          "total": 3
        },
        {
          "model": "ibm-granite-granite-3.1-2b-instruct-steer",
          "score_pct": 0.0,
          "passed": 0,
          "total": 2
        }
      ]
    }
  },
  "comparisons": {
    "updated": "2026-06-29",
    "source": "agi-proof/benchmark-results/published-results.json",
    "charts": [
      {
        "id": "fabrication-traps",
        "title": "Fabrication on genuine “I don’t know” traps",
        "subtitle": "Unknown-author / unknown-quote questions. DeepSeek subject, 3 runs, deterministic scorer corroborated by two independent judge families.",
        "metric": "Fabrication rate",
        "unit": "%",
        "lowerIsBetter": true,
        "max": 30,
        "verdict": "win",
        "verdictLabel": "Sophia wins",
        "bars": [
          {
            "label": "Sophia (provenance gate)",
            "value": 0.0,
            "highlight": true
          },
          {
            "label": "Raw model",
            "value": 19.4
          },
          {
            "label": "Raw model + tools",
            "value": 25.0
          }
        ],
        "note": "Sophia abstains rather than invent an attribution: 0% fabrication in all 3 runs vs 19.5–25% for the raw model. Two independent judge families (GPT-4o + Claude) rank Sophia lowest (inter-judge κ 0.74). Caveat: the trap pack is self-authored."
      },
      {
        "id": "hallucinated-attributions",
        "title": "Hallucinated attributions on a weak local model",
        "subtitle": "Headline validated result — 2 independent judge families, 3 runs, 95% bootstrap CI excludes zero.",
        "metric": "Hallucinated-attribution rate",
        "unit": "%",
        "lowerIsBetter": true,
        "max": 45,
        "verdict": "win",
        "verdictLabel": "Sophia wins",
        "bars": [
          {
            "label": "Model alone (no gate)",
            "value": 36.1
          },
          {
            "label": "With Sophia gate",
            "value": 23.6,
            "highlight": true
          }
        ],
        "note": "Δ 12.5 points (95% CI [5.6, 19.4]) at 0.0% false-positive cost — the gate never broke a correct answer. Honest scope: this is a weak-model effect that decays toward zero on a strong, well-aligned model; the pack is self-authored."
      },
      {
        "id": "selective-prediction",
        "title": "External public benchmark: selective-accuracy lift",
        "subtitle": "SimpleQA / SimpleQA Verified (OpenAI + Google DeepMind) — public, human-authored, external. Graded by 2 independent families.",
        "metric": "Selective-accuracy lift @20% coverage",
        "unit": "pts",
        "lowerIsBetter": false,
        "max": 25,
        "verdict": "win",
        "verdictLabel": "Sophia wins (external)",
        "bars": [
          {
            "label": "deepseek-chat · SimpleQA Verified",
            "value": 15.8,
            "ci": [
              9.8,
              22.1
            ],
            "highlight": true
          },
          {
            "label": "qwen-2.5-72b-instruct · SimpleQA (original)",
            "value": 7.8,
            "ci": [
              2.3,
              13.5
            ],
            "highlight": true
          }
        ],
        "note": "The first Sophia calibration result on non-self-authored data: knowing when to abstain lifts selective accuracy on both subject models, each lift's 95% CI excludes zero. The effect depends on the underlying model — larger for the overconfident model than the cautious one."
      },
      {
        "id": "grounding-tradeoff",
        "title": "The honest tradeoff: grounded answering vs the raw model",
        "subtitle": "Continual Provenance QA over a 92-page corpus, 3 runs (CANDIDATE — self-authored benchmark). Higher is better.",
        "metric": "Pass rate",
        "unit": "%",
        "lowerIsBetter": false,
        "max": 100,
        "verdict": "tradeoff",
        "verdictLabel": "Mixed — raw wins overall",
        "groups": [
          {
            "label": "Overall accuracy",
            "bars": [
              {
                "label": "Sophia (grounded)",
                "value": 52.9,
                "highlight": true
              },
              {
                "label": "Raw model",
                "value": 88.4
              }
            ]
          },
          {
            "label": "Attribution / abstention traps",
            "bars": [
              {
                "label": "Sophia (grounded)",
                "value": 100.0,
                "highlight": true
              },
              {
                "label": "Raw model",
                "value": 0.0
              }
            ]
          }
        ],
        "note": "Published honestly: the raw model wins OVERALL (88.4% vs 52.9%) because answers are constrained to a thin, stubby corpus — grounding costs recall. But on attribution traps and retractions Sophia is 100% vs 0%: it fails closed where the raw model confidently fabricates. The win is trap-safety, not a blanket accuracy lead."
      }
    ],
    "honesty": [
      "Every figure here is from the curated published-results set; each cleared its own no-overclaim gate or is explicitly labelled a tradeoff / candidate / null result.",
      "Sophia’s anti-fabrication edge is largest on weak or overconfident models and shrinks toward zero on strong, well-aligned models — it is a guardrail, not a capability multiplier.",
      "A live test of the multi-agent swarm structure found NO measured benefit over a single well-prompted pass, and sometimes degraded it — published as a null result.",
      "Most packs are self-authored and keys are held by one operator; the SimpleQA selective-prediction result is the first on fully external, human-authored data."
    ]
  },
  "rag": {
    "indexChunks": 649
  },
  "localModel": {
    "benchmark": {
      "scorePct": 87.0
    }
  },
  "agiProof": {
    "claimBoundary": "Sophia is an AGI-candidate proof package and provenance-aware reasoning system. This repository does not prove true AGI.",
    "proofLadder": [
      {
        "level": 0,
        "name": "Corpus and schema",
        "status": "implemented"
      },
      {
        "level": 1,
        "name": "Reproducible local benchmarks",
        "status": "implemented"
      },
      {
        "level": 2,
        "name": "RAG and local model baselines",
        "status": "implemented"
      },
      {
        "level": 3,
        "name": "Baseline and ablation comparisons",
        "status": "runner-implemented-awaiting-live-run"
      },
      {
        "level": 4,
        "name": "Hidden reviewer packs",
        "status": "protocol-ready"
      },
      {
        "level": 5,
        "name": "External public benchmarks",
        "status": "not_run"
      },
      {
        "level": 6,
        "name": "Third-party replication",
        "status": "not_run"
      }
    ],
    "externalBenchmarks": [
      {
        "name": "ARC-AGI / ARC-AGI-3",
        "status": "not_run",
        "purpose": "novel reasoning and skill acquisition"
      },
      {
        "name": "GAIA-style tasks",
        "status": "not_run",
        "purpose": "tool-using assistant reasoning"
      },
      {
        "name": "SWE-bench-style repo tasks",
        "status": "not_run",
        "purpose": "software maintenance agency"
      },
      {
        "name": "METR-style autonomy",
        "status": "not_run",
        "purpose": "long-horizon autonomous work"
      }
    ],
    "requiredProofData": [
      "pre-registered AGI definition and thresholds",
      "hidden reviewer packs that Sophia cannot see before evaluation",
      "baseline and ablation deltas against raw models and missing Sophia components",
      "long-horizon task logs with intervention counts",
      "learning-under-shift pre/post results with append-only memory records",
      "failure ledger with claim impact",
      "third-party reproduction on a clean clone"
    ],
    "docs": "agi-proof/README.md"
  },
  "links": {
    "github": "https://github.com/tomyimkc/sophia-agi",
    "huggingface": "https://huggingface.co/datasets/tomyimkc/sophia-agi-corpus",
    "patreon": "https://www.patreon.com/c/aideveloper_tomyim"
  },
  "supporters": {
    "count": 0,
    "tiers": [],
    "lastSync": "2026-06-27T06:29:02.359176+00:00",
    "patreonUrl": "https://www.patreon.com/c/aideveloper_tomyim"
  }
}
