From bd953051b0f0465f545aeb6a8e1d95a25149f608 Mon Sep 17 00:00:00 2001
From: Tyrin Todd <tyrin@modaic.dev>
Date: Mon, 16 Feb 2026 13:41:05 -0800
Subject: [PATCH] (no commit message)

---
 README.md    | 312 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 config.json  |  55 +++++++++
 program.json |  44 ++++++++
 3 files changed, 410 insertions(+), 1 deletion(-)
 create mode 100644 config.json
 create mode 100644 program.json
diff --git a/README.md b/README.md
index 75b7602..77c405b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,312 @@
-# sentiment
+# Bench
 
+Modaic internal SDK for benchmarking judges and training confidence probes.
+
+## Installation
+
+```bash
+cd cli
+uv sync
+```
+
+## CLI Commands
+
+All commands are run from the `cli` directory via `uv run mo <command>`.
+
+### `create`
+
+Create benchmark datasets for training confidence probes. This command runs a judge on examples, extracts embeddings via Modal, and pushes the resulting dataset to HuggingFace Hub.
+
+**Subcommands:**
+
+- `create ppe` - Create dataset from PPE (human-preference + correctness) benchmarks
+- `create judge_bench` - Create dataset from the JudgeBench benchmark
+
+**Usage:**
+
+```bash
+# Interactive mode (recommended) - prompts for configuration
+uv run mo create ppe
+uv run mo create judge_bench
+
+# With config file
+uv run mo create ppe --config config.yaml
+uv run mo create judge_bench --config config.yaml
+```
+
+**Options:**
+
+| Option     | Short | Description                |
+| ---------- | ----- | -------------------------- |
+| `--config` | `-c`  | Path to config file (YAML) |
+
+**Config File Example:**
+
+```yaml
+judge: tyrin/ppe-judge-gepa
+output: tytodd/my-probe-dataset
+n_train: 500
+n_test: 100
+embedding_layer: -1  # -1 for middle layer
+```
+
+**What it does:**
+
+1. Loads examples from the benchmark dataset
+2. Runs the specified judge on each example to get predictions
+3. Extracts embeddings from the judge's LLM via Modal (GPU)
+4. Creates a HuggingFace dataset with columns: `question`, `response_a`, `response_b`, `label`, `predicted`, `messages`, `embeddings`
+5. Pushes to HuggingFace Hub
+
+---
+
+### `train`
+
+Train a confidence probe on an embeddings dataset created with `create`.
+
+**Usage:**
+
+```bash
+# Interactive mode (recommended) - prompts for all configuration
+uv run mo train
+
+# With config file
+uv run mo train --config config.yaml
+
+# With CLI arguments
+uv run mo train --dataset tytodd/my-embeddings --epochs 10 --lr 0.0001
+```
+
+**Options:**
+
+| Option           | Short | Description                                                                       | Default           |
+| ---------------- | ----- | --------------------------------------------------------------------------------- | ----------------- |
+| `--config`       | `-c`  | Path to config file (YAML)                                                        | -                 |
+| `--dataset`      | `-d`  | Dataset path (HuggingFace Hub or local); must be a dataset created with `create`  | -                 |
+| `--model-path`   | `-m`  | Output path for trained model                                                     | `{dataset}_probe` |
+| `--batch-size`   |       | Batch size                                                                        | 4                 |
+| `--epochs`       |       | Number of training epochs                                                         | 10                |
+| `--lr`           |       | Learning rate                                                                     | 0.0001            |
+| `--weight-decay` |       | Weight decay                                                                      | 0.01              |
+| `--test-size`    |       | Validation split ratio (if no test split)                                         | 0.2               |
+| `--seed`         |       | Random seed                                                                       | 42                |
+| `--project`      |       | W&B project name                                                                  | model_path        |
+| `--hub-path`     |       | HuggingFace Hub path to push model                                                | -                 |
+
+**Config File Example:**
+
+```yaml
+dataset_path: tytodd/my-probe-dataset
+model_path: ./best_probe
+hub_path: tytodd/my-probe  # Optional: push to HF Hub
+batch_size: 4
+epochs: 10
+learning_rate: 0.0001
+weight_decay: 0.01
+test_size: 0.2
+seed: 42
+```
+
+**What it does:**
+
+1. Loads an embeddings dataset (from HuggingFace Hub or local)
+2. Creates binary labels: 1 if `predicted == label`, 0 otherwise
+3. Trains a linear probe using MSE loss (Brier score optimization)
+4. Logs metrics to Weights & Biases (Brier, ECE, MCE, Kuiper, AUROC)
+5. Saves the best model based on validation Brier score
+6. Optionally pushes to HuggingFace Hub
+
+---
+
+### `eval`
+
+Evaluate a trained confidence probe on a dataset. Computes calibration and discrimination metrics.
+
+**Usage:**
+
+```bash
+# Interactive mode (recommended) - prompts for probe and dataset
+uv run mo eval
+
+# With CLI arguments
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings
+
+# Evaluate on train split instead of test
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings --split train
+```
+
+**Options:**
+
+| Option                       | Short | Description                              | Default      |
+| ---------------------------- | ----- | ---------------------------------------- | ------------ |
+| `--probe`                    | `-p`  | Probe path (HuggingFace Hub or local)    | -            |
+| `--dataset`                  | `-d`  | Dataset path (HuggingFace Hub or local)  | -            |
+| `--split`                    | `-s`  | Dataset split to evaluate on             | test         |
+| `--batch-size`               | `-b`  | Batch size for evaluation                | 64           |
+| `--normalize/--no-normalize` | `-n`  | Normalize embeddings with StandardScaler | probe config |
+
+**Metrics computed:**
+
+| Metric      | Description                                       |
+| ----------- | ------------------------------------------------- |
+| Brier Score | Mean squared error between predictions and labels |
+| Accuracy    | Classification accuracy at 0.5 threshold          |
+| F1 Score    | Harmonic mean of precision and recall             |
+| ECE         | Expected Calibration Error (10 bins)              |
+| MCE         | Maximum Calibration Error                         |
+| Kuiper      | Kuiper statistic for calibration                  |
+| AUROC       | Area Under the ROC Curve (discrimination)         |
+
+**What it does:**
+
+1. Loads a pretrained probe from HuggingFace Hub or local path
+2. Loads a dataset created with `create`
+3. Creates binary labels: 1 if `predicted == label`, 0 otherwise
+4. Runs inference and computes calibration/discrimination metrics
+5. Displays results in a formatted table
+
+---
+
+### `compile`
+
+Compile (optimize) a judge using GEPA over a dataset. GEPA iteratively improves the judge's prompt based on training examples.
+
+**Subcommands:**
+
+- `compile` (base) - Compile with a custom dataset and parameter mapping
+- `compile ppe` - Compile specifically for PPE datasets (human-preference + correctness)
+
+**Usage:**
+
+```bash
+# Interactive mode
+uv run mo compile
+uv run mo compile ppe
+
+# With config file
+uv run mo compile --config config.yaml
+uv run mo compile ppe --config config.yaml
+```
+
+**Options:**
+
+| Option     | Short | Description                |
+| ---------- | ----- | -------------------------- |
+| `--config` | `-c`  | Path to config file (YAML) |
+
+**Config File Example:**
+
+```yaml
+judge: tyrin/ppe-judge
+dataset: tytodd/ppe-human-preference
+inputs: # selects which input columns of the dataset to use (not necessary if using a compile subcommand like ppe or judge_bench)
+  - name: question
+  - name: response_a
+    column: response_A  # Map param name to dataset column
+  - name: response_b
+    column: response_B
+label_column: label
+n_train: 100
+n_val: 50
+base_model: gpt-4o-mini
+reflection_model: gpt-4o
+output: tyrin/ppe-judge-gepa
+seed: 42
+```
+
+**What it does:**
+
+1. Loads a judge from Modaic Hub
+2. Loads training/validation examples from a HuggingFace dataset
+3. Maps judge parameters to dataset columns
+4. Runs GEPA optimization to improve the judge's prompt
+5. Pushes the optimized judge to Modaic Hub
+
+---
+
+### `embed`
+
+Regenerate embeddings for an existing dataset using a different model or layer. Useful for experimenting with different embedding configurations without re-running the judge.
+
+**Usage:**
+
+```bash
+# Interactive mode
+uv run mo embed
+
+# With CLI arguments
+uv run mo embed --dataset tytodd/my-dataset --hf-model Qwen/Qwen3-VL-32B-Instruct --layer -1
+```
+
+**Options:**
+
+| Option       | Short | Description                              |
+| ------------ | ----- | ---------------------------------------- |
+| `--dataset`  | `-d`  | Dataset path (HuggingFace Hub or local)  |
+| `--hf-model` | `-m`  | HuggingFace model path for embeddings    |
+| `--layer`    | `-l`  | Hidden layer index (-1 for middle layer) |
+
+**What it does:**
+
+1. Loads an existing dataset (must have a `messages` column)
+2. Regenerates embeddings using the specified model/layer via Modal
+3. Replaces the `embeddings` column in the dataset
+4. Prompts to push the updated dataset to HuggingFace Hub
+
+**Example workflow:**
+
+```bash
+# Original dataset was created with layer 32
+# Now try middle layer instead
+uv run mo embed \
+  --dataset tytodd/my-embeddings \
+  --hf-model Qwen/Qwen3-VL-32B-Instruct \
+  --layer -1
+```
+
+---
+
+## Recommended Embedding Layers
+
+When extracting embeddings, use these recommended layer indices for best probe performance:
+
+| Model         | HuggingFace Path                    | Recommended Layer |
+| ------------- | ----------------------------------- | ----------------- |
+| GPT-OSS 20B   | `openai/gpt-oss-20b`                | 8                 |
+| Qwen3-VL 32B  | `Qwen/Qwen3-VL-32B-Instruct`        | 16                |
+| Llama 3.3 70B | `meta-llama/Llama-3.3-70B-Instruct` | 32                |
+
+Use `-1` for the middle layer if experimenting with an unlisted model.
+
+---
+
+## Typical Workflow
+
+```bash
+# 1. Create a probe dataset from a benchmark
+uv run mo create ppe
+
+# 2. Train a confidence probe
+uv run mo train --dataset tytodd/ppe-qwen3-embeddings
+
+# 3. Evaluate the probe on a test set
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/ppe-qwen3-embeddings
+
+# 4. (Optional) Compile/optimize a judge with GEPA
+uv run mo compile ppe
+
+# 5. (Optional) Re-embed with different layer
+uv run mo embed --dataset tytodd/my-dataset --layer 32
+```
+
+## Environment Variables
+
+Create a `.env` file with:
+
+```bash
+OPENAI_API_KEY=...
+WANDB_API_KEY=...
+HF_TOKEN=...
+MODAIC_TOKEN=...
+TOGETHER_API_KEY=...
+```
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..653c931
--- /dev/null
+++ b/config.json
@@ -0,0 +1,55 @@
+{
+  "model": null,
+  "signature": {
+    "description": "Given a review title and content, determine the overall sentiment.\n\nTask: Analyze the text to determine if the reviewer has a positive or negative\nopinion about the product/service.\n\n- positive: The review expresses satisfaction, appreciation, or a favorable view\n- negative: The review expresses dissatisfaction, criticism, or an unfavorable view\n\nConsider:\n1. Overall tone and emotional language\n2. Whether the reviewer recommends the product\n3. Specific praise or complaints mentioned\n\nFirst reason through your thought process in the `reasoning` field.\nBe sure to verbalize any uncertainty in your thought process.\nThen output your conclusion in the `label` field.",
+    "properties": {
+      "title": {
+        "__dspy_field_type": "input",
+        "anyOf": [
+          {
+            "type": "string"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null,
+        "desc": "The title of the review",
+        "prefix": "Title:",
+        "title": "Title"
+      },
+      "content": {
+        "__dspy_field_type": "input",
+        "desc": "The full review text/content",
+        "prefix": "Content:",
+        "title": "Content",
+        "type": "string"
+      },
+      "reasoning": {
+        "__dspy_field_type": "output",
+        "desc": "Your step by step reasoning for the sentiment classification. Verbalize uncertainty.",
+        "prefix": "Reasoning:",
+        "title": "Reasoning",
+        "type": "string"
+      },
+      "label": {
+        "__dspy_field_type": "output",
+        "desc": "The sentiment: 'positive' or 'negative'",
+        "enum": [
+          "positive",
+          "negative"
+        ],
+        "prefix": "Label:",
+        "title": "Label",
+        "type": "string"
+      }
+    },
+    "required": [
+      "content",
+      "reasoning",
+      "label"
+    ],
+    "title": "Sentiment",
+    "type": "object"
+  }
+}
\ No newline at end of file
diff --git a/program.json b/program.json
new file mode 100644
index 0000000..0dce119
--- /dev/null
+++ b/program.json
@@ -0,0 +1,44 @@
+{
+  "traces": [],
+  "train": [],
+  "demos": [],
+  "signature": {
+    "instructions": "Given a review title and content, determine the overall sentiment.\n\nTask: Analyze the text to determine if the reviewer has a positive or negative\nopinion about the product/service.\n\n- positive: The review expresses satisfaction, appreciation, or a favorable view\n- negative: The review expresses dissatisfaction, criticism, or an unfavorable view\n\nConsider:\n1. Overall tone and emotional language\n2. Whether the reviewer recommends the product\n3. Specific praise or complaints mentioned\n\nFirst reason through your thought process in the `reasoning` field.\nBe sure to verbalize any uncertainty in your thought process.\nThen output your conclusion in the `label` field.",
+    "fields": [
+      {
+        "prefix": "Title:",
+        "description": "The title of the review"
+      },
+      {
+        "prefix": "Content:",
+        "description": "The full review text/content"
+      },
+      {
+        "prefix": "Reasoning:",
+        "description": "Your step by step reasoning for the sentiment classification. Verbalize uncertainty."
+      },
+      {
+        "prefix": "Label:",
+        "description": "The sentiment: 'positive' or 'negative'"
+      }
+    ]
+  },
+  "lm": {
+    "model": "together_ai/Qwen/Qwen3-VL-32B-Instruct",
+    "model_type": "chat",
+    "cache": true,
+    "num_retries": 3,
+    "finetuning_model": null,
+    "launch_kwargs": {},
+    "train_kwargs": {},
+    "temperature": null,
+    "max_tokens": null
+  },
+  "metadata": {
+    "dependency_versions": {
+      "python": "3.11",
+      "dspy": "3.1.2",
+      "cloudpickle": "3.1"
+    }
+  }
+}
\ No newline at end of file