From bd953051b0f0465f545aeb6a8e1d95a25149f608 Mon Sep 17 00:00:00 2001 From: Tyrin Todd Date: Mon, 16 Feb 2026 13:41:05 -0800 Subject: [PATCH] (no commit message) --- README.md | 312 ++++++++++++++++++++++++++++++++++++++++++++++++++- config.json | 55 +++++++++ program.json | 44 ++++++++ 3 files changed, 410 insertions(+), 1 deletion(-) create mode 100644 config.json create mode 100644 program.json diff --git a/README.md b/README.md index 75b7602..77c405b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,312 @@ -# sentiment +# Bench +Modaic internal SDK for benchmarking judges and training confidence probes. + +## Installation + +```bash +cd cli +uv sync +``` + +## CLI Commands + +All commands are run from the `cli` directory via `uv run mo <command>`. + +### `create` + +Create benchmark datasets for training confidence probes. This command runs a judge on examples, extracts embeddings via Modal, and pushes the resulting dataset to HuggingFace Hub. + +**Subcommands:** + +- `create ppe` - Create dataset from PPE (human-preference + correctness) benchmarks +- `create judge_bench` - Create dataset from the JudgeBench benchmark + +**Usage:** + +```bash +# Interactive mode (recommended) - prompts for configuration +uv run mo create ppe +uv run mo create judge_bench + +# With config file +uv run mo create ppe --config config.yaml +uv run mo create judge_bench --config config.yaml +``` + +**Options:** + +| Option | Short | Description | +| ---------- | ----- | -------------------------- | +| `--config` | `-c` | Path to config file (YAML) | + +**Config File Example:** + +```yaml +judge: tyrin/ppe-judge-gepa +output: tytodd/my-probe-dataset +n_train: 500 +n_test: 100 +embedding_layer: -1 # -1 for middle layer +``` + +**What it does:** + +1. Loads examples from the benchmark dataset +2. Runs the specified judge on each example to get predictions +3. Extracts embeddings from the judge's LLM via Modal (GPU) +4. 
Creates a HuggingFace dataset with columns: `question`, `response_a`, `response_b`, `label`, `predicted`, `messages`, `embeddings` +5. Pushes to HuggingFace Hub + +--- + +### `train` + +Train a confidence probe on an embeddings dataset created with `create`. + +**Usage:** + +```bash +# Interactive mode (recommended) - prompts for all configuration +uv run mo train + +# With config file +uv run mo train --config config.yaml + +# With CLI arguments +uv run mo train --dataset tytodd/my-embeddings --epochs 10 --lr 0.0001 +``` + +**Options:** + +| Option | Short | Description | Default | +| ---------------- | ----- | --------------------------------------------------------------------------------- | ----------------- | +| `--config` | `-c` | Path to config file (YAML) | - | +| `--dataset` | `-d` | Dataset path (HuggingFace Hub or local) (must be a dataset created with `create`) | - | +| `--model-path` | `-m` | Output path for trained model | `{dataset}_probe` | +| `--batch-size` | | Batch size | 4 | +| `--epochs` | | Number of training epochs | 10 | +| `--lr` | | Learning rate | 0.0001 | +| `--weight-decay` | | Weight decay | 0.01 | +| `--test-size` | | Validation split ratio (if no test split) | 0.2 | +| `--seed` | | Random seed | 42 | +| `--project` | | W&B project name | model_path | +| `--hub-path` | | HuggingFace Hub path to push model | - | + +**Config File Example:** + +```yaml +dataset_path: tytodd/my-probe-dataset +model_path: ./best_probe +hub_path: tytodd/my-probe # Optional: push to HF Hub +batch_size: 4 +epochs: 10 +learning_rate: 0.0001 +weight_decay: 0.01 +test_size: 0.2 +seed: 42 +``` + +**What it does:** + +1. Loads an embeddings dataset (from HuggingFace Hub or local) +2. Creates binary labels: 1 if `predicted == label`, 0 otherwise +3. Trains a linear probe using MSE loss (Brier score optimization) +4. Logs metrics to Weights & Biases (Brier, ECE, MCE, Kuiper, AUROC) +5. Saves the best model based on validation Brier score +6. 
Optionally pushes to HuggingFace Hub + +--- + +### `eval` + +Evaluate a trained confidence probe on a dataset. Computes calibration and discrimination metrics. + +**Usage:** + +```bash +# Interactive mode (recommended) - prompts for probe and dataset +uv run mo eval + +# With CLI arguments +uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings + +# Evaluate on train split instead of test +uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings --split train +``` + +**Options:** + +| Option | Short | Description | Default | +| ---------------------------- | ----- | ---------------------------------------- | ------------ | +| `--probe` | `-p` | Probe path (HuggingFace Hub or local) | - | +| `--dataset` | `-d` | Dataset path (HuggingFace Hub or local) | - | +| `--split` | `-s` | Dataset split to evaluate on | test | +| `--batch-size` | `-b` | Batch size for evaluation | 64 | +| `--normalize/--no-normalize` | `-n` | Normalize embeddings with StandardScaler | probe config | + +**Metrics computed:** + +| Metric | Description | +| ----------- | ------------------------------------------------- | +| Brier Score | Mean squared error between predictions and labels | +| Accuracy | Classification accuracy at 0.5 threshold | +| F1 Score | Harmonic mean of precision and recall | +| ECE | Expected Calibration Error (10 bins) | +| MCE | Maximum Calibration Error | +| Kuiper | Kuiper statistic for calibration | +| AUROC | Area Under the ROC Curve (discrimination) | + +**What it does:** + +1. Loads a pretrained probe from HuggingFace Hub or local path +2. Loads a dataset created with `create` +3. Creates binary labels: 1 if `predicted == label`, 0 otherwise +4. Runs inference and computes calibration/discrimination metrics +5. Displays results in a formatted table + +--- + +### `compile` + +Compile (optimize) a judge using GEPA over a dataset. GEPA iteratively improves the judge's prompt based on training examples. 
+ +**Subcommands:** + +- `compile` (base) - Compile with custom dataset and parameter mapping +- `compile ppe` - Compile specifically for PPE datasets (human-preference + correctness) + +**Usage:** + +```bash +# Interactive mode +uv run mo compile +uv run mo compile ppe + +# With config file +uv run mo compile --config config.yaml +uv run mo compile ppe --config config.yaml +``` + +**Options:** + +| Option | Short | Description | +| ---------- | ----- | -------------------------- | +| `--config` | `-c` | Path to config file (YAML) | + +**Config File Example:** + +```yaml +judge: tyrin/ppe-judge +dataset: tytodd/ppe-human-preference +inputs: # selects which input columns of the dataset to use (not necessary if using a compile subcommand like ppe or judge_bench) + - name: question + - name: response_a + column: response_A # Map param name to dataset column + - name: response_b + column: response_B +label_column: label +n_train: 100 +n_val: 50 +base_model: gpt-4o-mini +reflection_model: gpt-4o +output: tyrin/ppe-judge-gepa +seed: 42 +``` + +**What it does:** + +1. Loads a judge from Modaic Hub +2. Loads training/validation examples from a HuggingFace dataset +3. Maps judge parameters to dataset columns +4. Runs GEPA optimization to improve the judge's prompt +5. Pushes the optimized judge to Modaic Hub + +--- + +### `embed` + +Regenerate embeddings for an existing dataset using a different model or layer. Useful for experimenting with different embedding configurations without re-running the judge. 
+ +**Usage:** + +```bash +# Interactive mode +uv run mo embed + +# With CLI arguments +uv run mo embed --dataset tytodd/my-dataset --hf-model Qwen/Qwen3-VL-32B-Instruct --layer -1 +``` + +**Options:** + +| Option | Short | Description | +| ------------ | ----- | ---------------------------------------- | +| `--dataset` | `-d` | Dataset path (HuggingFace Hub or local) | +| `--hf-model` | `-m` | HuggingFace model path for embeddings | +| `--layer` | `-l` | Hidden layer index (-1 for middle layer) | + +**What it does:** + +1. Loads an existing dataset (must have a `messages` column) +2. Regenerates embeddings using the specified model/layer via Modal +3. Replaces the `embeddings` column in the dataset +4. Prompts to push the updated dataset to HuggingFace Hub + +**Example workflow:** + +```bash +# Original dataset was created with layer 32 +# Now try middle layer instead +uv run mo embed \ + --dataset tytodd/my-embeddings \ + --hf-model Qwen/Qwen3-VL-32B-Instruct \ + --layer -1 +``` + +--- + +## Recommended Embedding Layers + +When extracting embeddings, use these recommended layer indices for best probe performance: + +| Model | HuggingFace Path | Recommended Layer | +| ------------- | ----------------------------------- | ----------------- | +| GPT-OSS 20B | `openai/gpt-oss-20b` | 8 | +| Qwen3-VL 32B | `Qwen/Qwen3-VL-32B-Instruct` | 16 | +| Llama 3.3 70B | `meta-llama/Llama-3.3-70B-Instruct` | 32 | + +Use `-1` for the middle layer if experimenting with an unlisted model. + +--- + +## Typical Workflow + +```bash +# 1. Create a probe dataset from a benchmark +uv run mo create ppe + +# 2. Train a confidence probe +uv run mo train --dataset tytodd/ppe-qwen3-embeddings + +# 3. Evaluate the probe on a test set +uv run mo eval --probe tytodd/my-probe --dataset tytodd/ppe-qwen3-embeddings + +# 4. (Optional) Compile/optimize a judge with GEPA +uv run mo compile ppe + +# 5. 
(Optional) Re-embed with different layer +uv run mo embed --dataset tytodd/my-dataset --layer 32 +``` + +## Environment Variables + +Create a `.env` file with: + +```bash +OPENAI_API_KEY=... +WANDB_API_KEY=... +HF_TOKEN=... +MODAIC_TOKEN=... +TOGETHER_API_KEY=... +``` diff --git a/config.json b/config.json new file mode 100644 index 0000000..653c931 --- /dev/null +++ b/config.json @@ -0,0 +1,55 @@ +{ + "model": null, + "signature": { + "description": "Given a review title and content, determine the overall sentiment.\n\nTask: Analyze the text to determine if the reviewer has a positive or negative\nopinion about the product/service.\n\n- positive: The review expresses satisfaction, appreciation, or a favorable view\n- negative: The review expresses dissatisfaction, criticism, or an unfavorable view\n\nConsider:\n1. Overall tone and emotional language\n2. Whether the reviewer recommends the product\n3. Specific praise or complaints mentioned\n\nFirst reason through your thought process in the `reasoning` field.\nBe sure to verbalize any uncertainty in your thought process.\nThen output your conclusion in the `label` field.", + "properties": { + "title": { + "__dspy_field_type": "input", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "desc": "The title of the review", + "prefix": "Title:", + "title": "Title" + }, + "content": { + "__dspy_field_type": "input", + "desc": "The full review text/content", + "prefix": "Content:", + "title": "Content", + "type": "string" + }, + "reasoning": { + "__dspy_field_type": "output", + "desc": "Your step by step reasoning for the sentiment classification. 
Verbalize uncertainty.", + "prefix": "Reasoning:", + "title": "Reasoning", + "type": "string" + }, + "label": { + "__dspy_field_type": "output", + "desc": "The sentiment: 'positive' or 'negative'", + "enum": [ + "positive", + "negative" + ], + "prefix": "Label:", + "title": "Label", + "type": "string" + } + }, + "required": [ + "content", + "reasoning", + "label" + ], + "title": "Sentiment", + "type": "object" + } +} \ No newline at end of file diff --git a/program.json b/program.json new file mode 100644 index 0000000..0dce119 --- /dev/null +++ b/program.json @@ -0,0 +1,44 @@ +{ + "traces": [], + "train": [], + "demos": [], + "signature": { + "instructions": "Given a review title and content, determine the overall sentiment.\n\nTask: Analyze the text to determine if the reviewer has a positive or negative\nopinion about the product/service.\n\n- positive: The review expresses satisfaction, appreciation, or a favorable view\n- negative: The review expresses dissatisfaction, criticism, or an unfavorable view\n\nConsider:\n1. Overall tone and emotional language\n2. Whether the reviewer recommends the product\n3. Specific praise or complaints mentioned\n\nFirst reason through your thought process in the `reasoning` field.\nBe sure to verbalize any uncertainty in your thought process.\nThen output your conclusion in the `label` field.", + "fields": [ + { + "prefix": "Title:", + "description": "The title of the review" + }, + { + "prefix": "Content:", + "description": "The full review text/content" + }, + { + "prefix": "Reasoning:", + "description": "Your step by step reasoning for the sentiment classification. Verbalize uncertainty." 
+ }, + { + "prefix": "Label:", + "description": "The sentiment: 'positive' or 'negative'" + } + ] + }, + "lm": { + "model": "together_ai/Qwen/Qwen3-VL-32B-Instruct", + "model_type": "chat", + "cache": true, + "num_retries": 3, + "finetuning_model": null, + "launch_kwargs": {}, + "train_kwargs": {}, + "temperature": null, + "max_tokens": null + }, + "metadata": { + "dependency_versions": { + "python": "3.11", + "dspy": "3.1.2", + "cloudpickle": "3.1" + } + } +} \ No newline at end of file