
Commit 779139c

Dev/steven/jb eval (#55)
* Multi-turn Jailbreak eval results
* Delete old jailbreak ROC curve image
* Update data description
* Data description correction
1 parent 967337c commit 779139c

File tree: 4 files changed (+34, -22 lines)

Binary ROC curve images: 214 KB added, 205 KB deleted (binary files not shown).

docs/ref/checks/jailbreak.md (17 additions, 14 deletions)

@@ -96,37 +96,40 @@ When conversation history is available (e.g., in chat applications or agent work
 
 ### Dataset Description
 
-This benchmark evaluates model performance on a diverse set of prompts:
+This benchmark combines multiple public datasets and synthetic benign conversations:
 
-- **Subset of the open source jailbreak dataset [JailbreakV-28k](https://huggingface.co/datasets/JailbreakV-28K/JailBreakV-28k)** (n=2,000)
-- **Synthetic prompts** covering a diverse range of benign topics (n=1,000)
-- **Open source [Toxicity](https://github.com/surge-ai/toxicity/blob/main/toxicity_en.csv) dataset** containing harmful content that does not involve jailbreak attempts (n=1,000)
+- **Red Queen jailbreak corpus ([GitHub](https://github.com/kriti-hippo/red_queen/blob/main/Data/Red_Queen_Attack.zip))**: 14,000 positive samples collected with gpt-4o attacks.
+- **Tom Gibbs multi-turn jailbreak attacks ([Hugging Face](https://huggingface.co/datasets/tom-gibbs/multi-turn_jailbreak_attack_datasets/tree/main))**: 4,136 positive samples.
+- **Scale MHJ dataset ([Hugging Face](https://huggingface.co/datasets/ScaleAI/mhj))**: 537 positive samples.
+- **Synthetic benign conversations**: 12,433 negative samples generated by seeding prompts from [WildGuardMix](https://huggingface.co/datasets/allenai/wildguardmix?utm_source=chatgpt.com) where `adversarial=false` and `prompt_harm_label=false`, then expanding each single-turn input into five-turn dialogues using gpt-4.1.
 
-**Total n = 4,000; positive class prevalence = 2,000 (50.0%)**
+**Total n = 31,106; positives = 18,673; negatives = 12,433**
+
+For benchmarking, we randomly sampled 4,000 conversations from this pool using a 50/50 split between positive and negative samples.
 
 ### Results
 
 #### ROC Curve
 
-![ROC Curve](../../benchmarking/jailbreak_roc_curve.png)
+![ROC Curve](../../benchmarking/Jailbreak_roc_curves.png)
 
 #### Metrics Table
 
 | Model | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
 |--------------|---------|-------------|-------------|-------------|-----------------|
-| gpt-5 | 0.982 | 0.984 | 0.977 | 0.977 | 0.743 |
-| gpt-5-mini | 0.980 | 0.980 | 0.976 | 0.975 | 0.734 |
-| gpt-4.1 | 0.979 | 0.975 | 0.975 | 0.975 | 0.661 |
-| gpt-4.1-mini (default) | 0.979 | 0.974 | 0.972 | 0.972 | 0.654 |
+| gpt-5 | 0.994 | 0.993 | 0.993 | 0.993 | 0.997 |
+| gpt-5-mini | 0.813 | 0.832 | 0.832 | 0.832 | 0.000 |
+| gpt-4.1 | 0.999 | 0.999 | 0.999 | 0.999 | 1.000 |
+| gpt-4.1-mini (default) | 0.928 | 0.968 | 0.968 | 0.500 | 0.000 |
 
 #### Latency Performance
 
 | Model | TTC P50 (ms) | TTC P95 (ms) |
 |--------------|--------------|--------------|
-| gpt-5 | 4,569 | 7,256 |
-| gpt-5-mini | 5,019 | 9,212 |
-| gpt-4.1 | 841 | 1,861 |
-| gpt-4.1-mini | 749 | 1,291 |
+| gpt-5 | 7,370 | 12,218 |
+| gpt-5-mini | 7,055 | 11,579 |
+| gpt-4.1 | 2,998 | 4,204 |
+| gpt-4.1-mini | 1,538 | 2,089 |
 
 **Notes:**
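For readers reproducing the metrics table, here is a minimal sketch of how Prec@R and Recall@FPR operating points can be derived from per-sample labels and confidence scores. The helper names below are illustrative, not part of this repo:

```python
from sklearn.metrics import precision_recall_curve, roc_curve

def precision_at_recall(y_true, y_scores, target_recall: float) -> float:
    """Best precision among thresholds whose recall is >= target_recall."""
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    return float(precision[recall >= target_recall].max())

def recall_at_fpr(y_true, y_scores, target_fpr: float) -> float:
    """Best recall (TPR) among thresholds whose FPR is <= target_fpr."""
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    return float(tpr[fpr <= target_fpr].max())

# Usage against eval outputs (hypothetical arrays):
# precision_at_recall(y_true, y_scores, 0.80)  # Prec@R=0.80
# recall_at_fpr(y_true, y_scores, 0.01)        # Recall@FPR=0.01
```

Under this reading, a Recall@FPR=0.01 of 0.000 (as reported for gpt-5-mini and gpt-4.1-mini) can simply mean that no non-trivial threshold achieves a false-positive rate at or below 1%, which is common when confidence scores cluster at a few discrete values.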

src/guardrails/evals/core/visualizer.py (17 additions, 8 deletions)

@@ -12,6 +12,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import seaborn as sns
+from sklearn.metrics import roc_auc_score, roc_curve
 
 logger = logging.getLogger(__name__)
 

@@ -111,10 +112,8 @@ def create_roc_curves(self, results_by_model: dict[str, list[Any]], guardrail_na
                 continue
 
             try:
-                from sklearn.metrics import roc_curve
-
                 fpr, tpr, _ = roc_curve(y_true, y_scores)
-                roc_auc = np.trapz(tpr, fpr)
+                roc_auc = roc_auc_score(y_true, y_scores)
                 ax.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.3f})", linewidth=2)
             except Exception as e:
                 logger.error("Failed to calculate ROC curve for model %s: %s", model_name, e)
144143
y_scores = []
145144

146145
for result in results:
147-
if guardrail_name in result.expected_triggers:
148-
expected = result.expected_triggers[guardrail_name]
149-
actual = result.triggered.get(guardrail_name, False)
146+
if guardrail_name not in result.expected_triggers:
147+
logger.warning("Guardrail '%s' not found in expected_triggers for sample %s", guardrail_name, result.id)
148+
continue
150149

151-
y_true.append(1 if expected else 0)
152-
y_scores.append(1 if actual else 0)
150+
expected = result.expected_triggers[guardrail_name]
151+
y_true.append(1 if expected else 0)
152+
y_scores.append(self._get_confidence_score(result, guardrail_name))
153153

154154
return y_true, y_scores
155155

156+
def _get_confidence_score(self, result: Any, guardrail_name: str) -> float:
157+
"""Extract the model-reported confidence score for plotting."""
158+
if guardrail_name in result.details:
159+
guardrail_details = result.details[guardrail_name]
160+
if isinstance(guardrail_details, dict) and "confidence" in guardrail_details:
161+
return float(guardrail_details["confidence"])
162+
163+
return 1.0 if result.triggered.get(guardrail_name, False) else 0.0
164+
156165
def create_latency_comparison_chart(self, latency_results: dict[str, dict[str, Any]]) -> Path:
157166
"""Create a chart comparing latency across models."""
158167
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
