Source code for incerto.calibration.metrics

import torch
import torch.nn.functional as F
import numpy as np

from .utils import get_bin_stats


def nll(logits: torch.Tensor, labels: torch.Tensor) -> float:
    """
    Negative Log-Likelihood (cross-entropy) averaged over samples.
    """
    return F.cross_entropy(logits, labels, reduction="mean").item()
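

# Illustrative sanity check (a sketch, not part of the library API): with
# all-zero logits every class gets probability 1/C, so the NLL is log(C)
# regardless of the labels. Shapes below are arbitrary.
def _example_nll_uniform() -> None:
    n, c = 8, 4
    logits = torch.zeros(n, c)
    labels = torch.randint(0, c, (n,))
    assert abs(nll(logits, labels) - float(np.log(c))) < 1e-5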


def brier_score(logits: torch.Tensor, labels: torch.Tensor) -> float:
    """
    Brier score: mean squared error between one-hot labels and predicted
    probabilities.
    """
    probs = F.softmax(logits, dim=1).detach().cpu().numpy()
    labels_np = labels.detach().cpu().numpy()
    n_samples, n_classes = probs.shape
    one_hot = np.eye(n_classes)[labels_np]
    return float(np.mean(np.sum((probs - one_hot) ** 2, axis=1)))
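

# Illustrative sketch (not part of the module): two Brier-score edge cases.
# A near-one-hot correct prediction scores ~0, while a uniform prediction over
# C classes scores (C - 1) / C; the tensors below are made-up examples.
def _example_brier_edge_cases() -> None:
    confident = torch.tensor([[20.0, -20.0], [-20.0, 20.0]])
    labels = torch.tensor([0, 1])
    assert brier_score(confident, labels) < 1e-6

    uniform = torch.zeros(5, 4)               # uniform predictions, C = 4
    any_labels = torch.randint(0, 4, (5,))
    assert abs(brier_score(uniform, any_labels) - 3.0 / 4.0) < 1e-6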


def ece_score(
    logits: torch.Tensor, labels: torch.Tensor, n_bins: int = 10
) -> float:
    """
    Expected Calibration Error (ECE).
    """
    probs = F.softmax(logits, dim=1).detach().cpu().numpy()
    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = (predictions == labels.detach().cpu().numpy()).astype(float)
    bin_conf, bin_acc, weight = get_bin_stats(confidences, accuracies, n_bins)
    return float(np.sum(weight * np.abs(bin_acc - bin_conf)))
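

# Illustrative usage sketch (not part of the module): ECE of a deliberately
# miscalibrated classifier. The labels are drawn independently of the logits,
# so accuracy sits near 1/C while the top-class confidence is much higher, and
# the gap shows up as a large ECE.
def _example_ece_overconfident() -> None:
    torch.manual_seed(0)
    logits = 5.0 * torch.randn(2000, 10)      # sharpened, overconfident softmax
    labels = torch.randint(0, 10, (2000,))    # unrelated to the logits
    print("ECE:", ece_score(logits, labels, n_bins=15))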


def mce_score(
    logits: torch.Tensor, labels: torch.Tensor, n_bins: int = 10
) -> float:
    """
    Maximum Calibration Error (MCE).
    """
    probs = F.softmax(logits, dim=1).detach().cpu().numpy()
    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = (predictions == labels.detach().cpu().numpy()).astype(float)
    bin_conf, bin_acc, _ = get_bin_stats(confidences, accuracies, n_bins)
    return float(np.max(np.abs(bin_acc - bin_conf)))
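

# Illustrative sketch (not part of the module): MCE reports the worst bin
# rather than the bin-weighted average, so on the same predictions it is never
# smaller than ECE.
def _example_mce_vs_ece() -> None:
    torch.manual_seed(0)
    logits = torch.randn(1000, 5)
    labels = torch.randint(0, 5, (1000,))
    print("ECE:", ece_score(logits, labels), "MCE:", mce_score(logits, labels))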


def classwise_ece(
    logits: torch.Tensor, labels: torch.Tensor, n_bins: int = 10
) -> float:
    """
    Class-wise ECE: average ECE computed separately for each class.
    """
    probs = F.softmax(logits, dim=1).detach().cpu().numpy()
    labels_np = labels.detach().cpu().numpy()
    n_samples, n_classes = probs.shape
    eces = []
    for k in range(n_classes):
        # One-vs-rest reliability for class k: bin all samples by the
        # probability assigned to class k and compare it with the empirical
        # frequency of class k in each bin.
        conf_k = probs[:, k]
        acc_k = (labels_np == k).astype(float)
        bin_conf, bin_acc, weight = get_bin_stats(conf_k, acc_k, n_bins)
        eces.append(np.sum(weight * np.abs(bin_acc - bin_conf)))
    return float(np.mean(eces)) if eces else 0.0
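

# Illustrative usage sketch (not part of the module): class-wise ECE on a
# 3-class problem. Unlike ece_score, which only looks at the top-class
# confidence, this averages a one-vs-rest reliability gap over every class.
def _example_classwise_ece() -> None:
    torch.manual_seed(0)
    logits = torch.randn(600, 3)
    labels = torch.randint(0, 3, (600,))
    print("class-wise ECE:", classwise_ece(logits, labels, n_bins=10))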


def adaptive_ece_score(
    logits: torch.Tensor,
    labels: torch.Tensor,
    n_bins: int = 10,
    norm: str = "l1",
) -> float:
    """
    Adaptive Expected Calibration Error (Nixon et al., 2019).

    Uses equal-mass binning instead of equal-width binning, making it more
    robust to varying confidence distributions.

    Reference:
        Nixon et al., "Measuring Calibration in Deep Learning"
        (CVPR Workshops 2019)

    Args:
        logits: Model logits (N, C)
        labels: True labels (N,)
        n_bins: Number of bins
        norm: Norm to use ('l1' or 'l2')

    Returns:
        Adaptive ECE score
    """
    probs = F.softmax(logits, dim=1).detach().cpu().numpy()
    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = (predictions == labels.detach().cpu().numpy()).astype(float)

    # Sort by confidence
    sorted_indices = np.argsort(confidences)
    confidences_sorted = confidences[sorted_indices]
    accuracies_sorted = accuracies[sorted_indices]

    # Create adaptive bins (equal mass)
    n = len(confidences)
    bin_size = n // n_bins

    ece = 0.0
    for i in range(n_bins):
        start_idx = i * bin_size
        end_idx = (i + 1) * bin_size if i < n_bins - 1 else n
        if start_idx >= end_idx:
            continue
        bin_conf = confidences_sorted[start_idx:end_idx].mean()
        bin_acc = accuracies_sorted[start_idx:end_idx].mean()
        weight = (end_idx - start_idx) / n
        if norm == "l1":
            ece += weight * abs(bin_acc - bin_conf)
        elif norm == "l2":
            ece += weight * (bin_acc - bin_conf) ** 2
        else:
            raise ValueError(f"Unknown norm: {norm}")

    if norm == "l2":
        ece = np.sqrt(ece)

    return float(ece)
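

# Illustrative usage sketch (not part of the module): equal-width vs. equal-mass
# binning on a skewed confidence distribution. Scaling the logits pushes most
# confidences toward 1, leaving many equal-width bins nearly empty, while the
# equal-mass bins used by adaptive_ece_score stay evenly populated.
def _example_adaptive_vs_standard_ece() -> None:
    torch.manual_seed(0)
    logits = 4.0 * torch.randn(2000, 10)
    labels = torch.randint(0, 10, (2000,))
    print("ECE (equal width):", ece_score(logits, labels, n_bins=15))
    print("ECE (equal mass): ", adaptive_ece_score(logits, labels, n_bins=15))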