"""
Calibration methods for LLMs.
Adapt calibration techniques for language model outputs, including
temperature scaling for token distributions and sequence-level calibration.
"""
from __future__ import annotations

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class TokenTemperatureScaling(nn.Module):
"""
Temperature scaling for token-level probabilities.
Applies a learnable temperature parameter to logits before softmax,
making the distribution sharper (T < 1) or smoother (T > 1).
"""
def __init__(self, init_temp: float = 1.0):
"""
Args:
init_temp: Initial temperature value
"""
super().__init__()
self.temperature = nn.Parameter(torch.tensor(init_temp))
def forward(self, logits: torch.Tensor) -> torch.Tensor:
"""
Apply temperature scaling.
Args:
logits: Token logits of shape (..., vocab_size)
Returns:
Temperature-scaled logits
"""
return logits / self.temperature.clamp(min=1e-6)
def fit(
self,
logits: torch.Tensor,
token_ids: torch.Tensor,
lr: float = 0.01,
max_iters: int = 50,
):
"""
Fit temperature on validation data.
Args:
logits: Validation logits of shape (batch, seq_len, vocab_size)
token_ids: True token IDs of shape (batch, seq_len)
lr: Learning rate
            max_iters: Maximum optimization iterations
        Returns:
            self, with the fitted temperature
        """
        # Detach so optimization updates only the temperature; without this,
        # LBFGS's repeated closure calls would backpropagate into the model's
        # graph and fail on the second backward pass.
        logits = logits.detach()
        optimizer = torch.optim.LBFGS([self.temperature], lr=lr, max_iter=max_iters)

        def eval_fn():
            optimizer.zero_grad()
            scaled_logits = self.forward(logits)
            # Flatten (batch, seq_len, vocab) -> (batch * seq_len, vocab) for cross-entropy
            loss = F.cross_entropy(
                scaled_logits.view(-1, scaled_logits.size(-1)), token_ids.view(-1)
            )
            loss.backward()
            return loss

        optimizer.step(eval_fn)
return self
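
# Example usage (a minimal sketch; the shapes and vocab size below are
# illustrative, not part of this module):
#
#     scaler = TokenTemperatureScaling()
#     val_logits = torch.randn(8, 16, 32000)          # (batch, seq_len, vocab_size)
#     val_tokens = torch.randint(0, 32000, (8, 16))   # reference token IDs
#     scaler.fit(val_logits, val_tokens)
#     calibrated_probs = F.softmax(scaler(val_logits), dim=-1)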
class SequenceLengthCalibration:
"""
    Calibrate for length bias in sequence probabilities.
    Longer sequences accumulate lower joint probabilities simply because more
    token probabilities are multiplied together. This adjusts for that bias by
    dividing the sequence log probability by length^alpha.
"""
def __init__(self, alpha: float = 0.6):
"""
Args:
            alpha: Length penalty exponent (common range: 0.5-1.0).
                alpha=0 disables normalization; alpha=1 yields the mean
                per-token log probability.
"""
self.alpha = alpha
def calibrate(
self,
seq_log_prob: torch.Tensor,
seq_length: torch.Tensor,
) -> torch.Tensor:
"""
Apply length normalization.
Args:
seq_log_prob: Log probability of sequence (batch,)
seq_length: Length of sequence (batch,)
Returns:
Length-normalized scores
"""
        # Divide by length^alpha; the clamp guards against zero-length sequences
        normalized = seq_log_prob / (seq_length.float().clamp(min=1.0) ** self.alpha)
return normalized
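
# Example usage (illustrative values; with alpha=1.0 the score is the mean
# per-token log probability):
#
#     calib = SequenceLengthCalibration(alpha=0.6)
#     log_probs = torch.tensor([-12.4, -48.9])   # summed token log-probs
#     lengths = torch.tensor([10, 50])
#     scores = calib.calibrate(log_probs, lengths)
#     # The longer sequence is no longer penalized purely for its length.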
class VerbosityBiasCorrection:
"""
Correct for the model's tendency to be more confident on verbose outputs.
    Some models assign higher confidence to longer, more detailed responses,
    even when those responses are no more accurate.
"""
def __init__(self):
self.mean_confidence = None
self.bin_edges = None
self.bin_mean_confidences = None
def fit(self, lengths: list[int], confidences: list[float]):
"""
Fit correction based on length-confidence relationship.
Args:
lengths: List of response lengths
confidences: List of confidence scores
"""
length_array = np.array(lengths, dtype=float)
conf_array = np.array(confidences, dtype=float)
self.mean_confidence = conf_array.mean()
# Create quartile bins by length
self.bin_edges = np.percentile(length_array, [0, 25, 50, 75, 100])
self.bin_mean_confidences = []
        for i in range(len(self.bin_edges) - 1):
            # Half-open bins [edge_i, edge_{i+1}); the last bin includes its upper edge
            if i < len(self.bin_edges) - 2:
                mask = (length_array >= self.bin_edges[i]) & (
                    length_array < self.bin_edges[i + 1]
                )
            else:
                mask = (length_array >= self.bin_edges[i]) & (
                    length_array <= self.bin_edges[i + 1]
                )
if mask.sum() > 0:
self.bin_mean_confidences.append(conf_array[mask].mean())
else:
self.bin_mean_confidences.append(self.mean_confidence)
def correct(self, length: int, confidence: float) -> float:
"""
Apply verbosity bias correction.
Args:
length: Response length
confidence: Original confidence score
Returns:
Corrected confidence
"""
if self.bin_edges is None:
return confidence
# Find which length bin this falls into
bin_idx = np.digitize(length, self.bin_edges) - 1
bin_idx = np.clip(bin_idx, 0, len(self.bin_mean_confidences) - 1)
# Correct by the ratio of overall mean confidence to bin mean confidence
bin_mean = self.bin_mean_confidences[bin_idx]
if bin_mean > 0:
corrected = confidence * (self.mean_confidence / bin_mean)
else:
corrected = confidence
return max(0.0, min(1.0, corrected))
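
# Example usage (hypothetical validation data; the lists below are illustrative):
#
#     corrector = VerbosityBiasCorrection()
#     corrector.fit(
#         lengths=[12, 85, 40, 200, 33, 150, 60, 95],
#         confidences=[0.72, 0.91, 0.80, 0.95, 0.76, 0.93, 0.83, 0.90],
#     )
#     # A long, confident answer is scaled toward the overall mean confidence.
#     adjusted = corrector.correct(length=180, confidence=0.92)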
class HistogramBinning:
"""
Histogram binning calibration for LLM confidence scores.
Groups predictions by confidence and adjusts to empirical accuracy.
"""
def __init__(self, n_bins: int = 10):
"""
Args:
n_bins: Number of bins for calibration
"""
self.n_bins = n_bins
self.bin_boundaries = None
self.bin_accuracies = None
def fit(self, confidences: torch.Tensor, correctness: torch.Tensor):
"""
Fit binning calibration.
Args:
confidences: Model confidence scores (batch,)
correctness: Binary correctness indicators (batch,)
"""
        confidences = confidences.detach().cpu().numpy()
        correctness = correctness.detach().cpu().numpy()
# Create bins
self.bin_boundaries = np.linspace(0, 1, self.n_bins + 1)
self.bin_accuracies = np.zeros(self.n_bins)
# Compute empirical accuracy in each bin
        for i in range(self.n_bins):
            lower = self.bin_boundaries[i]
            upper = self.bin_boundaries[i + 1]
            if i == self.n_bins - 1:  # Include the upper boundary in the last bin
                in_bin = (confidences >= lower) & (confidences <= upper)
            else:
                in_bin = (confidences >= lower) & (confidences < upper)
if in_bin.sum() > 0:
self.bin_accuracies[i] = correctness[in_bin].mean()
else:
self.bin_accuracies[i] = (lower + upper) / 2 # Default to bin center
def calibrate(self, confidence: float) -> float:
"""
Apply calibration to a confidence score.
Args:
confidence: Original confidence (0-1)
Returns:
Calibrated confidence
"""
if self.bin_boundaries is None:
return confidence
        # Find the bin via np.digitize for consistency with fit():
        # digitize returns i such that bin_boundaries[i-1] <= x < bin_boundaries[i],
        # so subtract 1 for 0-indexed bins and clip to the valid range.
bin_idx = np.digitize(confidence, self.bin_boundaries) - 1
bin_idx = np.clip(bin_idx, 0, self.n_bins - 1)
return float(self.bin_accuracies[bin_idx])
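
# Example usage (a minimal sketch with synthetic data; in practice the
# confidences and correctness labels come from a held-out validation set):
#
#     binner = HistogramBinning(n_bins=10)
#     conf = torch.rand(1000)                       # model confidence scores
#     correct = (torch.rand(1000) < conf).float()   # synthetic correctness labels
#     binner.fit(conf, correct)
#     binner.calibrate(0.87)   # -> empirical accuracy of the [0.8, 0.9) bin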