A comprehensive Python library providing state-of-the-art methods for calibration, out-of-distribution detection, conformal prediction, and uncertainty estimation in deep learning.
Everything you need for uncertainty quantification in modern ML systems
Ensure your model's confidence matches its accuracy
Identify when your model encounters unfamiliar data
Distribution-free uncertainty with coverage guarantees
Abstain from predictions when uncertain
Detect and adapt to changing data distributions
Quantify uncertainty in language model outputs
Probabilistic approaches to neural network uncertainty
Efficiently select the most informative samples to label
Ready-to-use datasets and helper functions
Get started in seconds
# Install with pip
pip install incerto
# Or install from source
git clone https://github.com/steverab/incerto.git
cd incerto
pip install -e .
Simple, intuitive API for all uncertainty quantification tasks
This example demonstrates post-hoc calibration using temperature scaling. We collect model logits on a validation set, fit a temperature parameter to minimize the negative log-likelihood, then apply the calibrated model to test data to obtain better-aligned confidence scores.
import torch
from torch.utils.data import DataLoader
from incerto.calibration import TemperatureScaling, ece_score
# Prepare validation and test data
val_loader = DataLoader(val_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)
# Collect logits from validation set
val_logits, val_labels = [], []
with torch.no_grad():
    for batch_x, batch_y in val_loader:
        logits = model(batch_x)
        val_logits.append(logits)
        val_labels.append(batch_y)
val_logits = torch.cat(val_logits)
val_labels = torch.cat(val_labels)
# Fit temperature scaling on validation set
calibrator = TemperatureScaling()
calibrator.fit(val_logits, val_labels)
# Get calibrated predictions on test set
test_logits, test_labels = [], []
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        logits = model(batch_x)
        test_logits.append(logits)
        test_labels.append(batch_y)
test_logits = torch.cat(test_logits)
test_labels = torch.cat(test_labels)
# Calibrate and evaluate
calibrated_probs = calibrator.predict(test_logits).probs
ece = ece_score(calibrated_probs, test_labels)
print(f"Expected Calibration Error: {ece:.4f}")
This example shows energy-based out-of-distribution detection. We compute energy scores for both in-distribution (e.g., CIFAR-10) and OOD (e.g., SVHN) data, then evaluate detection performance using AUROC to measure how well we can distinguish between the two distributions.
import torch
from torch.utils.data import DataLoader
from incerto.ood import Energy, auroc
# Load your trained model
model.eval()
# Prepare in-distribution and OOD datasets
id_loader = DataLoader(cifar10_test, batch_size=100)
ood_loader = DataLoader(svhn_test, batch_size=100)
# Initialize Energy-based OOD detector
detector = Energy(model, temperature=1.0)
# Compute OOD scores for ID data
id_scores = []
for batch_x, _ in id_loader:
    scores = detector.score(batch_x)
    id_scores.append(scores)
id_scores = torch.cat(id_scores)
# Compute OOD scores for OOD data
ood_scores = []
for batch_x, _ in ood_loader:
    scores = detector.score(batch_x)
    ood_scores.append(scores)
ood_scores = torch.cat(ood_scores)
# Evaluate detection performance
auc = auroc(id_scores, ood_scores)
print(f"OOD Detection AUROC: {auc:.4f}")
# Decision threshold for deployment
threshold = torch.quantile(id_scores, 0.95)
print(f"95th percentile threshold: {threshold:.4f}")
Adaptive Prediction Sets (APS) provide distribution-free coverage guarantees. We calibrate on held-out data to compute a conformal quantile, then generate prediction sets that are guaranteed to contain the true label with probability at least 1 - alpha, regardless of the underlying data distribution.
import torch
from torch.utils.data import DataLoader
from incerto.conformal import aps
# Load your trained classifier
model.eval()
# Prepare calibration and test data
calibration_loader = DataLoader(cal_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)
# Create conformal predictor with 90% coverage guarantee
alpha = 0.1 # 1 - alpha = 90% coverage
predictor = aps(model, calibration_loader, alpha=alpha)
# Generate prediction sets with coverage guarantees
all_sets, all_labels = [], []
for batch_x, batch_y in test_loader:
    pred_sets = predictor(batch_x)
    all_sets.append(pred_sets)
    all_labels.append(batch_y)
# Evaluate coverage and efficiency
coverage = 0
avg_set_size = 0
for sets, labels in zip(all_sets, all_labels):
    coverage += (labels.unsqueeze(1) == sets).any(dim=1).float().mean()
    avg_set_size += sets.size(1)
coverage /= len(all_sets)
avg_set_size /= len(all_sets)
print(f"Coverage: {coverage:.3f} (target: {1-alpha:.3f})")
print(f"Average set size: {avg_set_size:.2f}")
Selective prediction enables models to abstain from low-confidence predictions, trading coverage for accuracy. This example shows confidence-based rejection, where predictions whose confidence falls below a threshold are flagged for human review, improving reliability in safety-critical applications.
import torch
from torch.utils.data import DataLoader
from incerto.sp import SoftmaxThreshold
from incerto.sp.metrics import coverage, risk
# Load your trained model
model.eval()
test_loader = DataLoader(test_dataset, batch_size=100)
# Create selective predictor (wraps your model)
predictor = SoftmaxThreshold(model)
# Make predictions with confidence scores
all_logits, all_confs, all_labels = [], [], []
for batch_x, batch_y in test_loader:
    logits, confs = predictor(batch_x, return_confidence=True)
    all_logits.append(logits)
    all_confs.append(confs)
    all_labels.append(batch_y)
logits = torch.cat(all_logits)
confidences = torch.cat(all_confs)
labels = torch.cat(all_labels)
# Apply rejection threshold
threshold = 0.8
rejected = predictor.reject(confidences, threshold=threshold)
accepted = ~rejected
predictions = logits.argmax(dim=1)
print(f"Rejected: {rejected.sum()}/{len(labels)} "
f"({rejected.float().mean()*100:.1f}%)")
# Compute selective metrics
if accepted.any():
    selective_acc = (predictions[accepted] == labels[accepted]).float().mean()
    print(f"Accuracy on accepted: {selective_acc*100:.2f}%")
cov = coverage(accepted)
risk_val = risk(predictions, labels, accepted)
print(f"Coverage: {cov:.3f}")
print(f"Selective risk: {risk_val:.4f}")
Variational Bayesian Neural Networks learn distributions over weights using Bayes by Backprop. This example shows how to train with the variational loss (combining likelihood and KL divergence) and make predictions with epistemic and aleatoric uncertainty quantification.
import torch
from torch.utils.data import DataLoader
from incerto.bayesian import VariationalBayesNN
# Prepare data
train_loader = DataLoader(train_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=100)
# Create Variational Bayesian NN
# Specify architecture: input_dim, [hidden_sizes], output_dim
vbnn = VariationalBayesNN(
    in_features=784,
    hidden_sizes=[512, 256],
    out_features=10,
    prior_std=1.0
)
# Train with variational loss (likelihood + KL divergence)
optimizer = torch.optim.Adam(vbnn.parameters(), lr=0.001)
for epoch in range(10):
    vbnn.train()
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        # Variational loss with Monte Carlo sampling
        loss = vbnn.variational_loss(batch_x, batch_y, num_samples=10)
        loss.backward()
        optimizer.step()
# Inference with uncertainty quantification
vbnn.eval()
test_x, test_y = next(iter(test_loader))
# Get predictions with variance estimates
with torch.no_grad():
    mean_pred, variance = vbnn.predict(test_x)
print(f"Mean predictions shape: {mean_pred.shape}")
print(f"Average predictive variance: {variance.mean():.4f}")
# Identify high-uncertainty samples
high_unc_mask = variance > variance.quantile(0.9)
print(f"High uncertainty samples: {high_unc_mask.sum()}/{len(test_x)}")
Maximum Mean Discrepancy (MMD) detects distribution shift by comparing kernel embeddings between reference and production data. This example shows how to fit a detector on training data, monitor production batches, and set alert thresholds based on shift magnitude for timely model retraining.
import torch
from torch.utils.data import DataLoader
from incerto.shift import MMDShiftDetector
# Load reference (training) data
reference_loader = DataLoader(train_dataset, batch_size=128)
# Load production data (potentially shifted)
production_loader = DataLoader(production_dataset, batch_size=128)
# Create MMD shift detector with Gaussian kernel
mmd_detector = MMDShiftDetector(sigma=1.0)
# Fit on reference distribution
mmd_detector.fit(reference_loader)
# Compute shift score on production data
shift_score = mmd_detector.score(production_loader)
baseline_score = mmd_detector.score(reference_loader) # Self-test
# Calculate shift ratio
shift_ratio = shift_score / (baseline_score + 1e-10)
print(f"MMD shift score: {shift_score:.6f}")
print(f"Shift ratio: {shift_ratio:.2f}x")
# Alert based on shift magnitude
if shift_ratio > 2.0:
    print("⚠️ CRITICAL: Significant distribution shift detected!")
    print("   Recommendation: Retrain model immediately")
elif shift_ratio > 1.5:
    print("⚠️ WARNING: Moderate shift detected")
    print("   Recommendation: Monitor closely, consider retraining")
else:
    print("✓ No significant shift detected")
# Save detector for production monitoring
mmd_detector.save("mmd_detector.pt")
# Production monitoring workflow
print("\n=== Production Monitoring ===")
print("1. Save detector for reuse")
print("2. Set thresholds based on acceptable degradation")
print("3. Monitor incoming batches periodically")
print("4. Alert when shift_ratio > threshold")
print("5. Retrain or adapt model when alerted")
Semantic entropy clusters multiple LLM generations by meaning and computes entropy over the semantic clusters rather than over exact text matches. This captures uncertainty about what the model is saying rather than how it phrases it, flagging cases where diverse semantic interpretations signal genuine uncertainty about the answer.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from incerto.llm import SemanticEntropy, TokenEntropy
# Load language model and embedding model
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
model.eval()
# Example prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt")
# --- Token-level uncertainty ---
with torch.no_grad():
    outputs = model(**inputs, return_dict=True)
    logits = outputs.logits
token_entropy = TokenEntropy.compute(logits)
print(f"Average token entropy: {token_entropy.mean():.4f}")
# --- Semantic Entropy: cluster semantically equivalent responses ---
num_samples = 10
responses = []
for _ in range(num_samples):
    output_ids = model.generate(
        **inputs,
        max_length=50,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        num_return_sequences=1
    )
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    responses.append(response)
# Compute semantic entropy with embedding model
semantic_unc = SemanticEntropy.compute(
    responses,
    similarity_threshold=0.85,
    embedding_model=embedding_model  # Pass model object, not string!
)
print(f"Semantic entropy: {semantic_unc['semantic_entropy']:.4f}")
print(f"Number of semantic clusters: {semantic_unc['num_clusters']}")
# High semantic entropy indicates uncertainty
if semantic_unc['semantic_entropy'] > 1.5:
    print("⚠️ High uncertainty: Model gives diverse semantic answers")
else:
    print("✓ Low uncertainty: Responses are semantically consistent")
Consistent interface across all uncertainty quantification methods
Built on PyTorch for seamless integration with your models
Implementations based on peer-reviewed publications
411 tests ensuring reliability and correctness
Simple API that doesn't sacrifice functionality
MIT licensed, free for commercial and research use
Join the community making ML models more trustworthy