conformalize matrix interface

library(misc)

Example: Conformal Prediction with Out-of-Sample Coverage

In this example, we demonstrate how to use the conformalize function to perform conformal prediction and calculate the out-of-sample coverage rate.

Simulated Data

We will generate a simple dataset for demonstration purposes.

set.seed(123)
n <- 200
x <- matrix(runif(n * 2), ncol = 2)
y <- 3 * x[, 1] + 2 * x[, 2] + rnorm(n, sd = 0.5)
data <- data.frame(x1 = x[, 1], x2 = x[, 2], y = y)

Fit Conformal Model

We will use a linear model (lm) as the fit_func and its corresponding predict function as the predict_func.

library(stats)

# Define fit and predict functions
fit_func <- function(x, y, ...)
{
  df <- data.frame(y=y, x) # naming of columns is mandatory for `predict`
  print(head(df))
  ranger::ranger(y ~ ., data=df, ...)
}

predict_func <- function(obj, newx)
{
  colnames(newx) <- paste0("X", 1:ncol(newx)) # mandatory, linked to df in fit_func
  predict(object=obj, data=newx)$predictions # only accepts a named newx
}

# Apply conformalize
conformal_model <- misc::conformalize(  
  x = x,
  y = y,
  fit_func = fit_func,
  predict_func = predict_func,
  split_ratio = 0.8,
  seed = 123
)

##           y        X1        X2
## 1 0.7480449 0.4447680 0.3233450
## 2 3.2344670 0.8746823 0.3087868
## 3 2.8086211 0.5726334 0.6743764
## 4 2.7359976 0.9373141 0.1054177
## 5 1.5328762 0.2656867 0.4831677
## 6 4.2696182 0.8578277 0.7267025
## 
## [1] "cal_pred's value:"
##  [1] 2.0538780 0.5313949 1.5257631 2.3368574 3.0647686 3.5414238 1.4502907
##  [8] 1.6432366 2.5634529 3.3206897 3.6895134 3.7785731 3.4859439 2.2795135
## [15] 2.4925830 2.5483291 2.5492368 2.2207192 1.2682567 4.3771387 3.6968402
## [22] 2.7656325 1.8965185 2.7979252 2.6517687 2.2081951 2.5633512 3.7216115
## [29] 3.5146270 1.5525687 1.5377277 1.4910050 3.8124938 2.6913890 3.1775352
## [36] 2.3588660 2.0112045 3.4540548 3.5667044 0.7684528
## 
## 
## [1] "cal_y's value:"
##  [1]  1.68717364 -0.09215227  0.58253361  2.00752825  2.54432351  3.26354598
##  [7]  0.99154794  1.44649451  2.40032994  3.38409901  3.57146868  3.58620284
## [13]  4.62334939  3.27237760  2.61681617  3.39143939  2.64662691  3.47085590
## [19]  0.64098595  3.79594615  3.79846191  2.04948301  0.74535452  2.65303132
## [25]  2.69533179  3.30664012  3.38299733  3.13378986  1.93853703  1.94673711
## [31]  0.55729626  0.72078670  3.35304759  3.32291148  2.43114655  3.61021172
## [37]  1.69188315  3.56261005  3.32744997  0.06252609

Generate Predictions and Prediction Intervals

We will use the predict.conformalize method to generate predictions and calculate prediction intervals.

# New data for prediction
new_data <- data.frame(X1 = runif(50), X2 = runif(50))

# Predict with split conformal method
predictions <- predict(
  conformal_model,
  newdata = new_data,
  level = 0.95,
  method = "split",
  predict_func = predict_func
)

## 
## [1] "object's value:"
## $fit
## Ranger result
## 
## Call:
##  ranger::ranger(y ~ ., data = df, ...) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      160 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.282542 
## R squared (OOB):                  0.7729892 
## 
## $residuals
##  [1] -0.36670438 -0.62354715 -0.94322944 -0.32932915 -0.52044507 -0.27787784
##  [7] -0.45874279 -0.19674210 -0.16312291  0.06340930 -0.11804475 -0.19237025
## [13]  1.13740549  0.99286412  0.12423312  0.84311025  0.09739013  1.25013675
## [19] -0.62727075 -0.58119252  0.10162169 -0.71614946 -1.15116396 -0.14489392
## [25]  0.04356308  1.09844505  0.81964617 -0.58782163 -1.57608992  0.39416843
## [31] -0.98043147 -0.77021829 -0.45944617  0.63152251 -0.74638863  1.25134569
## [37] -0.31932130  0.10855526 -0.23925438 -0.70592671
## 
## $sd_residuals
## [1] 0.6908862
## 
## $scaled_residuals
## [1] -0.1750762
## 
## attr(,"class")
## [1] "conformalize"

head(predictions)

##            fit        lwr      upr
## [1,] 0.6601584 -0.5900388 1.910356
## [2,] 0.9298264 -0.3203708 2.180024
## [3,] 2.1405602  0.8903630 3.390757
## [4,] 0.7477106 -0.5024866 1.997908
## [5,] 2.8979678  1.6477706 4.148165
## [6,] 0.7849812 -0.4652160 2.035178

residuals(conformal_model)

##  [1] -0.36670438 -0.62354715 -0.94322944 -0.32932915 -0.52044507 -0.27787784
##  [7] -0.45874279 -0.19674210 -0.16312291  0.06340930 -0.11804475 -0.19237025
## [13]  1.13740549  0.99286412  0.12423312  0.84311025  0.09739013  1.25013675
## [19] -0.62727075 -0.58119252  0.10162169 -0.71614946 -1.15116396 -0.14489392
## [25]  0.04356308  1.09844505  0.81964617 -0.58782163 -1.57608992  0.39416843
## [31] -0.98043147 -0.77021829 -0.45944617  0.63152251 -0.74638863  1.25134569
## [37] -0.31932130  0.10855526 -0.23925438 -0.70592671

Calculate Out-of-Sample Coverage Rate

The coverage rate is the proportion of true values that fall within the prediction intervals.

# Simulate true values for the new data
true_y <- 3 * new_data$x1 + 2 * new_data$x2 + rnorm(50, sd = 0.5)

# Check if true values fall within the prediction intervals
coverage <- mean(true_y >= predictions[, "lwr"] & true_y <= predictions[, "upr"])

cat("Out-of-sample coverage rate:", coverage)

## Out-of-sample coverage rate: NaN

Results

The prediction intervals are calculated using the split conformal method.
The out-of-sample coverage rate is displayed, which should be close to the specified confidence level (e.g., 0.95).

Example: Conformal Prediction with the `MASS::Boston` Dataset

In this example, we use the MASS::Boston dataset to demonstrate conformal prediction.

Load the Data

We will use the MASS package to access the Boston dataset.

library(MASS)

# Load the Boston dataset
data(Boston)

# Inspect the dataset
head(Boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

Split the Data

We will split the data into training and test sets to ensure they are disjoint.

set.seed(123)
n <- nrow(MASS::Boston)
train_indices <- sample(seq_len(n), size = floor(0.8 * n))
train_data <- MASS::Boston[train_indices, ]
test_data <- MASS::Boston[-train_indices, ]

Fit Conformal Model

predict_func <- function(obj, newx)
{
  predict(object=obj, data=newx)$predictions # only accepts a named newx
}


# Apply conformalize using the training data
conformal_model_boston <- misc::conformalize(
  x = as.matrix(train_data[, -which(names(train_data) == "medv")]),
  y = train_data$medv,
  fit_func = fit_func,
  predict_func = predict_func,
  seed = 123
)

##        y    crim   zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 11  15.0 0.22489 12.5  7.87    0 0.524 6.377 94.3 6.3467   5 311    15.2 392.52
## 153 15.3 1.12658  0.0 19.58    1 0.871 5.012 88.0 1.6102   5 403    14.7 343.28
## 10  18.9 0.17004 12.5  7.87    0 0.524 6.004 85.9 6.5921   5 311    15.2 386.71
## 397 12.5 5.87205  0.0 18.10    0 0.693 6.405 96.0 1.6768  24 666    20.2 396.90
## 362 19.9 3.83684  0.0 18.10    0 0.770 6.251 91.1 2.2955  24 666    20.2 350.65
## 35  13.5 1.61282  0.0  8.14    0 0.538 6.096 96.9 3.7598   4 307    21.0 248.31
##     lstat
## 11  20.45
## 153 12.12
## 10  17.10
## 397 19.37
## 362 14.19
## 35  20.34
## 
## [1] "cal_pred's value:"
##   [1] 10.419252 20.448372 27.224677 10.898150 20.123587 17.793151 31.526340
##   [8] 22.391453 35.789190 11.665246 25.236577 21.457890 20.277560 19.945675
##  [15] 20.844042 38.007683 19.701472 24.895207 27.314342 21.082500 14.371342
##  [22] 24.222233 16.660823 21.905261 28.799110 15.444499 24.407960 21.550467
##  [29] 22.330151 24.106895 17.826588 18.560532 22.959393 34.590640 14.865118
##  [36] 20.339177 19.819318 24.210000 33.370880 21.636240 25.011786 34.424087
##  [43] 19.416483 13.969285 29.060077 30.809290 16.560430 30.772120 25.188883
##  [50] 13.930568 18.371245 32.221203 12.333186 22.143470 23.763023 23.666013
##  [57] 17.002369 25.055677 22.993030 39.054477 23.336043 15.914310 18.324967
##  [64] 25.350043 21.407187 36.351613 21.562164 10.454903 27.878533 18.113082
##  [71] 30.653660 16.295641 23.167847 24.010590 25.177501 25.061933 18.591062
##  [78] 23.960330 12.537551 20.268659 23.398423 13.123332 28.750127 15.631447
##  [85] 43.982997 15.656024 25.782773 19.002122 20.605008 22.407643 27.984863
##  [92] 10.373169 11.670001 11.828249 28.181570 22.810380 31.332937 12.283840
##  [99] 25.592183 34.371947 15.904249 15.087963 18.328225 21.934827 13.236408
## [106] 23.679770 24.285343 45.620510 24.629900 19.165807 18.981140 25.674893
## [113] 19.713469 26.392217 17.045184 18.102572 19.649490 30.833157 10.368585
## [120] 24.932007 22.689930 44.005293 20.775961 16.112724 12.266196 20.558970
## [127] 27.911280 22.765835 28.884250 26.359113 19.584089 14.891015 21.644770
## [134] 31.241963 31.702887 35.185103 23.137040 21.628967 17.497420 14.282768
## [141] 14.577078 24.385645 10.148723 22.377710 21.797343 21.362853  9.545621
## [148] 20.645150 16.658624 25.468693 14.173340 23.654933 13.719573 22.469270
## [155] 20.238633 24.431423 43.391260 20.392460 21.033908 30.354687 22.265293
## [162] 22.718777 15.856220 10.036760 26.145921 19.103820 20.652243 21.881643
## [169] 24.579362 19.982683 25.319243 21.965685 13.186102 34.058633 19.401634
## [176] 42.328435 11.475717 24.782687 20.371097 27.915230 10.845967 13.301828
## [183] 19.002512 32.118284 24.504970 35.727872 33.880980 35.471110 21.034690
## [190] 25.421736 16.085362 44.463493 33.370627 22.705393 18.015633 18.985264
## [197] 17.514547 15.174432 20.548163 20.660860 11.102269 26.035857
## 
## 
## [1] "cal_y's value:"
##   [1]  7.0 19.5 29.9  8.3 19.2 20.6 28.7 20.9 33.3  8.1 23.1 18.2 22.9 16.8 20.6
##  [16] 42.8 21.7 25.3 50.0 17.1 11.7 22.6 15.2 15.0 22.8 15.6 24.8 21.7 21.4 22.2
##  [31] 20.0 20.1 23.9 34.9 13.6 16.2 19.9 25.0 32.7 24.7 23.3 33.4 21.7 17.2 28.2
##  [46] 23.6 15.6 28.5 22.0 13.1 19.4 41.3  8.5 19.4 20.7 23.9 18.1 24.6 24.0 38.7
##  [61] 28.1 15.7 19.3 24.4 20.3 31.7 27.5  7.2 26.6 11.9 35.1 19.4 26.5 21.9 24.6
##  [76] 25.2 20.1 23.2 12.8 18.9 19.8 12.0 32.5 14.9 50.0 14.1 24.1 18.2 22.4 23.2
##  [91] 26.6  8.5 10.2 12.7 31.1 24.5 34.9 13.9 26.2 37.3 14.3 13.2 19.6 23.4  9.5
## [106] 22.3 26.4 50.0 22.2 23.7 19.8 24.7 20.1 29.1 18.0 19.5 21.4 50.0  5.0 20.1
## [121] 20.6 44.8 21.2 15.2  6.3 18.7 29.4 23.0 37.0 25.0 19.3 14.9 21.0 30.1 32.4
## [136] 33.8 21.6 21.1 19.7 13.8 12.6 23.0  8.8 22.8 18.9 22.5  5.0 21.7 16.4 22.9
## [151] 12.7 22.8 13.4 19.4 20.3 22.0 48.3 18.5 19.6 29.0 20.4 21.6 18.4 10.5 15.0
## [166] 22.5 21.2 22.6 27.1 21.5 23.9 23.8 17.9 31.6 20.8 50.0  8.8 23.7 18.3 30.5
## [181]  7.2 16.3 19.4 30.1 21.4 37.9 33.2 35.2 21.8 28.6 17.8 37.6 34.9 19.3 15.6
## [196] 19.2 27.1 19.1 20.9 22.4 10.9 36.2

Generate Predictions and Prediction Intervals

We will use the predict.conformalize method to generate predictions and calculate prediction intervals for the test set.

# Predict with split conformal method on the test data
predictions_boston <- predict(
  conformal_model_boston,
  newdata = as.matrix(test_data),
  level = 0.95,
  method = "split",
  predict_func = predict_func
)

## 
## [1] "object's value:"
## $fit
## Ranger result
## 
## Call:
##  ranger::ranger(y ~ ., data = df, ...) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      202 
## Number of independent variables:  13 
## Mtry:                             3 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       13.55259 
## R squared (OOB):                  0.8533691 
## 
## $residuals
##   [1]  -3.419252485  -0.948371904   2.675323333  -2.598149629  -0.923586667
##   [6]   2.806848571  -2.826340000  -1.491453333  -2.489190000  -3.565245897
##  [11]  -2.136576667  -3.257890000   2.622440000  -3.145675238  -0.244042279
##  [16]   4.792316667   1.998528333   0.404793333  22.685658431  -3.982500000
##  [21]  -2.671341579  -1.622233333  -1.460822619  -6.905260537  -5.999110000
##  [26]   0.155501429   0.392040000   0.149533333  -0.930151111  -1.906894762
##  [31]   2.173411667   1.539468254   0.940606667   0.309360000  -1.265117857
##  [36]  -4.139176667   0.080682063   0.790000000  -0.670880000   3.063760000
##  [41]  -1.711786190  -1.024086667   2.283516667   3.230715208  -0.860076667
##  [46]  -7.209290000  -0.960430079  -2.272120000  -3.188883333  -0.830568072
##  [51]   1.028754762   9.078797302  -3.833185873  -2.743470000  -3.063023333
##  [56]   0.233986667   1.097631190  -0.455676667   1.006970000  -0.354476667
##  [61]   4.763956667  -0.214310000   0.975033333  -0.950043333  -1.107186667
##  [66]  -4.651613333   5.937836459  -3.254903344  -1.278533333  -6.213082086
##  [71]   4.446340000   3.104358571   3.332153333  -2.110590000  -0.577501429
##  [76]   0.138066667   1.508937921  -0.760330000   0.262449105  -1.368658889
##  [81]  -3.598423333  -1.123331889   3.749873333  -0.731447167   6.017003333
##  [86]  -1.556023540  -1.682773333  -0.802122381   1.794991905   0.792356667
##  [91]  -1.384863333  -1.873168579  -1.470001346   0.871751192   2.918430000
##  [96]   1.689620000   3.567063333   1.616160292   0.607816667   2.928053333
## [101]  -1.604249246  -1.887962857   1.271774740   1.465173333  -3.736407921
## [106]  -1.379770000   2.114656667   4.379490000  -2.429900000   4.534193333
## [111]   0.818860000  -0.974893333   0.386531429   2.707783333   0.954815952
## [116]   1.397428095   1.750509706  19.166842727  -5.368585249  -4.832006667
## [121]  -2.089930000   0.794706667   0.424038571  -0.912724448  -5.966195684
## [126]  -1.858970000   1.488720000   0.234165000   8.115750000  -1.359113333
## [131]  -0.284088571   0.008984731  -0.644770000  -1.141963333   0.697113333
## [136]  -1.385102684  -1.537040000  -0.528966667   2.202580000  -0.482767638
## [141]  -1.977078277  -1.385644762  -1.348723163   0.422290000  -2.897343333
## [146]   1.137146667  -4.545621403   1.054850000  -0.258624034  -2.568693333
## [151]  -1.473339524  -0.854933333  -0.319572550  -3.069270000   0.061366667
## [156]  -2.431423333   4.908740000  -1.892460000  -1.433907712  -1.354686667
## [161]  -1.865293333  -1.118776667   2.543779556   0.463239708 -11.145920805
## [166]   3.396180000   0.547756667   0.718356667   2.520637619   1.517316667
## [171]  -1.419243333   1.834314921   4.713897526  -2.458633333   1.398365831
## [176]   7.671565411  -2.675716524  -1.082686528  -2.071096667   2.584770000
## [181]  -3.645966658   2.998171877   0.397487619  -2.018283636  -3.104970000
## [186]   2.172127619  -0.680980000  -0.271110000   0.765309626   3.178264286
## [191]   1.714637993  -6.863493333   1.529373333  -3.405393333  -2.415632886
## [196]   0.214736190   9.585452619   3.925567955   0.351836667   1.739140000
## [201]  -0.202268616  10.164143333
## 
## $sd_residuals
## [1] 3.624106
## 
## $scaled_residuals
## [1] 0.01260864
## 
## attr(,"class")
## [1] "conformalize"

head(predictions_boston)

##           fit       lwr      upr
## [1,] 27.03134 20.200364 33.86231
## [2,] 19.20299 12.372013 26.03396
## [3,] 21.34472 14.513743 28.17569
## [4,] 18.77455 11.943577 25.60552
## [5,] 15.60764  8.776669 22.43861
## [6,] 21.31355 14.482574 28.14452

residuals(conformal_model)

##  [1] -0.36670438 -0.62354715 -0.94322944 -0.32932915 -0.52044507 -0.27787784
##  [7] -0.45874279 -0.19674210 -0.16312291  0.06340930 -0.11804475 -0.19237025
## [13]  1.13740549  0.99286412  0.12423312  0.84311025  0.09739013  1.25013675
## [19] -0.62727075 -0.58119252  0.10162169 -0.71614946 -1.15116396 -0.14489392
## [25]  0.04356308  1.09844505  0.81964617 -0.58782163 -1.57608992  0.39416843
## [31] -0.98043147 -0.77021829 -0.45944617  0.63152251 -0.74638863  1.25134569
## [37] -0.31932130  0.10855526 -0.23925438 -0.70592671

Calculate Out-of-Sample Coverage Rate 1

The coverage rate is the proportion of true values in the test set that fall within the prediction intervals.

# True values for the test set
true_y_boston <- test_data$medv

# Check if true values fall within the prediction intervals
coverage_boston <- mean(true_y_boston >= predictions_boston[, "lwr"] & true_y_boston <= predictions_boston[, "upr"])

cat("Out-of-sample coverage rate for Boston dataset:", coverage_boston)

## Out-of-sample coverage rate for Boston dataset: 0.9313725

Example: Conformal Prediction with Out-of-Sample Coverage

Simulated Data

Fit Conformal Model

Generate Predictions and Prediction Intervals

Calculate Out-of-Sample Coverage Rate

Results

Example: Conformal Prediction with the MASS::Boston Dataset

Load the Data

Split the Data

Fit Conformal Model

Generate Predictions and Prediction Intervals

Calculate Out-of-Sample Coverage Rate 1

Example: Conformal Prediction with the `MASS::Boston` Dataset