mlreserving
Machine learning based (longitudinal) reserving model
1""" 2Machine learning based (longitudinal) reserving model 3""" 4 5__version__ = "0.2.0" 6 7# Import main classes/functions that should be available at package level 8# from .module_name import ClassName, function_name 9 10from .ml_reserving import MLReserving 11from .utils import triangle_to_df, df_to_triangle 12 13__all__ = ["MLReserving", "triangle_to_df", "df_to_triangle", "__version__"]
Source of the `MLReserving` class:

```python
from collections import namedtuple
from copy import deepcopy

import numpy as np
import pandas as pd
from numpy import arcsinh
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Imported from the package's own modules/dependencies:
# - PredictionInterval: conformal prediction wrapper used below
#   (fit/predict with return_pi=True)
# - inv_arcsinh: inverse of arcsinh (presumably numpy.sinh)
# - df_to_triangle: re-exported from .utils (source shown further below)


class MLReserving:
    """
    Machine learning based reserving model

    Parameters
    ----------
    model : object, optional
        Model to use (must implement fit and predict methods); default is RidgeCV

    level : float
        Confidence level for prediction intervals. Default is 95,
        equivalent to a miscoverage error of 5 (%)

    replications : int
        Number of replications for simulated conformal prediction
        (default is `None`), used when type_pi is "bootstrap" or "kde"

    conformal_method : str
        Conformal prediction method: "splitconformal" or "localconformal"

    type_pi : str
        Type of prediction interval: currently `None` (split conformal
        prediction without simulation), "kde" or "bootstrap"

    use_factors : bool, default=False
        Whether to treat origin and development years as categorical variables

    random_state : int, default=42
        Random state for reproducibility
    """

    def __init__(self,
                 model=None,
                 level=95,
                 replications=None,
                 conformal_method="splitconformal",
                 type_pi=None,
                 use_factors=False,
                 random_state=42):
        if model is None:
            model = RidgeCV(alphas=[10**i for i in range(-5, 5)])
        assert conformal_method in ("splitconformal", "localconformal"), \
            "must have conformal_method in ('splitconformal', 'localconformal')"
        self.conformal_method = conformal_method
        self.model = PredictionInterval(model, level=level,
                                        type_pi=type_pi,
                                        type_split="sequential",
                                        method=conformal_method,
                                        replications=replications)
        self.level = level
        self.replications = replications
        self.type_pi = type_pi
        self.use_factors = use_factors
        self.origin_col = None
        self.development_col = None
        self.value_col = None
        self.max_dev = None
        self.origin_years = None
        self.cumulated = None
        self.latest_ = None
        self.ultimate_ = None
        self.ultimate_lower_ = None
        self.ultimate_upper_ = None
        self.ibnr_mean_ = None
        self.ibnr_lower_ = None
        self.ibnr_upper_ = None
        self.X_test_ = None
        self.full_data_ = None
        self.full_data_upper_ = None
        self.full_data_lower_ = None
        self.full_data_sims_ = []
        self.scaler = StandardScaler()
        self.origin_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.dev_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    def fit(self, data, origin_col="origin",
            development_col="development",
            value_col="values",
            cumulated=True):
        """
        Fit the model to the triangle data

        Parameters
        ----------
        data : pandas.DataFrame
            Input data with origin, development, and value columns

        origin_col : str, default="origin"
            Name of the origin year column

        development_col : str, default="development"
            Name of the development year column

        value_col : str, default="values"
            Name of the value column

        cumulated : bool, default=True
            Whether the input triangle is cumulative

        Returns
        -------
        self : object
            Returns self
        """
        # Store column names
        self.origin_col = origin_col
        self.development_col = development_col
        self.value_col = value_col
        self.cumulated = cumulated

        df = data.copy()
        df["dev"] = df[development_col] - df[origin_col] + 1
        df["calendar"] = df[origin_col] + df["dev"] - 1
        df = df.sort_values([origin_col, "dev"])
        # If data is cumulative, convert to incremental first:
        # the first increment of each origin year equals its first cumulative value
        if self.cumulated:
            df[value_col] = df.groupby(origin_col)[value_col].diff().fillna(df[value_col])

        self.max_dev = df["dev"].max()
        self.origin_years = df[origin_col].unique()

        # Create full grid of all possible (origin, dev) combinations
        full_grid = pd.MultiIndex.from_product(
            [self.origin_years, range(1, self.max_dev + 1)],
            names=[origin_col, "dev"]
        ).to_frame(index=False)

        # Merge with original data
        full_data = pd.merge(
            full_grid,
            df[[origin_col, "dev", value_col]],
            on=[origin_col, "dev"],
            how="left"
        )

        # Calculate calendar year
        full_data["calendar"] = full_data[origin_col] + full_data["dev"] - 1

        # Latest observed value for each origin year
        self.latest_ = full_data.groupby(origin_col)[value_col].last()

        # Apply transformations
        if self.use_factors:
            # One-hot encode origin and development years
            origin_encoded = self.origin_encoder.fit_transform(full_data[[origin_col]])
            dev_encoded = self.dev_encoder.fit_transform(full_data[["dev"]])

            # Create feature names for the encoded columns
            origin_feature_names = [f"origin_{year}" for year in self.origin_years]
            dev_feature_names = [f"dev_{i}" for i in range(1, self.max_dev + 1)]

            # Add encoded features to the dataframe
            full_data = pd.concat([
                full_data,
                pd.DataFrame(origin_encoded, columns=origin_feature_names, index=full_data.index),
                pd.DataFrame(dev_encoded, columns=dev_feature_names, index=full_data.index)
            ], axis=1)

            # Add (log) calendar year as a feature
            full_data["log_calendar"] = np.log(full_data["calendar"])
            feature_cols = origin_feature_names + dev_feature_names + ["log_calendar"]
        else:
            # Use log transformations
            full_data["log_origin"] = np.log(full_data[origin_col])
            full_data["log_dev"] = np.log(full_data["dev"])
            full_data["log_calendar"] = np.log(full_data["calendar"])
            feature_cols = ["log_origin", "log_dev", "log_calendar"]

        # Transform the response where it is not NaN
        full_data[f"arcsinh_{value_col}"] = full_data[value_col].apply(
            lambda x: arcsinh(x) if pd.notnull(x) else x
        )

        full_data["to_predict"] = full_data[value_col].isna()

        self.full_data_ = deepcopy(full_data)
        self.full_data_lower_ = deepcopy(full_data)
        self.full_data_upper_ = deepcopy(full_data)

        train_data = full_data[~full_data["to_predict"]]
        test_data = full_data[full_data["to_predict"]]

        # Prepare features for training
        X_train = train_data[feature_cols].values
        X_test = test_data[feature_cols].values

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.X_test_ = self.scaler.transform(X_test)

        y_train = train_data[f"arcsinh_{value_col}"].values

        self.model.fit(X_train_scaled, y_train)

        return self

    def predict(self):
        """
        Make predictions for the missing values in the triangle

        Returns
        -------
        DescribeResult
            Named tuple containing mean, lower, and upper triangles
        """
        preds = self.model.predict(self.X_test_, return_pi=True)

        to_predict = self.full_data_["to_predict"]

        # Transform predictions back to the original scale
        mean_pred = inv_arcsinh(preds.mean)
        lower_pred = inv_arcsinh(preds.lower)
        upper_pred = inv_arcsinh(preds.upper)

        # Store predictions in the full data
        self.full_data_.loc[to_predict, self.value_col] = mean_pred
        self.full_data_lower_.loc[to_predict, self.value_col] = lower_pred
        self.full_data_upper_.loc[to_predict, self.value_col] = upper_pred

        # Calculate IBNR based on predicted values (in incremental form)
        test_data = self.full_data_[to_predict]

        # Group by origin year and sum predictions
        self.ibnr_mean_ = test_data.groupby(self.origin_col)[self.value_col].sum()
        self.ibnr_lower_ = self.full_data_lower_[to_predict].groupby(self.origin_col)[self.value_col].sum()
        self.ibnr_upper_ = self.full_data_upper_[to_predict].groupby(self.origin_col)[self.value_col].sum()

        # If the data was originally cumulative, convert predictions back to cumulative
        if self.cumulated:
            for df in [self.full_data_, self.full_data_lower_, self.full_data_upper_]:
                df[self.value_col] = df.groupby(self.origin_col)[self.value_col].cumsum()

        # Build triangles using the utility function
        mean_triangle = df_to_triangle(
            self.full_data_,
            origin_col=self.origin_col,
            development_col="dev",
            value_col=self.value_col
        )
        lower_triangle = df_to_triangle(
            self.full_data_lower_,
            origin_col=self.origin_col,
            development_col="dev",
            value_col=self.value_col
        )
        upper_triangle = df_to_triangle(
            self.full_data_upper_,
            origin_col=self.origin_col,
            development_col="dev",
            value_col=self.value_col
        )

        # Calculate ultimate values
        if self.cumulated:
            # For cumulative data, the ultimate is the last value in each origin year
            self.ultimate_ = self.full_data_.groupby(self.origin_col)[self.value_col].last()
            self.ultimate_lower_ = self.full_data_lower_.groupby(self.origin_col)[self.value_col].last()
            self.ultimate_upper_ = self.full_data_upper_.groupby(self.origin_col)[self.value_col].last()
        else:
            # For incremental data, the ultimate is latest + IBNR
            self.ultimate_ = self.latest_ + self.ibnr_mean_
            self.ultimate_lower_ = self.latest_ + self.ibnr_lower_
            self.ultimate_upper_ = self.latest_ + self.ibnr_upper_

        DescribeResult = namedtuple("DescribeResult",
                                    ("mean", "lower", "upper"))
        return DescribeResult(mean_triangle.T,
                              lower_triangle.T,
                              upper_triangle.T)

    def get_ibnr(self):
        """
        Get the IBNR (Incurred But Not Reported) values for each origin year

        Returns
        -------
        DescribeResult
            Named tuple of IBNR values (mean, lower, upper),
            each a pandas.Series indexed by origin year
        """
        if self.ibnr_mean_ is None:
            raise ValueError("Model must be fitted and predict() must be called before getting IBNR values")

        DescribeResult = namedtuple("DescribeResult",
                                    ("mean", "lower", "upper"))

        return DescribeResult(self.ibnr_mean_, self.ibnr_lower_, self.ibnr_upper_)

    def get_latest(self):
        """
        Get the latest known values for each origin year

        Returns
        -------
        pandas.Series
            Latest known values indexed by origin year
        """
        if self.latest_ is None:
            raise ValueError("Model must be fitted before getting latest values")
        return self.latest_

    def get_ultimate(self):
        """
        Get the ultimate loss estimates for each origin year

        Returns
        -------
        DescribeResult
            Named tuple of ultimate loss estimates (mean, lower, upper),
            each a pandas.Series indexed by origin year
        """
        if self.ultimate_ is None:
            raise ValueError("Model must be fitted before getting ultimate values")

        DescribeResult = namedtuple("DescribeResult",
                                    ("mean", "lower", "upper"))

        return DescribeResult(self.ultimate_,
                              self.ultimate_lower_,
                              self.ultimate_upper_)

    def get_summary(self):
        """
        Get a summary of reserving results including latest values,
        ultimate estimates, and IBNR values with confidence intervals.

        Returns
        -------
        dict
            Dictionary containing two keys:
            - 'ByOrigin': DataFrame with results by origin year
            - 'Totals': Series with total values
        """
        if self.ultimate_ is None:
            raise ValueError("Model must be fitted before getting summary")

        # Get latest, ultimate, and IBNR values
        latest = self.get_latest()
        ultimate = self.get_ultimate()
        ibnr = self.get_ibnr()

        # Create summary by origin year
        summary_by_origin = pd.DataFrame({
            'Latest': latest,
            'Mean Ultimate': ultimate.mean,
            'Mean IBNR': ibnr.mean,
            f'IBNR {self.level}%': ibnr.upper,
            f'Ultimate Lo{self.level}': ultimate.lower,
            f'Ultimate Hi{self.level}': ultimate.upper
        })

        # Calculate totals across origin years
        totals = pd.Series({
            'Latest': latest.sum(),
            'Mean Ultimate': ultimate.mean.sum(),
            'Mean IBNR': ibnr.mean.sum(),
            f'Total IBNR {self.level}%': ibnr.upper.sum()
        })

        return {
            'ByOrigin': summary_by_origin,
            'Totals': totals
        }
```
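Note that the model is fitted on arcsinh-transformed incremental values. Unlike a log transform, arcsinh is defined at zero and for negative increments, both of which can occur in incremental triangles. A minimal sketch of the transform pair, assuming the package's `inv_arcsinh` helper is equivalent to `numpy.sinh`:

```python
import numpy as np

x = np.array([-50.0, 0.0, 1000.0])   # incremental values, possibly negative
y = np.arcsinh(x)                    # arcsinh(x) = log(x + sqrt(x**2 + 1))
assert np.allclose(np.sinh(y), x)    # sinh inverts arcsinh exactly
```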
Machine learning based reserving model.

Parameters

model : object, optional
    Model to use (must implement `fit` and `predict` methods); default is `RidgeCV`.

level : float
    Confidence level for prediction intervals. Default is 95, equivalent to a
    miscoverage error of 5 (%).

replications : int
    Number of replications for simulated conformal prediction (default is `None`),
    used when `type_pi` is "bootstrap" or "kde".

conformal_method : str
    Conformal prediction method: "splitconformal" or "localconformal".

type_pi : str
    Type of prediction interval: currently `None` (split conformal prediction
    without simulation), "kde", or "bootstrap".

use_factors : bool, default=False
    Whether to treat origin and development years as categorical variables.

random_state : int, default=42
    Random state for reproducibility.
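A minimal end-to-end sketch, assuming a cumulative triangle in long format; the column names match the defaults of `fit`, and the figures are illustrative only:

```python
import pandas as pd
from mlreserving import MLReserving

# Toy cumulative triangle in long format: one row per
# (origin year, development year, cumulative amount)
data = pd.DataFrame({
    "origin":      [2020, 2020, 2020, 2021, 2021, 2022],
    "development": [2020, 2021, 2022, 2021, 2022, 2022],
    "values":      [100.0, 150.0, 175.0, 110.0, 165.0, 120.0],
})

model = MLReserving(level=95)            # RidgeCV with split conformal by default
model.fit(data, cumulated=True)

res = model.predict()                    # mean/lower/upper completed triangles
print(res.mean)
print(model.get_summary()["ByOrigin"])   # Latest, Mean Ultimate, Mean IBNR, bounds
```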
MLReserving.fit(data, origin_col="origin", development_col="development", value_col="values", cumulated=True)
Fit the model to the triangle data.

Parameters

data : pandas.DataFrame
    Input data with origin, development, and value columns.

origin_col : str, default="origin"
    Name of the origin year column.

development_col : str, default="development"
    Name of the development year column.

value_col : str, default="values"
    Name of the value column.

cumulated : bool, default=True
    Whether the input triangle is cumulative.

Returns

self : object
    Returns self.
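If the data starts out as a wide triangle, it can be converted to the long format `fit` expects with `triangle_to_df` (a sketch with a hypothetical 3x3 cumulative triangle):

```python
import pandas as pd
from mlreserving import MLReserving, triangle_to_df

# Hypothetical cumulative triangle: origin years as index,
# development lags 1..3 as columns, NaN below the diagonal
triangle = pd.DataFrame(
    [[100.0, 150.0, 175.0],
     [110.0, 165.0, None],
     [120.0, None,  None]],
    index=pd.Index([2020, 2021, 2022], name="origin"),
    columns=[1, 2, 3],
)

df = triangle_to_df(triangle)   # columns: origin, development, dev, calendar, values
model = MLReserving().fit(df)   # defaults match triangle_to_df's output columns
```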
MLReserving.predict()
Make predictions for the missing values in the triangle.

Returns

DescribeResult
    Named tuple containing the mean, lower, and upper completed triangles
    (returned transposed: development lags as rows, origin years as columns).
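Continuing the fit sketch above, the named tuple's fields can be consumed directly:

```python
res = model.predict()

print(res.mean)    # point predictions for the completed triangle
print(res.lower)   # lower prediction-interval bound
print(res.upper)   # upper prediction-interval bound

ibnr = model.get_ibnr()   # per-origin reserves, also as (mean, lower, upper)
print(ibnr.mean.sum())    # total mean IBNR
```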
Source of `triangle_to_df`:

```python
import pandas as pd


def triangle_to_df(triangle, origin_col="origin",
                   development_col="development",
                   value_col="values"):
    """
    Convert a triangle format into a data frame with origin, development, and value columns

    Parameters
    ----------
    triangle : pandas.DataFrame
        Triangle format with origin years as index and development years as columns
    origin_col : str, default="origin"
        Name of the origin year column
    development_col : str, default="development"
        Name of the development year column
    value_col : str, default="values"
        Name of the value column

    Returns
    -------
    pandas.DataFrame
        Data frame with origin, development, and value columns
    """
    # Reset index to get origin years as a column
    df = triangle.reset_index()

    # Melt the development columns into rows
    df = pd.melt(
        df,
        id_vars=[origin_col],
        var_name="dev",
        value_name=value_col
    )

    # Calculate development year and calendar year
    df[development_col] = df[origin_col] + df["dev"] - 1
    df["calendar"] = df[origin_col] + df["dev"] - 1

    # Reorder columns and sort by calendar year
    df = df[[origin_col, development_col, "dev", "calendar", value_col]]

    df.sort_values("calendar", inplace=True)

    return df
```
Convert a triangle format into a data frame with origin, development, and value columns.

Parameters

triangle : pandas.DataFrame
    Triangle format with origin years as index and development years as columns.
origin_col : str, default="origin"
    Name of the origin year column.
development_col : str, default="development"
    Name of the development year column.
value_col : str, default="values"
    Name of the value column.

Returns

pandas.DataFrame
    Data frame with origin, development, and value columns.
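A short sketch on a hypothetical 2x2 triangle; the printed frame comes out roughly as shown in the comment:

```python
import pandas as pd
from mlreserving import triangle_to_df

tri = pd.DataFrame(
    [[50.0, 80.0],
     [60.0, None]],
    index=pd.Index([2021, 2022], name="origin"),
    columns=[1, 2],
)

long_df = triangle_to_df(tri)
print(long_df)
#    origin  development  dev  calendar  values
# 0    2021         2021    1      2021    50.0
# 1    2022         2022    1      2022    60.0
# 2    2021         2022    2      2022    80.0
# 3    2022         2023    2      2023     NaN
```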
Source of `df_to_triangle`:

```python
def df_to_triangle(df, origin_col="origin", development_col="development", value_col="values"):
    """
    Convert a data frame with origin, development, and value columns into a triangle format

    Parameters
    ----------
    df : pandas.DataFrame
        Input data with origin, development, and value columns
    origin_col : str, default="origin"
        Name of the origin year column
    development_col : str, default="development"
        Name of the development year column
    value_col : str, default="values"
        Name of the value column

    Returns
    -------
    pandas.DataFrame
        Triangle format with origin years as index and development years as columns
    """
    # Calculate development lag and calendar year
    df = df.copy()

    # If development_col is not 'dev', calculate the development lag
    if development_col != "dev":
        df["dev"] = df[development_col] - df[origin_col] + 1

    df["calendar"] = df[origin_col] + df["dev"] - 1

    # Create the triangle
    triangle = df.pivot(
        index=origin_col,
        columns="dev",
        values=value_col
    ).sort_index()

    return triangle
```
Convert a data frame with origin, development, and value columns into a triangle format.

Parameters

df : pandas.DataFrame
    Input data with origin, development, and value columns.
origin_col : str, default="origin"
    Name of the origin year column.
development_col : str, default="development"
    Name of the development year column.
value_col : str, default="values"
    Name of the value column.

Returns

pandas.DataFrame
    Triangle format with origin years as index and development years as columns.
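And the inverse direction, continuing the sketch above: `df_to_triangle` pivots the long frame back into a wide triangle:

```python
from mlreserving import df_to_triangle

tri_again = df_to_triangle(long_df)   # defaults: origin/development/values columns
print(tri_again)
# dev        1     2
# origin
# 2021    50.0  80.0
# 2022    60.0   NaN
```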