survivalist.nonparametric
1# This program is free software: you can redistribute it and/or modify 2# it under the terms of the GNU General Public License as published by 3# the Free Software Foundation, either version 3 of the License, or 4# (at your option) any later version. 5# 6# This program is distributed in the hope that it will be useful, 7# but WITHOUT ANY WARRANTY; without even the implied warranty of 8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9# GNU General Public License for more details. 10# 11# You should have received a copy of the GNU General Public License 12# along with this program. If not, see <http://www.gnu.org/licenses/>. 13import numbers 14 15import numpy as np 16from scipy import stats 17from sklearn.base import BaseEstimator 18from sklearn.utils._param_validation import Interval, StrOptions 19from sklearn.utils.validation import ( 20 check_array, 21 check_consistent_length, 22 check_is_fitted, 23) 24 25from .util import check_y_survival 26 27__all__ = [ 28 "CensoringDistributionEstimator", 29 "kaplan_meier_estimator", 30 "nelson_aalen_estimator", 31 "ipc_weights", 32 "SurvivalFunctionEstimator", 33] 34 35 36def _compute_counts(event, time, order=None): 37 """Count right censored and uncensored samples at each unique time point. 38 39 Parameters 40 ---------- 41 event : array 42 Boolean event indicator. 43 44 time : array 45 Survival time or time of censoring. 46 47 order : array or None 48 Indices to order time in ascending order. 49 If None, order will be computed. 50 51 Returns 52 ------- 53 times : array 54 Unique time points. 55 56 n_events : array 57 Number of events at each time point. 58 59 n_at_risk : array 60 Number of samples that have not been censored or have not had an event at each time point. 61 62 n_censored : array 63 Number of censored samples at each time point. 64 """ 65 n_samples = event.shape[0] 66 67 if order is None: 68 order = np.argsort(time, kind="mergesort") 69 70 uniq_times = np.empty(n_samples, dtype=time.dtype) 71 uniq_events = np.empty(n_samples, dtype=int) 72 uniq_counts = np.empty(n_samples, dtype=int) 73 74 i = 0 75 prev_val = time[order[0]] 76 j = 0 77 while True: 78 count_event = 0 79 count = 0 80 while i < n_samples and prev_val == time[order[i]]: 81 if event[order[i]]: 82 count_event += 1 83 84 count += 1 85 i += 1 86 87 uniq_times[j] = prev_val 88 uniq_events[j] = count_event 89 uniq_counts[j] = count 90 j += 1 91 92 if i == n_samples: 93 break 94 95 prev_val = time[order[i]] 96 97 times = np.resize(uniq_times, j) 98 n_events = np.resize(uniq_events, j) 99 total_count = np.resize(uniq_counts, j) 100 n_censored = total_count - n_events 101 102 # offset cumulative sum by one 103 total_count = np.r_[0, total_count] 104 n_at_risk = n_samples - np.cumsum(total_count) 105 106 return times, n_events, n_at_risk[:-1], n_censored 107 108 109def _compute_counts_truncated(event, time_enter, time_exit): 110 """Compute counts for left truncated and right censored survival data. 111 112 Parameters 113 ---------- 114 event : array 115 Boolean event indicator. 116 117 time_start : array 118 Time when a subject entered the study. 119 120 time_exit : array 121 Time when a subject left the study due to an 122 event or censoring. 123 124 Returns 125 ------- 126 times : array 127 Unique time points. 128 129 n_events : array 130 Number of events at each time point. 131 132 n_at_risk : array 133 Number of samples that are censored or have an event at each time point. 134 """ 135 if (time_enter > time_exit).any(): 136 raise ValueError("exit time must be larger start time for all samples") 137 138 n_samples = event.shape[0] 139 140 uniq_times = np.sort( 141 np.unique(np.r_[time_enter, time_exit]), kind="mergesort") 142 total_counts = np.empty(len(uniq_times), dtype=int) 143 event_counts = np.empty(len(uniq_times), dtype=int) 144 145 order_enter = np.argsort(time_enter, kind="mergesort") 146 order_exit = np.argsort(time_exit, kind="mergesort") 147 s_time_enter = time_enter[order_enter] 148 s_time_exit = time_exit[order_exit] 149 150 t0 = uniq_times[0] 151 # everything larger is included 152 idx_enter = np.searchsorted(s_time_enter, t0, side="right") 153 # everything smaller is excluded 154 idx_exit = np.searchsorted(s_time_exit, t0, side="left") 155 156 total_counts[0] = idx_enter 157 # except people die on the day they enter 158 event_counts[0] = 0 159 160 for i in range(1, len(uniq_times)): 161 ti = uniq_times[i] 162 163 while idx_enter < n_samples and s_time_enter[idx_enter] < ti: 164 idx_enter += 1 165 166 while idx_exit < n_samples and s_time_exit[idx_exit] < ti: 167 idx_exit += 1 168 169 risk_set = np.setdiff1d( 170 order_enter[:idx_enter], order_exit[:idx_exit], assume_unique=True) 171 total_counts[i] = len(risk_set) 172 173 count_event = 0 174 k = idx_exit 175 while k < n_samples and s_time_exit[k] == ti: 176 if event[order_exit[k]]: 177 count_event += 1 178 k += 1 179 event_counts[i] = count_event 180 181 return uniq_times, event_counts, total_counts 182 183 184def _ci_logmlog(prob_survival, sigma_t, z): 185 """Compute the pointwise log-minus-log transformed confidence intervals""" 186 eps = np.finfo(prob_survival.dtype).eps 187 log_p = np.zeros_like(prob_survival) 188 np.log(prob_survival, where=prob_survival > eps, out=log_p) 189 theta = np.zeros_like(prob_survival) 190 np.true_divide(sigma_t, log_p, where=log_p < -eps, out=theta) 191 theta = np.array([[-1], [1]]) * theta * z 192 ci = np.exp(np.exp(theta) * log_p) 193 ci[:, prob_survival <= eps] = 0.0 194 ci[:, 1.0 - prob_survival <= eps] = 1.0 195 return ci 196 197 198def _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type): 199 if conf_type not in {"log-log"}: 200 raise ValueError( 201 f"conf_type must be None or a str among {{'log-log'}}, but was {conf_type!r}") 202 203 if not isinstance(conf_level, numbers.Real) or not np.isfinite(conf_level) or conf_level <= 0 or conf_level >= 1.0: 204 raise ValueError( 205 f"conf_level must be a float in the range (0.0, 1.0), but was {conf_level!r}") 206 207 z = stats.norm.isf((1.0 - conf_level) / 2.0) 208 sigma = np.sqrt(np.cumsum(ratio_var)) 209 ci = _ci_logmlog(prob_survival, sigma, z) 210 return ci 211 212 213def kaplan_meier_estimator( 214 event, 215 time_exit, 216 time_enter=None, 217 time_min=None, 218 reverse=False, 219 conf_level=0.95, 220 conf_type=None, 221): 222 """Kaplan-Meier estimator of survival function. 223 224 See [1]_ for further description. 225 226 Parameters 227 ---------- 228 event : array-like, shape = (n_samples,) 229 Contains binary event indicators. 230 231 time_exit : array-like, shape = (n_samples,) 232 Contains event/censoring times. 233 234 time_enter : array-like, shape = (n_samples,), optional 235 Contains time when each individual entered the study for 236 left truncated survival data. 237 238 time_min : float, optional 239 Compute estimator conditional on survival at least up to 240 the specified time. 241 242 reverse : bool, optional, default: False 243 Whether to estimate the censoring distribution. 244 When there are ties between times at which events are observed, 245 then events come first and are subtracted from the denominator. 246 Only available for right-censored data, i.e. `time_enter` must 247 be None. 248 249 conf_level : float, optional, default: 0.95 250 The level for a two-sided confidence interval on the survival curves. 251 252 conf_type : None or {'log-log'}, optional, default: None. 253 The type of confidence intervals to estimate. 254 If `None`, no confidence intervals are estimated. 255 If "log-log", estimate confidence intervals using 256 the log hazard or :math:`log(-log(S(t)))` as described in [2]_. 257 258 Returns 259 ------- 260 time : array, shape = (n_times,) 261 Unique times. 262 263 prob_survival : array, shape = (n_times,) 264 Survival probability at each unique time point. 265 If `time_enter` is provided, estimates are conditional probabilities. 266 267 conf_int : array, shape = (2, n_times) 268 Pointwise confidence interval of the Kaplan-Meier estimator 269 at each unique time point. 270 Only provided if `conf_type` is not None. 271 272 Examples 273 -------- 274 Creating a Kaplan-Meier curve: 275 276 >>> x, y, conf_int = kaplan_meier_estimator(event, time, conf_type="log-log") 277 >>> plt.step(x, y, where="post") 278 >>> plt.fill_between(x, conf_int[0], conf_int[1], alpha=0.25, step="post") 279 >>> plt.ylim(0, 1) 280 >>> plt.show() 281 282 See also 283 -------- 284 survivalist.nonparametric.SurvivalFunctionEstimator 285 Estimator API of the Kaplan-Meier estimator. 286 287 References 288 ---------- 289 .. [1] Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations", 290 Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958. 291 .. [2] Borgan Ø. and Liestøl K., "A Note on Confidence Intervals and Bands for the 292 Survival Function Based on Transformations", Scandinavian Journal of 293 Statistics. 1990;17(1):35–41. 294 """ 295 event, time_enter, time_exit = check_y_survival( 296 event, time_enter, time_exit, allow_all_censored=True) 297 check_consistent_length(event, time_enter, time_exit) 298 299 if conf_type is not None and reverse: 300 raise NotImplementedError( 301 "Confidence intervals of the censoring distribution is not implemented.") 302 303 if time_enter is None: 304 uniq_times, n_events, n_at_risk, n_censored = _compute_counts( 305 event, time_exit) 306 307 if reverse: 308 n_at_risk -= n_events 309 n_events = n_censored 310 else: 311 if reverse: 312 raise ValueError( 313 "The censoring distribution cannot be estimated from left truncated data") 314 315 uniq_times, n_events, n_at_risk = _compute_counts_truncated( 316 event, time_enter, time_exit) 317 318 # account for 0/0 = nan 319 ratio = np.divide( 320 n_events, 321 n_at_risk, 322 out=np.zeros(uniq_times.shape[0], dtype=float), 323 where=n_events != 0, 324 ) 325 values = 1.0 - ratio 326 327 if conf_type is not None: 328 ratio_var = np.divide( 329 n_events, 330 n_at_risk * (n_at_risk - n_events), 331 out=np.zeros(uniq_times.shape[0], dtype=float), 332 where=(n_events != 0) & (n_at_risk != n_events), 333 ) 334 335 if time_min is not None: 336 mask = uniq_times >= time_min 337 uniq_times = np.compress(mask, uniq_times) 338 values = np.compress(mask, values) 339 340 prob_survival = np.cumprod(values) 341 342 if conf_type is None: 343 return uniq_times, prob_survival 344 345 if time_min is not None: 346 ratio_var = np.compress(mask, ratio_var) 347 348 ci = _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type) 349 350 return uniq_times, prob_survival, ci 351 352 353def nelson_aalen_estimator(event, time): 354 """Nelson-Aalen estimator of cumulative hazard function. 355 356 See [1]_, [2]_ for further description. 357 358 Parameters 359 ---------- 360 event : array-like, shape = (n_samples,) 361 Contains binary event indicators. 362 363 time : array-like, shape = (n_samples,) 364 Contains event/censoring times. 365 366 Returns 367 ------- 368 time : array, shape = (n_times,) 369 Unique times. 370 371 cum_hazard : array, shape = (n_times,) 372 Cumulative hazard at each unique time point. 373 374 References 375 ---------- 376 .. [1] Nelson, W., "Theory and applications of hazard plotting for censored failure data", 377 Technometrics, vol. 14, pp. 945-965, 1972. 378 379 .. [2] Aalen, O. O., "Nonparametric inference for a family of counting processes", 380 Annals of Statistics, vol. 6, pp. 701–726, 1978. 381 """ 382 event, time = check_y_survival(event, time) 383 check_consistent_length(event, time) 384 uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time) 385 386 y = np.cumsum(n_events / n_at_risk) 387 388 return uniq_times, y 389 390 391def ipc_weights(event, time): 392 """Compute inverse probability of censoring weights 393 394 Parameters 395 ---------- 396 event : array, shape = (n_samples,) 397 Boolean event indicator. 398 399 time : array, shape = (n_samples,) 400 Time when a subject experienced an event or was censored. 401 402 Returns 403 ------- 404 weights : array, shape = (n_samples,) 405 inverse probability of censoring weights 406 407 See also 408 -------- 409 CensoringDistributionEstimator 410 An estimator interface for estimating inverse probability 411 of censoring weights for unseen time points. 412 """ 413 if event.all(): 414 return np.ones(time.shape[0]) 415 416 unique_time, p = kaplan_meier_estimator(event, time, reverse=True) 417 418 idx = np.searchsorted(unique_time, time[event]) 419 Ghat = p[idx] 420 421 assert (Ghat > 0).all() 422 423 weights = np.zeros(time.shape[0]) 424 weights[event] = 1.0 / Ghat 425 426 return weights 427 428 429class SurvivalFunctionEstimator(BaseEstimator): 430 """Kaplan–Meier estimate of the survival function. 431 432 Parameters 433 ---------- 434 conf_level : float, optional, default: 0.95 435 The level for a two-sided confidence interval on the survival curves. 436 437 conf_type : None or {'log-log'}, optional, default: None. 438 The type of confidence intervals to estimate. 439 If `None`, no confidence intervals are estimated. 440 If "log-log", estimate confidence intervals using 441 the log hazard or :math:`log(-log(S(t)))`. 442 443 See also 444 -------- 445 survivalist.nonparametric.kaplan_meier_estimator 446 Functional API of the Kaplan-Meier estimator. 447 """ 448 449 _parameter_constraints = { 450 "conf_level": [Interval(numbers.Real, 0.0, 1.0, closed="neither")], 451 "conf_type": [None, StrOptions({"log-log"})], 452 } 453 454 def __init__(self, conf_level=0.95, conf_type=None): 455 self.conf_level = conf_level 456 self.conf_type = conf_type 457 458 def fit(self, y): 459 """Estimate survival distribution from training data. 460 461 Parameters 462 ---------- 463 y : structured array, shape = (n_samples,) 464 A structured array containing the binary event indicator 465 as first field, and time of event or time of censoring as 466 second field. 467 468 Returns 469 ------- 470 self 471 """ 472 self._validate_params() 473 event, time = check_y_survival(y, allow_all_censored=True) 474 475 values = kaplan_meier_estimator( 476 event, time, conf_level=self.conf_level, conf_type=self.conf_type) 477 if self.conf_type is None: 478 unique_time, prob = values 479 else: 480 unique_time, prob, conf_int = values 481 self.conf_int_ = np.column_stack((np.ones((2, 1)), conf_int)) 482 483 self.unique_time_ = np.r_[-np.inf, unique_time] 484 self.prob_ = np.r_[1.0, prob] 485 486 return self 487 488 def predict_proba(self, time, return_conf_int=False): 489 """Return probability of an event after given time point. 490 491 :math:`\\hat{S}(t) = P(T > t)` 492 493 Parameters 494 ---------- 495 time : array, shape = (n_samples,) 496 Time to estimate probability at. 497 498 return_conf_int : bool, optional, default: False 499 Whether to return the pointwise confidence interval 500 of the survival function. 501 Only available if :meth:`fit()` has been called 502 with the `conf_type` parameter set. 503 504 Returns 505 ------- 506 prob : array, shape = (n_samples,) 507 Probability of an event at the passed time points. 508 509 conf_int : array, shape = (2, n_samples) 510 Pointwise confidence interval at the passed time points. 511 Only provided if `return_conf_int` is True. 512 """ 513 check_is_fitted(self, "unique_time_") 514 if return_conf_int and not hasattr(self, "conf_int_"): 515 raise ValueError( 516 "If return_conf_int is True, SurvivalFunctionEstimator must be fitted with conf_int != None" 517 ) 518 519 time = check_array(time, ensure_2d=False, 520 estimator=self, input_name="time") 521 522 # K-M is undefined if estimate at last time point is non-zero 523 extends = time > self.unique_time_[-1] 524 if self.prob_[-1] > 0 and extends.any(): 525 raise ValueError( 526 f"time must be smaller than largest observed time point: {self.unique_time_[-1]}") 527 528 # beyond last time point is zero probability 529 Shat = np.empty(time.shape, dtype=float) 530 Shat[extends] = 0.0 531 532 valid = ~extends 533 time = time[valid] 534 idx = np.searchsorted(self.unique_time_, time) 535 # for non-exact matches, we need to shift the index to left 536 eps = np.finfo(self.unique_time_.dtype).eps 537 exact = np.absolute(self.unique_time_[idx] - time) < eps 538 idx[~exact] -= 1 539 Shat[valid] = self.prob_[idx] 540 541 if not return_conf_int: 542 return Shat 543 544 ci = np.empty((2, time.shape[0]), dtype=float) 545 ci[:, extends] = np.nan 546 ci[:, valid] = self.conf_int_[:, idx] 547 return Shat, ci 548 549 550class CensoringDistributionEstimator(SurvivalFunctionEstimator): 551 """Kaplan–Meier estimator for the censoring distribution.""" 552 553 def fit(self, y): 554 """Estimate censoring distribution from training data. 555 556 Parameters 557 ---------- 558 y : structured array, shape = (n_samples,) 559 A structured array containing the binary event indicator 560 as first field, and time of event or time of censoring as 561 second field. 562 563 Returns 564 ------- 565 self 566 """ 567 event, time = check_y_survival(y) 568 if event.all(): 569 self.unique_time_ = np.unique(time) 570 self.prob_ = np.ones(self.unique_time_.shape[0]) 571 else: 572 unique_time, prob = kaplan_meier_estimator( 573 event, time, reverse=True) 574 self.unique_time_ = np.r_[-np.inf, unique_time] 575 self.prob_ = np.r_[1.0, prob] 576 577 return self 578 579 def predict_ipcw(self, y): 580 """Return inverse probability of censoring weights at given time points. 581 582 :math:`\\omega_i = \\delta_i / \\hat{G}(y_i)` 583 584 Parameters 585 ---------- 586 y : structured array, shape = (n_samples,) 587 A structured array containing the binary event indicator 588 as first field, and time of event or time of censoring as 589 second field. 590 591 Returns 592 ------- 593 ipcw : array, shape = (n_samples,) 594 Inverse probability of censoring weights. 595 """ 596 event, time = check_y_survival(y) 597 Ghat = self.predict_proba(time[event]) 598 599 if (Ghat == 0.0).any(): 600 raise ValueError( 601 "censoring survival function is zero at one or more time points") 602 603 weights = np.zeros(time.shape[0]) 604 weights[event] = 1.0 / Ghat 605 606 return weights
551class CensoringDistributionEstimator(SurvivalFunctionEstimator): 552 """Kaplan–Meier estimator for the censoring distribution.""" 553 554 def fit(self, y): 555 """Estimate censoring distribution from training data. 556 557 Parameters 558 ---------- 559 y : structured array, shape = (n_samples,) 560 A structured array containing the binary event indicator 561 as first field, and time of event or time of censoring as 562 second field. 563 564 Returns 565 ------- 566 self 567 """ 568 event, time = check_y_survival(y) 569 if event.all(): 570 self.unique_time_ = np.unique(time) 571 self.prob_ = np.ones(self.unique_time_.shape[0]) 572 else: 573 unique_time, prob = kaplan_meier_estimator( 574 event, time, reverse=True) 575 self.unique_time_ = np.r_[-np.inf, unique_time] 576 self.prob_ = np.r_[1.0, prob] 577 578 return self 579 580 def predict_ipcw(self, y): 581 """Return inverse probability of censoring weights at given time points. 582 583 :math:`\\omega_i = \\delta_i / \\hat{G}(y_i)` 584 585 Parameters 586 ---------- 587 y : structured array, shape = (n_samples,) 588 A structured array containing the binary event indicator 589 as first field, and time of event or time of censoring as 590 second field. 591 592 Returns 593 ------- 594 ipcw : array, shape = (n_samples,) 595 Inverse probability of censoring weights. 596 """ 597 event, time = check_y_survival(y) 598 Ghat = self.predict_proba(time[event]) 599 600 if (Ghat == 0.0).any(): 601 raise ValueError( 602 "censoring survival function is zero at one or more time points") 603 604 weights = np.zeros(time.shape[0]) 605 weights[event] = 1.0 / Ghat 606 607 return weights
Kaplan–Meier estimator for the censoring distribution.
554 def fit(self, y): 555 """Estimate censoring distribution from training data. 556 557 Parameters 558 ---------- 559 y : structured array, shape = (n_samples,) 560 A structured array containing the binary event indicator 561 as first field, and time of event or time of censoring as 562 second field. 563 564 Returns 565 ------- 566 self 567 """ 568 event, time = check_y_survival(y) 569 if event.all(): 570 self.unique_time_ = np.unique(time) 571 self.prob_ = np.ones(self.unique_time_.shape[0]) 572 else: 573 unique_time, prob = kaplan_meier_estimator( 574 event, time, reverse=True) 575 self.unique_time_ = np.r_[-np.inf, unique_time] 576 self.prob_ = np.r_[1.0, prob] 577 578 return self
Estimate censoring distribution from training data.
Parameters
y : structured array, shape = (n_samples,) A structured array containing the binary event indicator as first field, and time of event or time of censoring as second field.
Returns
self
214def kaplan_meier_estimator( 215 event, 216 time_exit, 217 time_enter=None, 218 time_min=None, 219 reverse=False, 220 conf_level=0.95, 221 conf_type=None, 222): 223 """Kaplan-Meier estimator of survival function. 224 225 See [1]_ for further description. 226 227 Parameters 228 ---------- 229 event : array-like, shape = (n_samples,) 230 Contains binary event indicators. 231 232 time_exit : array-like, shape = (n_samples,) 233 Contains event/censoring times. 234 235 time_enter : array-like, shape = (n_samples,), optional 236 Contains time when each individual entered the study for 237 left truncated survival data. 238 239 time_min : float, optional 240 Compute estimator conditional on survival at least up to 241 the specified time. 242 243 reverse : bool, optional, default: False 244 Whether to estimate the censoring distribution. 245 When there are ties between times at which events are observed, 246 then events come first and are subtracted from the denominator. 247 Only available for right-censored data, i.e. `time_enter` must 248 be None. 249 250 conf_level : float, optional, default: 0.95 251 The level for a two-sided confidence interval on the survival curves. 252 253 conf_type : None or {'log-log'}, optional, default: None. 254 The type of confidence intervals to estimate. 255 If `None`, no confidence intervals are estimated. 256 If "log-log", estimate confidence intervals using 257 the log hazard or :math:`log(-log(S(t)))` as described in [2]_. 258 259 Returns 260 ------- 261 time : array, shape = (n_times,) 262 Unique times. 263 264 prob_survival : array, shape = (n_times,) 265 Survival probability at each unique time point. 266 If `time_enter` is provided, estimates are conditional probabilities. 267 268 conf_int : array, shape = (2, n_times) 269 Pointwise confidence interval of the Kaplan-Meier estimator 270 at each unique time point. 271 Only provided if `conf_type` is not None. 272 273 Examples 274 -------- 275 Creating a Kaplan-Meier curve: 276 277 >>> x, y, conf_int = kaplan_meier_estimator(event, time, conf_type="log-log") 278 >>> plt.step(x, y, where="post") 279 >>> plt.fill_between(x, conf_int[0], conf_int[1], alpha=0.25, step="post") 280 >>> plt.ylim(0, 1) 281 >>> plt.show() 282 283 See also 284 -------- 285 survivalist.nonparametric.SurvivalFunctionEstimator 286 Estimator API of the Kaplan-Meier estimator. 287 288 References 289 ---------- 290 .. [1] Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations", 291 Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958. 292 .. [2] Borgan Ø. and Liestøl K., "A Note on Confidence Intervals and Bands for the 293 Survival Function Based on Transformations", Scandinavian Journal of 294 Statistics. 1990;17(1):35–41. 295 """ 296 event, time_enter, time_exit = check_y_survival( 297 event, time_enter, time_exit, allow_all_censored=True) 298 check_consistent_length(event, time_enter, time_exit) 299 300 if conf_type is not None and reverse: 301 raise NotImplementedError( 302 "Confidence intervals of the censoring distribution is not implemented.") 303 304 if time_enter is None: 305 uniq_times, n_events, n_at_risk, n_censored = _compute_counts( 306 event, time_exit) 307 308 if reverse: 309 n_at_risk -= n_events 310 n_events = n_censored 311 else: 312 if reverse: 313 raise ValueError( 314 "The censoring distribution cannot be estimated from left truncated data") 315 316 uniq_times, n_events, n_at_risk = _compute_counts_truncated( 317 event, time_enter, time_exit) 318 319 # account for 0/0 = nan 320 ratio = np.divide( 321 n_events, 322 n_at_risk, 323 out=np.zeros(uniq_times.shape[0], dtype=float), 324 where=n_events != 0, 325 ) 326 values = 1.0 - ratio 327 328 if conf_type is not None: 329 ratio_var = np.divide( 330 n_events, 331 n_at_risk * (n_at_risk - n_events), 332 out=np.zeros(uniq_times.shape[0], dtype=float), 333 where=(n_events != 0) & (n_at_risk != n_events), 334 ) 335 336 if time_min is not None: 337 mask = uniq_times >= time_min 338 uniq_times = np.compress(mask, uniq_times) 339 values = np.compress(mask, values) 340 341 prob_survival = np.cumprod(values) 342 343 if conf_type is None: 344 return uniq_times, prob_survival 345 346 if time_min is not None: 347 ratio_var = np.compress(mask, ratio_var) 348 349 ci = _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type) 350 351 return uniq_times, prob_survival, ci
Kaplan-Meier estimator of survival function.
See 1 for further description.
Parameters
event : array-like, shape = (n_samples,) Contains binary event indicators.
time_exit : array-like, shape = (n_samples,) Contains event/censoring times.
time_enter : array-like, shape = (n_samples,), optional Contains time when each individual entered the study for left truncated survival data.
time_min : float, optional Compute estimator conditional on survival at least up to the specified time.
reverse : bool, optional, default: False
Whether to estimate the censoring distribution.
When there are ties between times at which events are observed,
then events come first and are subtracted from the denominator.
Only available for right-censored data, i.e. time_enter must
be None.
conf_level : float, optional, default: 0.95 The level for a two-sided confidence interval on the survival curves.
conf_type : None or {'log-log'}, optional, default: None.
The type of confidence intervals to estimate.
If None, no confidence intervals are estimated.
If "log-log", estimate confidence intervals using
the log hazard or \( log(-log(S(t))) \) as described in 2.
Returns
time : array, shape = (n_times,) Unique times.
prob_survival : array, shape = (n_times,)
Survival probability at each unique time point.
If time_enter is provided, estimates are conditional probabilities.
conf_int : array, shape = (2, n_times)
Pointwise confidence interval of the Kaplan-Meier estimator
at each unique time point.
Only provided if conf_type is not None.
Examples
Creating a Kaplan-Meier curve:
>>> x, y, conf_int = kaplan_meier_estimator(event, time, conf_type="log-log")
>>> plt.step(x, y, where="post")
>>> plt.fill_between(x, conf_int[0], conf_int[1], alpha=0.25, step="post")
>>> plt.ylim(0, 1)
>>> plt.show()
See also
SurvivalFunctionEstimator Estimator API of the Kaplan-Meier estimator.
References
-
Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations", Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958. ↩
-
Borgan Ø. and Liestøl K., "A Note on Confidence Intervals and Bands for the Survival Function Based on Transformations", Scandinavian Journal of Statistics. 1990;17(1):35–41. ↩
354def nelson_aalen_estimator(event, time): 355 """Nelson-Aalen estimator of cumulative hazard function. 356 357 See [1]_, [2]_ for further description. 358 359 Parameters 360 ---------- 361 event : array-like, shape = (n_samples,) 362 Contains binary event indicators. 363 364 time : array-like, shape = (n_samples,) 365 Contains event/censoring times. 366 367 Returns 368 ------- 369 time : array, shape = (n_times,) 370 Unique times. 371 372 cum_hazard : array, shape = (n_times,) 373 Cumulative hazard at each unique time point. 374 375 References 376 ---------- 377 .. [1] Nelson, W., "Theory and applications of hazard plotting for censored failure data", 378 Technometrics, vol. 14, pp. 945-965, 1972. 379 380 .. [2] Aalen, O. O., "Nonparametric inference for a family of counting processes", 381 Annals of Statistics, vol. 6, pp. 701–726, 1978. 382 """ 383 event, time = check_y_survival(event, time) 384 check_consistent_length(event, time) 385 uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time) 386 387 y = np.cumsum(n_events / n_at_risk) 388 389 return uniq_times, y
Nelson-Aalen estimator of cumulative hazard function.
See 1, 2 for further description.
Parameters
event : array-like, shape = (n_samples,) Contains binary event indicators.
time : array-like, shape = (n_samples,) Contains event/censoring times.
Returns
time : array, shape = (n_times,) Unique times.
cum_hazard : array, shape = (n_times,) Cumulative hazard at each unique time point.
References
392def ipc_weights(event, time): 393 """Compute inverse probability of censoring weights 394 395 Parameters 396 ---------- 397 event : array, shape = (n_samples,) 398 Boolean event indicator. 399 400 time : array, shape = (n_samples,) 401 Time when a subject experienced an event or was censored. 402 403 Returns 404 ------- 405 weights : array, shape = (n_samples,) 406 inverse probability of censoring weights 407 408 See also 409 -------- 410 CensoringDistributionEstimator 411 An estimator interface for estimating inverse probability 412 of censoring weights for unseen time points. 413 """ 414 if event.all(): 415 return np.ones(time.shape[0]) 416 417 unique_time, p = kaplan_meier_estimator(event, time, reverse=True) 418 419 idx = np.searchsorted(unique_time, time[event]) 420 Ghat = p[idx] 421 422 assert (Ghat > 0).all() 423 424 weights = np.zeros(time.shape[0]) 425 weights[event] = 1.0 / Ghat 426 427 return weights
Compute inverse probability of censoring weights
Parameters
event : array, shape = (n_samples,) Boolean event indicator.
time : array, shape = (n_samples,) Time when a subject experienced an event or was censored.
Returns
weights : array, shape = (n_samples,) inverse probability of censoring weights
See also
CensoringDistributionEstimator An estimator interface for estimating inverse probability of censoring weights for unseen time points.
430class SurvivalFunctionEstimator(BaseEstimator): 431 """Kaplan–Meier estimate of the survival function. 432 433 Parameters 434 ---------- 435 conf_level : float, optional, default: 0.95 436 The level for a two-sided confidence interval on the survival curves. 437 438 conf_type : None or {'log-log'}, optional, default: None. 439 The type of confidence intervals to estimate. 440 If `None`, no confidence intervals are estimated. 441 If "log-log", estimate confidence intervals using 442 the log hazard or :math:`log(-log(S(t)))`. 443 444 See also 445 -------- 446 survivalist.nonparametric.kaplan_meier_estimator 447 Functional API of the Kaplan-Meier estimator. 448 """ 449 450 _parameter_constraints = { 451 "conf_level": [Interval(numbers.Real, 0.0, 1.0, closed="neither")], 452 "conf_type": [None, StrOptions({"log-log"})], 453 } 454 455 def __init__(self, conf_level=0.95, conf_type=None): 456 self.conf_level = conf_level 457 self.conf_type = conf_type 458 459 def fit(self, y): 460 """Estimate survival distribution from training data. 461 462 Parameters 463 ---------- 464 y : structured array, shape = (n_samples,) 465 A structured array containing the binary event indicator 466 as first field, and time of event or time of censoring as 467 second field. 468 469 Returns 470 ------- 471 self 472 """ 473 self._validate_params() 474 event, time = check_y_survival(y, allow_all_censored=True) 475 476 values = kaplan_meier_estimator( 477 event, time, conf_level=self.conf_level, conf_type=self.conf_type) 478 if self.conf_type is None: 479 unique_time, prob = values 480 else: 481 unique_time, prob, conf_int = values 482 self.conf_int_ = np.column_stack((np.ones((2, 1)), conf_int)) 483 484 self.unique_time_ = np.r_[-np.inf, unique_time] 485 self.prob_ = np.r_[1.0, prob] 486 487 return self 488 489 def predict_proba(self, time, return_conf_int=False): 490 """Return probability of an event after given time point. 491 492 :math:`\\hat{S}(t) = P(T > t)` 493 494 Parameters 495 ---------- 496 time : array, shape = (n_samples,) 497 Time to estimate probability at. 498 499 return_conf_int : bool, optional, default: False 500 Whether to return the pointwise confidence interval 501 of the survival function. 502 Only available if :meth:`fit()` has been called 503 with the `conf_type` parameter set. 504 505 Returns 506 ------- 507 prob : array, shape = (n_samples,) 508 Probability of an event at the passed time points. 509 510 conf_int : array, shape = (2, n_samples) 511 Pointwise confidence interval at the passed time points. 512 Only provided if `return_conf_int` is True. 513 """ 514 check_is_fitted(self, "unique_time_") 515 if return_conf_int and not hasattr(self, "conf_int_"): 516 raise ValueError( 517 "If return_conf_int is True, SurvivalFunctionEstimator must be fitted with conf_int != None" 518 ) 519 520 time = check_array(time, ensure_2d=False, 521 estimator=self, input_name="time") 522 523 # K-M is undefined if estimate at last time point is non-zero 524 extends = time > self.unique_time_[-1] 525 if self.prob_[-1] > 0 and extends.any(): 526 raise ValueError( 527 f"time must be smaller than largest observed time point: {self.unique_time_[-1]}") 528 529 # beyond last time point is zero probability 530 Shat = np.empty(time.shape, dtype=float) 531 Shat[extends] = 0.0 532 533 valid = ~extends 534 time = time[valid] 535 idx = np.searchsorted(self.unique_time_, time) 536 # for non-exact matches, we need to shift the index to left 537 eps = np.finfo(self.unique_time_.dtype).eps 538 exact = np.absolute(self.unique_time_[idx] - time) < eps 539 idx[~exact] -= 1 540 Shat[valid] = self.prob_[idx] 541 542 if not return_conf_int: 543 return Shat 544 545 ci = np.empty((2, time.shape[0]), dtype=float) 546 ci[:, extends] = np.nan 547 ci[:, valid] = self.conf_int_[:, idx] 548 return Shat, ci
Kaplan–Meier estimate of the survival function.
Parameters
conf_level : float, optional, default: 0.95 The level for a two-sided confidence interval on the survival curves.
conf_type : None or {'log-log'}, optional, default: None.
The type of confidence intervals to estimate.
If None, no confidence intervals are estimated.
If "log-log", estimate confidence intervals using
the log hazard or \( log(-log(S(t))) \).
See also
kaplan_meier_estimator Functional API of the Kaplan-Meier estimator.
459 def fit(self, y): 460 """Estimate survival distribution from training data. 461 462 Parameters 463 ---------- 464 y : structured array, shape = (n_samples,) 465 A structured array containing the binary event indicator 466 as first field, and time of event or time of censoring as 467 second field. 468 469 Returns 470 ------- 471 self 472 """ 473 self._validate_params() 474 event, time = check_y_survival(y, allow_all_censored=True) 475 476 values = kaplan_meier_estimator( 477 event, time, conf_level=self.conf_level, conf_type=self.conf_type) 478 if self.conf_type is None: 479 unique_time, prob = values 480 else: 481 unique_time, prob, conf_int = values 482 self.conf_int_ = np.column_stack((np.ones((2, 1)), conf_int)) 483 484 self.unique_time_ = np.r_[-np.inf, unique_time] 485 self.prob_ = np.r_[1.0, prob] 486 487 return self
Estimate survival distribution from training data.
Parameters
y : structured array, shape = (n_samples,) A structured array containing the binary event indicator as first field, and time of event or time of censoring as second field.
Returns
self
489 def predict_proba(self, time, return_conf_int=False): 490 """Return probability of an event after given time point. 491 492 :math:`\\hat{S}(t) = P(T > t)` 493 494 Parameters 495 ---------- 496 time : array, shape = (n_samples,) 497 Time to estimate probability at. 498 499 return_conf_int : bool, optional, default: False 500 Whether to return the pointwise confidence interval 501 of the survival function. 502 Only available if :meth:`fit()` has been called 503 with the `conf_type` parameter set. 504 505 Returns 506 ------- 507 prob : array, shape = (n_samples,) 508 Probability of an event at the passed time points. 509 510 conf_int : array, shape = (2, n_samples) 511 Pointwise confidence interval at the passed time points. 512 Only provided if `return_conf_int` is True. 513 """ 514 check_is_fitted(self, "unique_time_") 515 if return_conf_int and not hasattr(self, "conf_int_"): 516 raise ValueError( 517 "If return_conf_int is True, SurvivalFunctionEstimator must be fitted with conf_int != None" 518 ) 519 520 time = check_array(time, ensure_2d=False, 521 estimator=self, input_name="time") 522 523 # K-M is undefined if estimate at last time point is non-zero 524 extends = time > self.unique_time_[-1] 525 if self.prob_[-1] > 0 and extends.any(): 526 raise ValueError( 527 f"time must be smaller than largest observed time point: {self.unique_time_[-1]}") 528 529 # beyond last time point is zero probability 530 Shat = np.empty(time.shape, dtype=float) 531 Shat[extends] = 0.0 532 533 valid = ~extends 534 time = time[valid] 535 idx = np.searchsorted(self.unique_time_, time) 536 # for non-exact matches, we need to shift the index to left 537 eps = np.finfo(self.unique_time_.dtype).eps 538 exact = np.absolute(self.unique_time_[idx] - time) < eps 539 idx[~exact] -= 1 540 Shat[valid] = self.prob_[idx] 541 542 if not return_conf_int: 543 return Shat 544 545 ci = np.empty((2, time.shape[0]), dtype=float) 546 ci[:, extends] = np.nan 547 ci[:, valid] = self.conf_int_[:, idx] 548 return Shat, ci
Return probability of an event after given time point.
\( \hat{S}(t) = P(T > t) \)
Parameters
time : array, shape = (n_samples,) Time to estimate probability at.
return_conf_int : bool, optional, default: False
Whether to return the pointwise confidence interval
of the survival function.
Only available if fit()() has been called
with the conf_type parameter set.
Returns
prob : array, shape = (n_samples,) Probability of an event at the passed time points.
conf_int : array, shape = (2, n_samples)
Pointwise confidence interval at the passed time points.
Only provided if return_conf_int is True.