survivalist.nonparametric

  1# This program is free software: you can redistribute it and/or modify
  2# it under the terms of the GNU General Public License as published by
  3# the Free Software Foundation, either version 3 of the License, or
  4# (at your option) any later version.
  5#
  6# This program is distributed in the hope that it will be useful,
  7# but WITHOUT ANY WARRANTY; without even the implied warranty of
  8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  9# GNU General Public License for more details.
 10#
 11# You should have received a copy of the GNU General Public License
 12# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 13import numbers
 14
 15import numpy as np
 16from scipy import stats
 17from sklearn.base import BaseEstimator
 18from sklearn.utils._param_validation import Interval, StrOptions
 19from sklearn.utils.validation import (
 20    check_array,
 21    check_consistent_length,
 22    check_is_fitted,
 23)
 24
 25from .util import check_y_survival
 26
 27__all__ = [
 28    "CensoringDistributionEstimator",
 29    "kaplan_meier_estimator",
 30    "nelson_aalen_estimator",
 31    "ipc_weights",
 32    "SurvivalFunctionEstimator",
 33]
 34
 35
 36def _compute_counts(event, time, order=None):
 37    """Count right censored and uncensored samples at each unique time point.
 38
 39    Parameters
 40    ----------
 41    event : array
 42        Boolean event indicator.
 43
 44    time : array
 45        Survival time or time of censoring.
 46
 47    order : array or None
 48        Indices to order time in ascending order.
 49        If None, order will be computed.
 50
 51    Returns
 52    -------
 53    times : array
 54        Unique time points.
 55
 56    n_events : array
 57        Number of events at each time point.
 58
 59    n_at_risk : array
 60        Number of samples that have not been censored or have not had an event at each time point.
 61
 62    n_censored : array
 63        Number of censored samples at each time point.
 64    """
 65    n_samples = event.shape[0]
 66
 67    if order is None:
 68        order = np.argsort(time, kind="mergesort")
 69
 70    uniq_times = np.empty(n_samples, dtype=time.dtype)
 71    uniq_events = np.empty(n_samples, dtype=int)
 72    uniq_counts = np.empty(n_samples, dtype=int)
 73
 74    i = 0
 75    prev_val = time[order[0]]
 76    j = 0
 77    while True:
 78        count_event = 0
 79        count = 0
 80        while i < n_samples and prev_val == time[order[i]]:
 81            if event[order[i]]:
 82                count_event += 1
 83
 84            count += 1
 85            i += 1
 86
 87        uniq_times[j] = prev_val
 88        uniq_events[j] = count_event
 89        uniq_counts[j] = count
 90        j += 1
 91
 92        if i == n_samples:
 93            break
 94
 95        prev_val = time[order[i]]
 96
 97    times = np.resize(uniq_times, j)
 98    n_events = np.resize(uniq_events, j)
 99    total_count = np.resize(uniq_counts, j)
100    n_censored = total_count - n_events
101
102    # offset cumulative sum by one
103    total_count = np.r_[0, total_count]
104    n_at_risk = n_samples - np.cumsum(total_count)
105
106    return times, n_events, n_at_risk[:-1], n_censored
107
108
109def _compute_counts_truncated(event, time_enter, time_exit):
110    """Compute counts for left truncated and right censored survival data.
111
112    Parameters
113    ----------
114    event : array
115        Boolean event indicator.
116
117    time_start : array
118        Time when a subject entered the study.
119
120    time_exit : array
121        Time when a subject left the study due to an
122        event or censoring.
123
124    Returns
125    -------
126    times : array
127        Unique time points.
128
129    n_events : array
130        Number of events at each time point.
131
132    n_at_risk : array
133        Number of samples that are censored or have an event at each time point.
134    """
135    if (time_enter > time_exit).any():
136        raise ValueError("exit time must be larger start time for all samples")
137
138    n_samples = event.shape[0]
139
140    uniq_times = np.sort(
141        np.unique(np.r_[time_enter, time_exit]), kind="mergesort")
142    total_counts = np.empty(len(uniq_times), dtype=int)
143    event_counts = np.empty(len(uniq_times), dtype=int)
144
145    order_enter = np.argsort(time_enter, kind="mergesort")
146    order_exit = np.argsort(time_exit, kind="mergesort")
147    s_time_enter = time_enter[order_enter]
148    s_time_exit = time_exit[order_exit]
149
150    t0 = uniq_times[0]
151    # everything larger is included
152    idx_enter = np.searchsorted(s_time_enter, t0, side="right")
153    # everything smaller is excluded
154    idx_exit = np.searchsorted(s_time_exit, t0, side="left")
155
156    total_counts[0] = idx_enter
157    # except people die on the day they enter
158    event_counts[0] = 0
159
160    for i in range(1, len(uniq_times)):
161        ti = uniq_times[i]
162
163        while idx_enter < n_samples and s_time_enter[idx_enter] < ti:
164            idx_enter += 1
165
166        while idx_exit < n_samples and s_time_exit[idx_exit] < ti:
167            idx_exit += 1
168
169        risk_set = np.setdiff1d(
170            order_enter[:idx_enter], order_exit[:idx_exit], assume_unique=True)
171        total_counts[i] = len(risk_set)
172
173        count_event = 0
174        k = idx_exit
175        while k < n_samples and s_time_exit[k] == ti:
176            if event[order_exit[k]]:
177                count_event += 1
178            k += 1
179        event_counts[i] = count_event
180
181    return uniq_times, event_counts, total_counts
182
183
184def _ci_logmlog(prob_survival, sigma_t, z):
185    """Compute the pointwise log-minus-log transformed confidence intervals"""
186    eps = np.finfo(prob_survival.dtype).eps
187    log_p = np.zeros_like(prob_survival)
188    np.log(prob_survival, where=prob_survival > eps, out=log_p)
189    theta = np.zeros_like(prob_survival)
190    np.true_divide(sigma_t, log_p, where=log_p < -eps, out=theta)
191    theta = np.array([[-1], [1]]) * theta * z
192    ci = np.exp(np.exp(theta) * log_p)
193    ci[:, prob_survival <= eps] = 0.0
194    ci[:, 1.0 - prob_survival <= eps] = 1.0
195    return ci
196
197
198def _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type):
199    if conf_type not in {"log-log"}:
200        raise ValueError(
201            f"conf_type must be None or a str among {{'log-log'}}, but was {conf_type!r}")
202
203    if not isinstance(conf_level, numbers.Real) or not np.isfinite(conf_level) or conf_level <= 0 or conf_level >= 1.0:
204        raise ValueError(
205            f"conf_level must be a float in the range (0.0, 1.0), but was {conf_level!r}")
206
207    z = stats.norm.isf((1.0 - conf_level) / 2.0)
208    sigma = np.sqrt(np.cumsum(ratio_var))
209    ci = _ci_logmlog(prob_survival, sigma, z)
210    return ci
211
212
213def kaplan_meier_estimator(
214    event,
215    time_exit,
216    time_enter=None,
217    time_min=None,
218    reverse=False,
219    conf_level=0.95,
220    conf_type=None,
221):
222    """Kaplan-Meier estimator of survival function.
223
224    See [1]_ for further description.
225
226    Parameters
227    ----------
228    event : array-like, shape = (n_samples,)
229        Contains binary event indicators.
230
231    time_exit : array-like, shape = (n_samples,)
232        Contains event/censoring times.
233
234    time_enter : array-like, shape = (n_samples,), optional
235        Contains time when each individual entered the study for
236        left truncated survival data.
237
238    time_min : float, optional
239        Compute estimator conditional on survival at least up to
240        the specified time.
241
242    reverse : bool, optional, default: False
243        Whether to estimate the censoring distribution.
244        When there are ties between times at which events are observed,
245        then events come first and are subtracted from the denominator.
246        Only available for right-censored data, i.e. `time_enter` must
247        be None.
248
249    conf_level : float, optional, default: 0.95
250        The level for a two-sided confidence interval on the survival curves.
251
252    conf_type : None or {'log-log'}, optional, default: None.
253        The type of confidence intervals to estimate.
254        If `None`, no confidence intervals are estimated.
255        If "log-log", estimate confidence intervals using
256        the log hazard or :math:`log(-log(S(t)))` as described in [2]_.
257
258    Returns
259    -------
260    time : array, shape = (n_times,)
261        Unique times.
262
263    prob_survival : array, shape = (n_times,)
264        Survival probability at each unique time point.
265        If `time_enter` is provided, estimates are conditional probabilities.
266
267    conf_int : array, shape = (2, n_times)
268        Pointwise confidence interval of the Kaplan-Meier estimator
269        at each unique time point.
270        Only provided if `conf_type` is not None.
271
272    Examples
273    --------
274    Creating a Kaplan-Meier curve:
275
276    >>> x, y, conf_int = kaplan_meier_estimator(event, time, conf_type="log-log")
277    >>> plt.step(x, y, where="post")
278    >>> plt.fill_between(x, conf_int[0], conf_int[1], alpha=0.25, step="post")
279    >>> plt.ylim(0, 1)
280    >>> plt.show()
281
282    See also
283    --------
284    survivalist.nonparametric.SurvivalFunctionEstimator
285        Estimator API of the Kaplan-Meier estimator.
286
287    References
288    ----------
289    .. [1] Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations",
290           Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958.
291    .. [2] Borgan Ø. and Liestøl K., "A Note on Confidence Intervals and Bands for the
292           Survival Function Based on Transformations", Scandinavian Journal of
293           Statistics. 1990;17(1):35–41.
294    """
295    event, time_enter, time_exit = check_y_survival(
296        event, time_enter, time_exit, allow_all_censored=True)
297    check_consistent_length(event, time_enter, time_exit)
298
299    if conf_type is not None and reverse:
300        raise NotImplementedError(
301            "Confidence intervals of the censoring distribution is not implemented.")
302
303    if time_enter is None:
304        uniq_times, n_events, n_at_risk, n_censored = _compute_counts(
305            event, time_exit)
306
307        if reverse:
308            n_at_risk -= n_events
309            n_events = n_censored
310    else:
311        if reverse:
312            raise ValueError(
313                "The censoring distribution cannot be estimated from left truncated data")
314
315        uniq_times, n_events, n_at_risk = _compute_counts_truncated(
316            event, time_enter, time_exit)
317
318    # account for 0/0 = nan
319    ratio = np.divide(
320        n_events,
321        n_at_risk,
322        out=np.zeros(uniq_times.shape[0], dtype=float),
323        where=n_events != 0,
324    )
325    values = 1.0 - ratio
326
327    if conf_type is not None:
328        ratio_var = np.divide(
329            n_events,
330            n_at_risk * (n_at_risk - n_events),
331            out=np.zeros(uniq_times.shape[0], dtype=float),
332            where=(n_events != 0) & (n_at_risk != n_events),
333        )
334
335    if time_min is not None:
336        mask = uniq_times >= time_min
337        uniq_times = np.compress(mask, uniq_times)
338        values = np.compress(mask, values)
339
340    prob_survival = np.cumprod(values)
341
342    if conf_type is None:
343        return uniq_times, prob_survival
344
345    if time_min is not None:
346        ratio_var = np.compress(mask, ratio_var)
347
348    ci = _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type)
349
350    return uniq_times, prob_survival, ci
351
352
353def nelson_aalen_estimator(event, time):
354    """Nelson-Aalen estimator of cumulative hazard function.
355
356    See [1]_, [2]_ for further description.
357
358    Parameters
359    ----------
360    event : array-like, shape = (n_samples,)
361        Contains binary event indicators.
362
363    time : array-like, shape = (n_samples,)
364        Contains event/censoring times.
365
366    Returns
367    -------
368    time : array, shape = (n_times,)
369        Unique times.
370
371    cum_hazard : array, shape = (n_times,)
372        Cumulative hazard at each unique time point.
373
374    References
375    ----------
376    .. [1] Nelson, W., "Theory and applications of hazard plotting for censored failure data",
377           Technometrics, vol. 14, pp. 945-965, 1972.
378
379    .. [2] Aalen, O. O., "Nonparametric inference for a family of counting processes",
380           Annals of Statistics, vol. 6, pp. 701–726, 1978.
381    """
382    event, time = check_y_survival(event, time)
383    check_consistent_length(event, time)
384    uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time)
385
386    y = np.cumsum(n_events / n_at_risk)
387
388    return uniq_times, y
389
390
391def ipc_weights(event, time):
392    """Compute inverse probability of censoring weights
393
394    Parameters
395    ----------
396    event : array, shape = (n_samples,)
397        Boolean event indicator.
398
399    time : array, shape = (n_samples,)
400        Time when a subject experienced an event or was censored.
401
402    Returns
403    -------
404    weights : array, shape = (n_samples,)
405        inverse probability of censoring weights
406
407    See also
408    --------
409    CensoringDistributionEstimator
410        An estimator interface for estimating inverse probability
411        of censoring weights for unseen time points.
412    """
413    if event.all():
414        return np.ones(time.shape[0])
415
416    unique_time, p = kaplan_meier_estimator(event, time, reverse=True)
417
418    idx = np.searchsorted(unique_time, time[event])
419    Ghat = p[idx]
420
421    assert (Ghat > 0).all()
422
423    weights = np.zeros(time.shape[0])
424    weights[event] = 1.0 / Ghat
425
426    return weights
427
428
429class SurvivalFunctionEstimator(BaseEstimator):
430    """Kaplan–Meier estimate of the survival function.
431
432    Parameters
433    ----------
434    conf_level : float, optional, default: 0.95
435        The level for a two-sided confidence interval on the survival curves.
436
437    conf_type : None or {'log-log'}, optional, default: None.
438        The type of confidence intervals to estimate.
439        If `None`, no confidence intervals are estimated.
440        If "log-log", estimate confidence intervals using
441        the log hazard or :math:`log(-log(S(t)))`.
442
443    See also
444    --------
445    survivalist.nonparametric.kaplan_meier_estimator
446        Functional API of the Kaplan-Meier estimator.
447    """
448
449    _parameter_constraints = {
450        "conf_level": [Interval(numbers.Real, 0.0, 1.0, closed="neither")],
451        "conf_type": [None, StrOptions({"log-log"})],
452    }
453
454    def __init__(self, conf_level=0.95, conf_type=None):
455        self.conf_level = conf_level
456        self.conf_type = conf_type
457
458    def fit(self, y):
459        """Estimate survival distribution from training data.
460
461        Parameters
462        ----------
463        y : structured array, shape = (n_samples,)
464            A structured array containing the binary event indicator
465            as first field, and time of event or time of censoring as
466            second field.
467
468        Returns
469        -------
470        self
471        """
472        self._validate_params()
473        event, time = check_y_survival(y, allow_all_censored=True)
474
475        values = kaplan_meier_estimator(
476            event, time, conf_level=self.conf_level, conf_type=self.conf_type)
477        if self.conf_type is None:
478            unique_time, prob = values
479        else:
480            unique_time, prob, conf_int = values
481            self.conf_int_ = np.column_stack((np.ones((2, 1)), conf_int))
482
483        self.unique_time_ = np.r_[-np.inf, unique_time]
484        self.prob_ = np.r_[1.0, prob]
485
486        return self
487
488    def predict_proba(self, time, return_conf_int=False):
489        """Return probability of an event after given time point.
490
491        :math:`\\hat{S}(t) = P(T > t)`
492
493        Parameters
494        ----------
495        time : array, shape = (n_samples,)
496            Time to estimate probability at.
497
498        return_conf_int : bool, optional, default: False
499            Whether to return the pointwise confidence interval
500            of the survival function.
501            Only available if :meth:`fit()` has been called
502            with the `conf_type` parameter set.
503
504        Returns
505        -------
506        prob : array, shape = (n_samples,)
507            Probability of an event at the passed time points.
508
509        conf_int : array, shape = (2, n_samples)
510            Pointwise confidence interval at the passed time points.
511            Only provided if `return_conf_int` is True.
512        """
513        check_is_fitted(self, "unique_time_")
514        if return_conf_int and not hasattr(self, "conf_int_"):
515            raise ValueError(
516                "If return_conf_int is True, SurvivalFunctionEstimator must be fitted with conf_int != None"
517            )
518
519        time = check_array(time, ensure_2d=False,
520                           estimator=self, input_name="time")
521
522        # K-M is undefined if estimate at last time point is non-zero
523        extends = time > self.unique_time_[-1]
524        if self.prob_[-1] > 0 and extends.any():
525            raise ValueError(
526                f"time must be smaller than largest observed time point: {self.unique_time_[-1]}")
527
528        # beyond last time point is zero probability
529        Shat = np.empty(time.shape, dtype=float)
530        Shat[extends] = 0.0
531
532        valid = ~extends
533        time = time[valid]
534        idx = np.searchsorted(self.unique_time_, time)
535        # for non-exact matches, we need to shift the index to left
536        eps = np.finfo(self.unique_time_.dtype).eps
537        exact = np.absolute(self.unique_time_[idx] - time) < eps
538        idx[~exact] -= 1
539        Shat[valid] = self.prob_[idx]
540
541        if not return_conf_int:
542            return Shat
543
544        ci = np.empty((2, time.shape[0]), dtype=float)
545        ci[:, extends] = np.nan
546        ci[:, valid] = self.conf_int_[:, idx]
547        return Shat, ci
548
549
550class CensoringDistributionEstimator(SurvivalFunctionEstimator):
551    """Kaplan–Meier estimator for the censoring distribution."""
552
553    def fit(self, y):
554        """Estimate censoring distribution from training data.
555
556        Parameters
557        ----------
558        y : structured array, shape = (n_samples,)
559            A structured array containing the binary event indicator
560            as first field, and time of event or time of censoring as
561            second field.
562
563        Returns
564        -------
565        self
566        """
567        event, time = check_y_survival(y)
568        if event.all():
569            self.unique_time_ = np.unique(time)
570            self.prob_ = np.ones(self.unique_time_.shape[0])
571        else:
572            unique_time, prob = kaplan_meier_estimator(
573                event, time, reverse=True)
574            self.unique_time_ = np.r_[-np.inf, unique_time]
575            self.prob_ = np.r_[1.0, prob]
576
577        return self
578
579    def predict_ipcw(self, y):
580        """Return inverse probability of censoring weights at given time points.
581
582        :math:`\\omega_i = \\delta_i / \\hat{G}(y_i)`
583
584        Parameters
585        ----------
586        y : structured array, shape = (n_samples,)
587            A structured array containing the binary event indicator
588            as first field, and time of event or time of censoring as
589            second field.
590
591        Returns
592        -------
593        ipcw : array, shape = (n_samples,)
594            Inverse probability of censoring weights.
595        """
596        event, time = check_y_survival(y)
597        Ghat = self.predict_proba(time[event])
598
599        if (Ghat == 0.0).any():
600            raise ValueError(
601                "censoring survival function is zero at one or more time points")
602
603        weights = np.zeros(time.shape[0])
604        weights[event] = 1.0 / Ghat
605
606        return weights
class CensoringDistributionEstimator(SurvivalFunctionEstimator):
551class CensoringDistributionEstimator(SurvivalFunctionEstimator):
552    """Kaplan–Meier estimator for the censoring distribution."""
553
554    def fit(self, y):
555        """Estimate censoring distribution from training data.
556
557        Parameters
558        ----------
559        y : structured array, shape = (n_samples,)
560            A structured array containing the binary event indicator
561            as first field, and time of event or time of censoring as
562            second field.
563
564        Returns
565        -------
566        self
567        """
568        event, time = check_y_survival(y)
569        if event.all():
570            self.unique_time_ = np.unique(time)
571            self.prob_ = np.ones(self.unique_time_.shape[0])
572        else:
573            unique_time, prob = kaplan_meier_estimator(
574                event, time, reverse=True)
575            self.unique_time_ = np.r_[-np.inf, unique_time]
576            self.prob_ = np.r_[1.0, prob]
577
578        return self
579
580    def predict_ipcw(self, y):
581        """Return inverse probability of censoring weights at given time points.
582
583        :math:`\\omega_i = \\delta_i / \\hat{G}(y_i)`
584
585        Parameters
586        ----------
587        y : structured array, shape = (n_samples,)
588            A structured array containing the binary event indicator
589            as first field, and time of event or time of censoring as
590            second field.
591
592        Returns
593        -------
594        ipcw : array, shape = (n_samples,)
595            Inverse probability of censoring weights.
596        """
597        event, time = check_y_survival(y)
598        Ghat = self.predict_proba(time[event])
599
600        if (Ghat == 0.0).any():
601            raise ValueError(
602                "censoring survival function is zero at one or more time points")
603
604        weights = np.zeros(time.shape[0])
605        weights[event] = 1.0 / Ghat
606
607        return weights

Kaplan–Meier estimator for the censoring distribution.

def fit(self, y):
554    def fit(self, y):
555        """Estimate censoring distribution from training data.
556
557        Parameters
558        ----------
559        y : structured array, shape = (n_samples,)
560            A structured array containing the binary event indicator
561            as first field, and time of event or time of censoring as
562            second field.
563
564        Returns
565        -------
566        self
567        """
568        event, time = check_y_survival(y)
569        if event.all():
570            self.unique_time_ = np.unique(time)
571            self.prob_ = np.ones(self.unique_time_.shape[0])
572        else:
573            unique_time, prob = kaplan_meier_estimator(
574                event, time, reverse=True)
575            self.unique_time_ = np.r_[-np.inf, unique_time]
576            self.prob_ = np.r_[1.0, prob]
577
578        return self

Estimate censoring distribution from training data.

Parameters

y : structured array, shape = (n_samples,) A structured array containing the binary event indicator as first field, and time of event or time of censoring as second field.

Returns

self

def kaplan_meier_estimator( event, time_exit, time_enter=None, time_min=None, reverse=False, conf_level=0.95, conf_type=None):
214def kaplan_meier_estimator(
215    event,
216    time_exit,
217    time_enter=None,
218    time_min=None,
219    reverse=False,
220    conf_level=0.95,
221    conf_type=None,
222):
223    """Kaplan-Meier estimator of survival function.
224
225    See [1]_ for further description.
226
227    Parameters
228    ----------
229    event : array-like, shape = (n_samples,)
230        Contains binary event indicators.
231
232    time_exit : array-like, shape = (n_samples,)
233        Contains event/censoring times.
234
235    time_enter : array-like, shape = (n_samples,), optional
236        Contains time when each individual entered the study for
237        left truncated survival data.
238
239    time_min : float, optional
240        Compute estimator conditional on survival at least up to
241        the specified time.
242
243    reverse : bool, optional, default: False
244        Whether to estimate the censoring distribution.
245        When there are ties between times at which events are observed,
246        then events come first and are subtracted from the denominator.
247        Only available for right-censored data, i.e. `time_enter` must
248        be None.
249
250    conf_level : float, optional, default: 0.95
251        The level for a two-sided confidence interval on the survival curves.
252
253    conf_type : None or {'log-log'}, optional, default: None.
254        The type of confidence intervals to estimate.
255        If `None`, no confidence intervals are estimated.
256        If "log-log", estimate confidence intervals using
257        the log hazard or :math:`log(-log(S(t)))` as described in [2]_.
258
259    Returns
260    -------
261    time : array, shape = (n_times,)
262        Unique times.
263
264    prob_survival : array, shape = (n_times,)
265        Survival probability at each unique time point.
266        If `time_enter` is provided, estimates are conditional probabilities.
267
268    conf_int : array, shape = (2, n_times)
269        Pointwise confidence interval of the Kaplan-Meier estimator
270        at each unique time point.
271        Only provided if `conf_type` is not None.
272
273    Examples
274    --------
275    Creating a Kaplan-Meier curve:
276
277    >>> x, y, conf_int = kaplan_meier_estimator(event, time, conf_type="log-log")
278    >>> plt.step(x, y, where="post")
279    >>> plt.fill_between(x, conf_int[0], conf_int[1], alpha=0.25, step="post")
280    >>> plt.ylim(0, 1)
281    >>> plt.show()
282
283    See also
284    --------
285    survivalist.nonparametric.SurvivalFunctionEstimator
286        Estimator API of the Kaplan-Meier estimator.
287
288    References
289    ----------
290    .. [1] Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations",
291           Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958.
292    .. [2] Borgan Ø. and Liestøl K., "A Note on Confidence Intervals and Bands for the
293           Survival Function Based on Transformations", Scandinavian Journal of
294           Statistics. 1990;17(1):35–41.
295    """
296    event, time_enter, time_exit = check_y_survival(
297        event, time_enter, time_exit, allow_all_censored=True)
298    check_consistent_length(event, time_enter, time_exit)
299
300    if conf_type is not None and reverse:
301        raise NotImplementedError(
302            "Confidence intervals of the censoring distribution is not implemented.")
303
304    if time_enter is None:
305        uniq_times, n_events, n_at_risk, n_censored = _compute_counts(
306            event, time_exit)
307
308        if reverse:
309            n_at_risk -= n_events
310            n_events = n_censored
311    else:
312        if reverse:
313            raise ValueError(
314                "The censoring distribution cannot be estimated from left truncated data")
315
316        uniq_times, n_events, n_at_risk = _compute_counts_truncated(
317            event, time_enter, time_exit)
318
319    # account for 0/0 = nan
320    ratio = np.divide(
321        n_events,
322        n_at_risk,
323        out=np.zeros(uniq_times.shape[0], dtype=float),
324        where=n_events != 0,
325    )
326    values = 1.0 - ratio
327
328    if conf_type is not None:
329        ratio_var = np.divide(
330            n_events,
331            n_at_risk * (n_at_risk - n_events),
332            out=np.zeros(uniq_times.shape[0], dtype=float),
333            where=(n_events != 0) & (n_at_risk != n_events),
334        )
335
336    if time_min is not None:
337        mask = uniq_times >= time_min
338        uniq_times = np.compress(mask, uniq_times)
339        values = np.compress(mask, values)
340
341    prob_survival = np.cumprod(values)
342
343    if conf_type is None:
344        return uniq_times, prob_survival
345
346    if time_min is not None:
347        ratio_var = np.compress(mask, ratio_var)
348
349    ci = _km_ci_estimator(prob_survival, ratio_var, conf_level, conf_type)
350
351    return uniq_times, prob_survival, ci

Kaplan-Meier estimator of survival function.

See 1 for further description.

Parameters

event : array-like, shape = (n_samples,) Contains binary event indicators.

time_exit : array-like, shape = (n_samples,) Contains event/censoring times.

time_enter : array-like, shape = (n_samples,), optional Contains time when each individual entered the study for left truncated survival data.

time_min : float, optional Compute estimator conditional on survival at least up to the specified time.

reverse : bool, optional, default: False Whether to estimate the censoring distribution. When there are ties between times at which events are observed, then events come first and are subtracted from the denominator. Only available for right-censored data, i.e. time_enter must be None.

conf_level : float, optional, default: 0.95 The level for a two-sided confidence interval on the survival curves.

conf_type : None or {'log-log'}, optional, default: None. The type of confidence intervals to estimate. If None, no confidence intervals are estimated. If "log-log", estimate confidence intervals using the log hazard or \( log(-log(S(t))) \) as described in 2.

Returns

time : array, shape = (n_times,) Unique times.

prob_survival : array, shape = (n_times,) Survival probability at each unique time point. If time_enter is provided, estimates are conditional probabilities.

conf_int : array, shape = (2, n_times) Pointwise confidence interval of the Kaplan-Meier estimator at each unique time point. Only provided if conf_type is not None.

Examples

Creating a Kaplan-Meier curve:

>>> x, y, conf_int = kaplan_meier_estimator(event, time, conf_type="log-log")
>>> plt.step(x, y, where="post")
>>> plt.fill_between(x, conf_int[0], conf_int[1], alpha=0.25, step="post")
>>> plt.ylim(0, 1)
>>> plt.show()

See also

SurvivalFunctionEstimator Estimator API of the Kaplan-Meier estimator.

References


  1. Kaplan, E. L. and Meier, P., "Nonparametric estimation from incomplete observations", Journal of The American Statistical Association, vol. 53, pp. 457-481, 1958. 

  2. Borgan Ø. and Liestøl K., "A Note on Confidence Intervals and Bands for the Survival Function Based on Transformations", Scandinavian Journal of Statistics. 1990;17(1):35–41. 

def nelson_aalen_estimator(event, time):
354def nelson_aalen_estimator(event, time):
355    """Nelson-Aalen estimator of cumulative hazard function.
356
357    See [1]_, [2]_ for further description.
358
359    Parameters
360    ----------
361    event : array-like, shape = (n_samples,)
362        Contains binary event indicators.
363
364    time : array-like, shape = (n_samples,)
365        Contains event/censoring times.
366
367    Returns
368    -------
369    time : array, shape = (n_times,)
370        Unique times.
371
372    cum_hazard : array, shape = (n_times,)
373        Cumulative hazard at each unique time point.
374
375    References
376    ----------
377    .. [1] Nelson, W., "Theory and applications of hazard plotting for censored failure data",
378           Technometrics, vol. 14, pp. 945-965, 1972.
379
380    .. [2] Aalen, O. O., "Nonparametric inference for a family of counting processes",
381           Annals of Statistics, vol. 6, pp. 701–726, 1978.
382    """
383    event, time = check_y_survival(event, time)
384    check_consistent_length(event, time)
385    uniq_times, n_events, n_at_risk, _ = _compute_counts(event, time)
386
387    y = np.cumsum(n_events / n_at_risk)
388
389    return uniq_times, y

Nelson-Aalen estimator of cumulative hazard function.

See 1, 2 for further description.

Parameters

event : array-like, shape = (n_samples,) Contains binary event indicators.

time : array-like, shape = (n_samples,) Contains event/censoring times.

Returns

time : array, shape = (n_times,) Unique times.

cum_hazard : array, shape = (n_times,) Cumulative hazard at each unique time point.

References


  1. Nelson, W., "Theory and applications of hazard plotting for censored failure data", Technometrics, vol. 14, pp. 945-965, 1972. 

  2. Aalen, O. O., "Nonparametric inference for a family of counting processes", Annals of Statistics, vol. 6, pp. 701–726, 1978. 

def ipc_weights(event, time):
392def ipc_weights(event, time):
393    """Compute inverse probability of censoring weights
394
395    Parameters
396    ----------
397    event : array, shape = (n_samples,)
398        Boolean event indicator.
399
400    time : array, shape = (n_samples,)
401        Time when a subject experienced an event or was censored.
402
403    Returns
404    -------
405    weights : array, shape = (n_samples,)
406        inverse probability of censoring weights
407
408    See also
409    --------
410    CensoringDistributionEstimator
411        An estimator interface for estimating inverse probability
412        of censoring weights for unseen time points.
413    """
414    if event.all():
415        return np.ones(time.shape[0])
416
417    unique_time, p = kaplan_meier_estimator(event, time, reverse=True)
418
419    idx = np.searchsorted(unique_time, time[event])
420    Ghat = p[idx]
421
422    assert (Ghat > 0).all()
423
424    weights = np.zeros(time.shape[0])
425    weights[event] = 1.0 / Ghat
426
427    return weights

Compute inverse probability of censoring weights

Parameters

event : array, shape = (n_samples,) Boolean event indicator.

time : array, shape = (n_samples,) Time when a subject experienced an event or was censored.

Returns

weights : array, shape = (n_samples,) inverse probability of censoring weights

See also

CensoringDistributionEstimator An estimator interface for estimating inverse probability of censoring weights for unseen time points.

class SurvivalFunctionEstimator(sklearn.base.BaseEstimator):
430class SurvivalFunctionEstimator(BaseEstimator):
431    """Kaplan–Meier estimate of the survival function.
432
433    Parameters
434    ----------
435    conf_level : float, optional, default: 0.95
436        The level for a two-sided confidence interval on the survival curves.
437
438    conf_type : None or {'log-log'}, optional, default: None.
439        The type of confidence intervals to estimate.
440        If `None`, no confidence intervals are estimated.
441        If "log-log", estimate confidence intervals using
442        the log hazard or :math:`log(-log(S(t)))`.
443
444    See also
445    --------
446    survivalist.nonparametric.kaplan_meier_estimator
447        Functional API of the Kaplan-Meier estimator.
448    """
449
450    _parameter_constraints = {
451        "conf_level": [Interval(numbers.Real, 0.0, 1.0, closed="neither")],
452        "conf_type": [None, StrOptions({"log-log"})],
453    }
454
455    def __init__(self, conf_level=0.95, conf_type=None):
456        self.conf_level = conf_level
457        self.conf_type = conf_type
458
459    def fit(self, y):
460        """Estimate survival distribution from training data.
461
462        Parameters
463        ----------
464        y : structured array, shape = (n_samples,)
465            A structured array containing the binary event indicator
466            as first field, and time of event or time of censoring as
467            second field.
468
469        Returns
470        -------
471        self
472        """
473        self._validate_params()
474        event, time = check_y_survival(y, allow_all_censored=True)
475
476        values = kaplan_meier_estimator(
477            event, time, conf_level=self.conf_level, conf_type=self.conf_type)
478        if self.conf_type is None:
479            unique_time, prob = values
480        else:
481            unique_time, prob, conf_int = values
482            self.conf_int_ = np.column_stack((np.ones((2, 1)), conf_int))
483
484        self.unique_time_ = np.r_[-np.inf, unique_time]
485        self.prob_ = np.r_[1.0, prob]
486
487        return self
488
489    def predict_proba(self, time, return_conf_int=False):
490        """Return probability of an event after given time point.
491
492        :math:`\\hat{S}(t) = P(T > t)`
493
494        Parameters
495        ----------
496        time : array, shape = (n_samples,)
497            Time to estimate probability at.
498
499        return_conf_int : bool, optional, default: False
500            Whether to return the pointwise confidence interval
501            of the survival function.
502            Only available if :meth:`fit()` has been called
503            with the `conf_type` parameter set.
504
505        Returns
506        -------
507        prob : array, shape = (n_samples,)
508            Probability of an event at the passed time points.
509
510        conf_int : array, shape = (2, n_samples)
511            Pointwise confidence interval at the passed time points.
512            Only provided if `return_conf_int` is True.
513        """
514        check_is_fitted(self, "unique_time_")
515        if return_conf_int and not hasattr(self, "conf_int_"):
516            raise ValueError(
517                "If return_conf_int is True, SurvivalFunctionEstimator must be fitted with conf_int != None"
518            )
519
520        time = check_array(time, ensure_2d=False,
521                           estimator=self, input_name="time")
522
523        # K-M is undefined if estimate at last time point is non-zero
524        extends = time > self.unique_time_[-1]
525        if self.prob_[-1] > 0 and extends.any():
526            raise ValueError(
527                f"time must be smaller than largest observed time point: {self.unique_time_[-1]}")
528
529        # beyond last time point is zero probability
530        Shat = np.empty(time.shape, dtype=float)
531        Shat[extends] = 0.0
532
533        valid = ~extends
534        time = time[valid]
535        idx = np.searchsorted(self.unique_time_, time)
536        # for non-exact matches, we need to shift the index to left
537        eps = np.finfo(self.unique_time_.dtype).eps
538        exact = np.absolute(self.unique_time_[idx] - time) < eps
539        idx[~exact] -= 1
540        Shat[valid] = self.prob_[idx]
541
542        if not return_conf_int:
543            return Shat
544
545        ci = np.empty((2, time.shape[0]), dtype=float)
546        ci[:, extends] = np.nan
547        ci[:, valid] = self.conf_int_[:, idx]
548        return Shat, ci

Kaplan–Meier estimate of the survival function.

Parameters

conf_level : float, optional, default: 0.95 The level for a two-sided confidence interval on the survival curves.

conf_type : None or {'log-log'}, optional, default: None. The type of confidence intervals to estimate. If None, no confidence intervals are estimated. If "log-log", estimate confidence intervals using the log hazard or \( log(-log(S(t))) \).

See also

kaplan_meier_estimator Functional API of the Kaplan-Meier estimator.

def fit(self, y):
459    def fit(self, y):
460        """Estimate survival distribution from training data.
461
462        Parameters
463        ----------
464        y : structured array, shape = (n_samples,)
465            A structured array containing the binary event indicator
466            as first field, and time of event or time of censoring as
467            second field.
468
469        Returns
470        -------
471        self
472        """
473        self._validate_params()
474        event, time = check_y_survival(y, allow_all_censored=True)
475
476        values = kaplan_meier_estimator(
477            event, time, conf_level=self.conf_level, conf_type=self.conf_type)
478        if self.conf_type is None:
479            unique_time, prob = values
480        else:
481            unique_time, prob, conf_int = values
482            self.conf_int_ = np.column_stack((np.ones((2, 1)), conf_int))
483
484        self.unique_time_ = np.r_[-np.inf, unique_time]
485        self.prob_ = np.r_[1.0, prob]
486
487        return self

Estimate survival distribution from training data.

Parameters

y : structured array, shape = (n_samples,) A structured array containing the binary event indicator as first field, and time of event or time of censoring as second field.

Returns

self

def predict_proba(self, time, return_conf_int=False):
489    def predict_proba(self, time, return_conf_int=False):
490        """Return probability of an event after given time point.
491
492        :math:`\\hat{S}(t) = P(T > t)`
493
494        Parameters
495        ----------
496        time : array, shape = (n_samples,)
497            Time to estimate probability at.
498
499        return_conf_int : bool, optional, default: False
500            Whether to return the pointwise confidence interval
501            of the survival function.
502            Only available if :meth:`fit()` has been called
503            with the `conf_type` parameter set.
504
505        Returns
506        -------
507        prob : array, shape = (n_samples,)
508            Probability of an event at the passed time points.
509
510        conf_int : array, shape = (2, n_samples)
511            Pointwise confidence interval at the passed time points.
512            Only provided if `return_conf_int` is True.
513        """
514        check_is_fitted(self, "unique_time_")
515        if return_conf_int and not hasattr(self, "conf_int_"):
516            raise ValueError(
517                "If return_conf_int is True, SurvivalFunctionEstimator must be fitted with conf_int != None"
518            )
519
520        time = check_array(time, ensure_2d=False,
521                           estimator=self, input_name="time")
522
523        # K-M is undefined if estimate at last time point is non-zero
524        extends = time > self.unique_time_[-1]
525        if self.prob_[-1] > 0 and extends.any():
526            raise ValueError(
527                f"time must be smaller than largest observed time point: {self.unique_time_[-1]}")
528
529        # beyond last time point is zero probability
530        Shat = np.empty(time.shape, dtype=float)
531        Shat[extends] = 0.0
532
533        valid = ~extends
534        time = time[valid]
535        idx = np.searchsorted(self.unique_time_, time)
536        # for non-exact matches, we need to shift the index to left
537        eps = np.finfo(self.unique_time_.dtype).eps
538        exact = np.absolute(self.unique_time_[idx] - time) < eps
539        idx[~exact] -= 1
540        Shat[valid] = self.prob_[idx]
541
542        if not return_conf_int:
543            return Shat
544
545        ci = np.empty((2, time.shape[0]), dtype=float)
546        ci[:, extends] = np.nan
547        ci[:, valid] = self.conf_int_[:, idx]
548        return Shat, ci

Return probability of an event after given time point.

\( \hat{S}(t) = P(T > t) \)

Parameters

time : array, shape = (n_samples,) Time to estimate probability at.

return_conf_int : bool, optional, default: False Whether to return the pointwise confidence interval of the survival function. Only available if fit()() has been called with the conf_type parameter set.

Returns

prob : array, shape = (n_samples,) Probability of an event at the passed time points.

conf_int : array, shape = (2, n_samples) Pointwise confidence interval at the passed time points. Only provided if return_conf_int is True.