survivalist.compare

  1from collections import OrderedDict
  2
  3import numpy as np
  4import pandas as pd
  5from scipy import stats
  6from sklearn.utils.validation import check_array
  7
  8from .util import check_array_survival
  9
 10__all__ = ["compare_survival"]
 11
 12
 13def compare_survival(y, group_indicator, return_stats=False):
 14    """K-sample log-rank hypothesis test of identical survival functions.
 15
 16    Compares the pooled hazard rate with each group-specific
 17    hazard rate. The alternative hypothesis is that the hazard
 18    rate of at least one group differs from the others at some time.
 19
 20    See [1]_ for more details.
 21
 22    Parameters
 23    ----------
 24    y : structured array, shape = (n_samples,)
 25        A structured array containing the binary event indicator
 26        as first field, and time of event or time of censoring as
 27        second field.
 28
 29    group_indicator : array-like, shape = (n_samples,)
 30        Group membership of each sample.
 31
 32    return_stats : bool, optional, default: False
 33        Whether to return a data frame with statistics for each group
 34        and the covariance matrix of the test statistic.
 35
 36    Returns
 37    -------
 38    chisq : float
 39        Test statistic.
 40    pvalue : float
 41        Two-sided p-value with respect to the null hypothesis
 42        that the hazard rates across all groups are equal.
 43    stats : pandas.DataFrame
 44        Summary statistics for each group:  number of samples,
 45        observed number of events, expected number of events,
 46        and test statistic.
 47        Only provided if `return_stats` is True.
 48    covariance : array, shape=(n_groups, n_groups)
 49        Covariance matrix of the test statistic.
 50        Only provided if `return_stats` is True.
 51
 52    References
 53    ----------
 54    .. [1] Fleming, T. R. and Harrington, D. P.
 55           A Class of Hypothesis Tests for One and Two Samples of Censored Survival Data.
 56           Communications In Statistics 10 (1981): 763-794.
 57    """
 58
 59    event, time = check_array_survival(group_indicator, y)
 60    group_indicator = check_array(
 61        group_indicator,
 62        dtype="O",
 63        ensure_2d=False,
 64        estimator="compare_survival",
 65        input_name="group_indicator",
 66    )
 67
 68    n_samples = time.shape[0]
 69    groups, group_counts = np.unique(group_indicator, return_counts=True)
 70    n_groups = groups.shape[0]
 71    if n_groups == 1:
 72        raise ValueError(
 73            "At least two groups must be specified, but only one was provided.")
 74
 75    # sort descending
 76    o = np.argsort(-time, kind="mergesort")
 77    x = group_indicator[o]
 78    event = event[o]
 79    time = time[o]
 80
 81    at_risk = np.zeros(n_groups, dtype=int)
 82    observed = np.zeros(n_groups, dtype=int)
 83    expected = np.zeros(n_groups, dtype=float)
 84    covar = np.zeros((n_groups, n_groups), dtype=float)
 85
 86    covar_indices = np.diag_indices(n_groups)
 87
 88    k = 0
 89    while k < n_samples:
 90        ti = time[k]
 91        total_events = 0
 92        while k < n_samples and ti == time[k]:
 93            idx = np.searchsorted(groups, x[k])
 94            if event[k]:
 95                observed[idx] += 1
 96                total_events += 1
 97            at_risk[idx] += 1
 98            k += 1
 99
100        if total_events != 0:
101            total_at_risk = k
102            expected += at_risk * (total_events / total_at_risk)
103            if total_at_risk > 1:
104                multiplier = total_events * \
105                    (total_at_risk - total_events) / \
106                    (total_at_risk * (total_at_risk - 1))
107                temp = at_risk * multiplier
108                covar[covar_indices] += temp
109                covar -= np.outer(temp, at_risk) / total_at_risk
110
111    df = n_groups - 1
112    zz = observed[:df] - expected[:df]
113    chisq = np.linalg.solve(covar[:df, :df], zz).dot(zz)
114    pval = stats.chi2.sf(chisq, df)
115
116    if return_stats:
117        table = OrderedDict()
118        table["counts"] = group_counts
119        table["observed"] = observed
120        table["expected"] = expected
121        table["statistic"] = observed - expected
122        table = pd.DataFrame.from_dict(table)
123        table.index = pd.Index(groups, name="group", dtype=groups.dtype)
124        return chisq, pval, table, covar
125
126    return chisq, pval
def compare_survival(y, group_indicator, return_stats=False):
 14def compare_survival(y, group_indicator, return_stats=False):
 15    """K-sample log-rank hypothesis test of identical survival functions.
 16
 17    Compares the pooled hazard rate with each group-specific
 18    hazard rate. The alternative hypothesis is that the hazard
 19    rate of at least one group differs from the others at some time.
 20
 21    See [1]_ for more details.
 22
 23    Parameters
 24    ----------
 25    y : structured array, shape = (n_samples,)
 26        A structured array containing the binary event indicator
 27        as first field, and time of event or time of censoring as
 28        second field.
 29
 30    group_indicator : array-like, shape = (n_samples,)
 31        Group membership of each sample.
 32
 33    return_stats : bool, optional, default: False
 34        Whether to return a data frame with statistics for each group
 35        and the covariance matrix of the test statistic.
 36
 37    Returns
 38    -------
 39    chisq : float
 40        Test statistic.
 41    pvalue : float
 42        Two-sided p-value with respect to the null hypothesis
 43        that the hazard rates across all groups are equal.
 44    stats : pandas.DataFrame
 45        Summary statistics for each group:  number of samples,
 46        observed number of events, expected number of events,
 47        and test statistic.
 48        Only provided if `return_stats` is True.
 49    covariance : array, shape=(n_groups, n_groups)
 50        Covariance matrix of the test statistic.
 51        Only provided if `return_stats` is True.
 52
 53    References
 54    ----------
 55    .. [1] Fleming, T. R. and Harrington, D. P.
 56           A Class of Hypothesis Tests for One and Two Samples of Censored Survival Data.
 57           Communications In Statistics 10 (1981): 763-794.
 58    """
 59
 60    event, time = check_array_survival(group_indicator, y)
 61    group_indicator = check_array(
 62        group_indicator,
 63        dtype="O",
 64        ensure_2d=False,
 65        estimator="compare_survival",
 66        input_name="group_indicator",
 67    )
 68
 69    n_samples = time.shape[0]
 70    groups, group_counts = np.unique(group_indicator, return_counts=True)
 71    n_groups = groups.shape[0]
 72    if n_groups == 1:
 73        raise ValueError(
 74            "At least two groups must be specified, but only one was provided.")
 75
 76    # sort descending
 77    o = np.argsort(-time, kind="mergesort")
 78    x = group_indicator[o]
 79    event = event[o]
 80    time = time[o]
 81
 82    at_risk = np.zeros(n_groups, dtype=int)
 83    observed = np.zeros(n_groups, dtype=int)
 84    expected = np.zeros(n_groups, dtype=float)
 85    covar = np.zeros((n_groups, n_groups), dtype=float)
 86
 87    covar_indices = np.diag_indices(n_groups)
 88
 89    k = 0
 90    while k < n_samples:
 91        ti = time[k]
 92        total_events = 0
 93        while k < n_samples and ti == time[k]:
 94            idx = np.searchsorted(groups, x[k])
 95            if event[k]:
 96                observed[idx] += 1
 97                total_events += 1
 98            at_risk[idx] += 1
 99            k += 1
100
101        if total_events != 0:
102            total_at_risk = k
103            expected += at_risk * (total_events / total_at_risk)
104            if total_at_risk > 1:
105                multiplier = total_events * \
106                    (total_at_risk - total_events) / \
107                    (total_at_risk * (total_at_risk - 1))
108                temp = at_risk * multiplier
109                covar[covar_indices] += temp
110                covar -= np.outer(temp, at_risk) / total_at_risk
111
112    df = n_groups - 1
113    zz = observed[:df] - expected[:df]
114    chisq = np.linalg.solve(covar[:df, :df], zz).dot(zz)
115    pval = stats.chi2.sf(chisq, df)
116
117    if return_stats:
118        table = OrderedDict()
119        table["counts"] = group_counts
120        table["observed"] = observed
121        table["expected"] = expected
122        table["statistic"] = observed - expected
123        table = pd.DataFrame.from_dict(table)
124        table.index = pd.Index(groups, name="group", dtype=groups.dtype)
125        return chisq, pval, table, covar
126
127    return chisq, pval

K-sample log-rank hypothesis test of identical survival functions.

Compares the pooled hazard rate with each group-specific hazard rate. The alternative hypothesis is that the hazard rate of at least one group differs from the others at some time.

See 1 for more details.

Parameters

y : structured array, shape = (n_samples,) A structured array containing the binary event indicator as first field, and time of event or time of censoring as second field.

group_indicator : array-like, shape = (n_samples,) Group membership of each sample.

return_stats : bool, optional, default: False Whether to return a data frame with statistics for each group and the covariance matrix of the test statistic.

Returns

chisq : float Test statistic. pvalue : float Two-sided p-value with respect to the null hypothesis that the hazard rates across all groups are equal. stats : pandas.DataFrame Summary statistics for each group: number of samples, observed number of events, expected number of events, and test statistic. Only provided if return_stats is True. covariance : array, shape=(n_groups, n_groups) Covariance matrix of the test statistic. Only provided if return_stats is True.

References


  1. Fleming, T. R. and Harrington, D. P. A Class of Hypothesis Tests for One and Two Samples of Censored Survival Data. Communications In Statistics 10 (1981): 763-794.