survivalist.compare
1from collections import OrderedDict 2 3import numpy as np 4import pandas as pd 5from scipy import stats 6from sklearn.utils.validation import check_array 7 8from .util import check_array_survival 9 10__all__ = ["compare_survival"] 11 12 13def compare_survival(y, group_indicator, return_stats=False): 14 """K-sample log-rank hypothesis test of identical survival functions. 15 16 Compares the pooled hazard rate with each group-specific 17 hazard rate. The alternative hypothesis is that the hazard 18 rate of at least one group differs from the others at some time. 19 20 See [1]_ for more details. 21 22 Parameters 23 ---------- 24 y : structured array, shape = (n_samples,) 25 A structured array containing the binary event indicator 26 as first field, and time of event or time of censoring as 27 second field. 28 29 group_indicator : array-like, shape = (n_samples,) 30 Group membership of each sample. 31 32 return_stats : bool, optional, default: False 33 Whether to return a data frame with statistics for each group 34 and the covariance matrix of the test statistic. 35 36 Returns 37 ------- 38 chisq : float 39 Test statistic. 40 pvalue : float 41 Two-sided p-value with respect to the null hypothesis 42 that the hazard rates across all groups are equal. 43 stats : pandas.DataFrame 44 Summary statistics for each group: number of samples, 45 observed number of events, expected number of events, 46 and test statistic. 47 Only provided if `return_stats` is True. 48 covariance : array, shape=(n_groups, n_groups) 49 Covariance matrix of the test statistic. 50 Only provided if `return_stats` is True. 51 52 References 53 ---------- 54 .. [1] Fleming, T. R. and Harrington, D. P. 55 A Class of Hypothesis Tests for One and Two Samples of Censored Survival Data. 56 Communications In Statistics 10 (1981): 763-794. 57 """ 58 59 event, time = check_array_survival(group_indicator, y) 60 group_indicator = check_array( 61 group_indicator, 62 dtype="O", 63 ensure_2d=False, 64 estimator="compare_survival", 65 input_name="group_indicator", 66 ) 67 68 n_samples = time.shape[0] 69 groups, group_counts = np.unique(group_indicator, return_counts=True) 70 n_groups = groups.shape[0] 71 if n_groups == 1: 72 raise ValueError( 73 "At least two groups must be specified, but only one was provided.") 74 75 # sort descending 76 o = np.argsort(-time, kind="mergesort") 77 x = group_indicator[o] 78 event = event[o] 79 time = time[o] 80 81 at_risk = np.zeros(n_groups, dtype=int) 82 observed = np.zeros(n_groups, dtype=int) 83 expected = np.zeros(n_groups, dtype=float) 84 covar = np.zeros((n_groups, n_groups), dtype=float) 85 86 covar_indices = np.diag_indices(n_groups) 87 88 k = 0 89 while k < n_samples: 90 ti = time[k] 91 total_events = 0 92 while k < n_samples and ti == time[k]: 93 idx = np.searchsorted(groups, x[k]) 94 if event[k]: 95 observed[idx] += 1 96 total_events += 1 97 at_risk[idx] += 1 98 k += 1 99 100 if total_events != 0: 101 total_at_risk = k 102 expected += at_risk * (total_events / total_at_risk) 103 if total_at_risk > 1: 104 multiplier = total_events * \ 105 (total_at_risk - total_events) / \ 106 (total_at_risk * (total_at_risk - 1)) 107 temp = at_risk * multiplier 108 covar[covar_indices] += temp 109 covar -= np.outer(temp, at_risk) / total_at_risk 110 111 df = n_groups - 1 112 zz = observed[:df] - expected[:df] 113 chisq = np.linalg.solve(covar[:df, :df], zz).dot(zz) 114 pval = stats.chi2.sf(chisq, df) 115 116 if return_stats: 117 table = OrderedDict() 118 table["counts"] = group_counts 119 table["observed"] = observed 120 table["expected"] = expected 121 table["statistic"] = observed - expected 122 table = pd.DataFrame.from_dict(table) 123 table.index = pd.Index(groups, name="group", dtype=groups.dtype) 124 return chisq, pval, table, covar 125 126 return chisq, pval
14def compare_survival(y, group_indicator, return_stats=False): 15 """K-sample log-rank hypothesis test of identical survival functions. 16 17 Compares the pooled hazard rate with each group-specific 18 hazard rate. The alternative hypothesis is that the hazard 19 rate of at least one group differs from the others at some time. 20 21 See [1]_ for more details. 22 23 Parameters 24 ---------- 25 y : structured array, shape = (n_samples,) 26 A structured array containing the binary event indicator 27 as first field, and time of event or time of censoring as 28 second field. 29 30 group_indicator : array-like, shape = (n_samples,) 31 Group membership of each sample. 32 33 return_stats : bool, optional, default: False 34 Whether to return a data frame with statistics for each group 35 and the covariance matrix of the test statistic. 36 37 Returns 38 ------- 39 chisq : float 40 Test statistic. 41 pvalue : float 42 Two-sided p-value with respect to the null hypothesis 43 that the hazard rates across all groups are equal. 44 stats : pandas.DataFrame 45 Summary statistics for each group: number of samples, 46 observed number of events, expected number of events, 47 and test statistic. 48 Only provided if `return_stats` is True. 49 covariance : array, shape=(n_groups, n_groups) 50 Covariance matrix of the test statistic. 51 Only provided if `return_stats` is True. 52 53 References 54 ---------- 55 .. [1] Fleming, T. R. and Harrington, D. P. 56 A Class of Hypothesis Tests for One and Two Samples of Censored Survival Data. 57 Communications In Statistics 10 (1981): 763-794. 58 """ 59 60 event, time = check_array_survival(group_indicator, y) 61 group_indicator = check_array( 62 group_indicator, 63 dtype="O", 64 ensure_2d=False, 65 estimator="compare_survival", 66 input_name="group_indicator", 67 ) 68 69 n_samples = time.shape[0] 70 groups, group_counts = np.unique(group_indicator, return_counts=True) 71 n_groups = groups.shape[0] 72 if n_groups == 1: 73 raise ValueError( 74 "At least two groups must be specified, but only one was provided.") 75 76 # sort descending 77 o = np.argsort(-time, kind="mergesort") 78 x = group_indicator[o] 79 event = event[o] 80 time = time[o] 81 82 at_risk = np.zeros(n_groups, dtype=int) 83 observed = np.zeros(n_groups, dtype=int) 84 expected = np.zeros(n_groups, dtype=float) 85 covar = np.zeros((n_groups, n_groups), dtype=float) 86 87 covar_indices = np.diag_indices(n_groups) 88 89 k = 0 90 while k < n_samples: 91 ti = time[k] 92 total_events = 0 93 while k < n_samples and ti == time[k]: 94 idx = np.searchsorted(groups, x[k]) 95 if event[k]: 96 observed[idx] += 1 97 total_events += 1 98 at_risk[idx] += 1 99 k += 1 100 101 if total_events != 0: 102 total_at_risk = k 103 expected += at_risk * (total_events / total_at_risk) 104 if total_at_risk > 1: 105 multiplier = total_events * \ 106 (total_at_risk - total_events) / \ 107 (total_at_risk * (total_at_risk - 1)) 108 temp = at_risk * multiplier 109 covar[covar_indices] += temp 110 covar -= np.outer(temp, at_risk) / total_at_risk 111 112 df = n_groups - 1 113 zz = observed[:df] - expected[:df] 114 chisq = np.linalg.solve(covar[:df, :df], zz).dot(zz) 115 pval = stats.chi2.sf(chisq, df) 116 117 if return_stats: 118 table = OrderedDict() 119 table["counts"] = group_counts 120 table["observed"] = observed 121 table["expected"] = expected 122 table["statistic"] = observed - expected 123 table = pd.DataFrame.from_dict(table) 124 table.index = pd.Index(groups, name="group", dtype=groups.dtype) 125 return chisq, pval, table, covar 126 127 return chisq, pval
K-sample log-rank hypothesis test of identical survival functions.
Compares the pooled hazard rate with each group-specific hazard rate. The alternative hypothesis is that the hazard rate of at least one group differs from the others at some time.
See 1 for more details.
Parameters
y : structured array, shape = (n_samples,) A structured array containing the binary event indicator as first field, and time of event or time of censoring as second field.
group_indicator : array-like, shape = (n_samples,) Group membership of each sample.
return_stats : bool, optional, default: False Whether to return a data frame with statistics for each group and the covariance matrix of the test statistic.
Returns
chisq : float
Test statistic.
pvalue : float
Two-sided p-value with respect to the null hypothesis
that the hazard rates across all groups are equal.
stats : pandas.DataFrame
Summary statistics for each group: number of samples,
observed number of events, expected number of events,
and test statistic.
Only provided if return_stats is True.
covariance : array, shape=(n_groups, n_groups)
Covariance matrix of the test statistic.
Only provided if return_stats is True.
References
-
Fleming, T. R. and Harrington, D. P. A Class of Hypothesis Tests for One and Two Samples of Censored Survival Data. Communications In Statistics 10 (1981): 763-794. ↩