    def fit(self, X, y, sample_weight=None):  # noqa - capital X is a sklearn convention
'''Returns the top causes of y from among X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
n_samples = rows = number of observations.
n_features = columns = number of drivers/causes.
        y : array-like of shape (n_samples,)
            The outcome whose top causes are identified.
        sample_weight : array-like of shape (n_samples,), optional
            Weight of each row. Defaults to 1 per non-null y row; NaN weights count as 0.
Returns
-------
self : object
Returns the instance itself.
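        Examples
        --------
        A minimal usage sketch. The class name ``TopCause`` and its constructor
        arguments are illustrative assumptions; this method only relies on the
        ``percentile``, ``min_weight`` and ``max_p`` attributes::

            model = TopCause(percentile=0.95, min_weight=3, max_p=0.05)  # assumed constructor
            model.fit(X, y)
            model.result_  # one row per column: value, gain, p, type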
'''
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X) # noqa: N806 X can be in uppercase
if not isinstance(y, pd.Series):
y = pd.Series(y)
if X.shape[0] != y.shape[0]:
raise ValueError(f'X has {X.shape[0]} rows, but y has {y.shape[0]} rows')
        # If values contain ±Inf, treat them as NaN
        with pd.option_context('mode.use_inf_as_na', True):
            # Each categorical group must have at least this much total weight (default: 3)
            min_weight = 3 if self.min_weight is None else self.min_weight
            # If sample weights are not given, use a weight of 1 for each non-null y row.
            # If sample weights are NaN, treat them as 0.
            if sample_weight is None:
                sample_weight = y.notnull().astype(int)
            elif not isinstance(sample_weight, pd.Series):
                sample_weight = pd.Series(sample_weight)
            sample_weight = sample_weight.fillna(0)
# Calculate summary stats
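            # n is the total weight; mean and var are the weighted mean and variance of y,
            # the baseline that each column's gain is compared against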
n = sample_weight.sum()
weighted_y = y * sample_weight
mean = weighted_y.sum() / n
var = ((y - mean) ** 2 * sample_weight).sum() / n
# Calculate impact for every column consistently
results = {}
for column, series in X.items():
                # Ignore columns identical to y
                if (series == y).all():
                    warnings.warn(f'column {column}: skipped. Identical to y')
                    continue
# Process column as NUMERIC, ORDERED CATEGORICAL or CATEGORICAL based on dtype
# https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html
kind = series.dtype.kind
# By default, assume that this column can't impact y
result = results[column] = {
'value': np.nan,
'gain': np.nan,
'p': 1.0,
'type': kind,
}
# ORDERED CATEGORICAL if kind is signed or unsigned int
                # TODO: Currently, it's treated as numeric. Fix this based on the # of distinct ints.
if kind in 'iu':
series = series.astype(float)
kind = 'f'
# NUMERIC if kind is float
if kind in 'f':
                    # Drop rows where the value, weight or y is missing
pair = pd.DataFrame({'values': series, 'weight': sample_weight, 'y': y})
pair.dropna(inplace=True)
# Run linear regression to see if y increases/decreases with column
# TODO: use weighted regression
from scipy.stats import linregress
reg = linregress(pair['values'], pair['y'])
                    # If the slope is positive, pick the value at the self.percentile
                    # weighted percentile (e.g. 95th); if negative, at 1 - self.percentile (e.g. 5th)
pair = pair.sort_values('values', ascending=True)
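                    # Interpolating the target percentile against the cumulative-weight CDF
                    # of the sorted values gives a weighted quantile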
top = np.interp(
self.percentile if reg.slope >= 0 else 1 - self.percentile,
pair['weight'].cumsum() / pair['weight'].sum(),
pair['values'],
)
                    # Gain = y predicted by the regression at the chosen value, minus the overall mean
gain = reg.slope * top + reg.intercept - mean
if gain > 0:
result.update(value=top, gain=gain, p=reg.pvalue, type='num')
# CATEGORICAL if kind is boolean, object, str or unicode
elif kind in 'bOSU':
# Group into a DataFrame with 3 columns {value, weight, mean}
# value: Each row has every unique value in the column
# weight: Sum of sample_weights in each group
# mean: mean(y) in each group, weighted by sample_weights
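                    # e.g. values=['a','a','b'], weight=[1,2,3], y=[10,20,30] (hypothetical)
                    # gives group 'a': weight=3, mean=(10*1+20*2)/3=16.7 and 'b': weight=3, mean=30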
group = (
pd.DataFrame(
{'values': series, 'weight': sample_weight, 'weighted_y': weighted_y}
)
.dropna()
.groupby('values', sort=False)
.sum()
)
group['mean'] = group['weighted_y'] / group['weight']
                    # Pick the groups with the highest mean(y): those at or above the
                    # self.percentile threshold of cumulative weight.
# Ensure each group has at least min_weight samples.
group.sort_values('mean', inplace=True, ascending=True)
best_values = group.dropna(subset=['mean'])[
(group['weight'].cumsum() / group['weight'].sum() >= self.percentile)
& (group['weight'] >= min_weight)
]
                    # If there's at least 1 group at/above the percentile threshold with enough weight...
if len(best_values):
                        # top is the heaviest such group (by weight); its rows are X[series == top.name]
top = best_values.sort_values('weight').iloc[-1]
gain = top['mean'] - mean
# Only consider positive gains
if gain > 0:
# Calculate p value using Welch test: scipy.stats.mstats.ttest_ind()
# https://en.wikipedia.org/wiki/Welch%27s_t-test
# github.com/scipy/scipy/blob/v1.5.4/scipy/stats/mstats_basic.py
subset = series == top.name
subseries = y[subset]
submean, subn = subseries.mean(), sample_weight[subset].sum()
with np.errstate(divide='ignore', invalid='ignore'):
diff = subseries - submean
vn1 = (diff**2 * sample_weight[subset]).sum() / subn
vn2 = var / n
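                                # Welch-Satterthwaite approximation of the degrees of freedom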
df = (vn1 + vn2) ** 2 / (
vn1**2 / (subn - 1) + vn2**2 / (n - 1)
)
df = 1 if np.isnan(df) else df
with np.errstate(divide='ignore', invalid='ignore'):
t = gain / (vn1 + vn2) ** 0.5
import scipy.special as special
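                            # The regularized incomplete beta function gives the two-sided
                            # p-value of the t statistic with df degrees of freedom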
p = special.betainc(0.5 * df, 0.5, df / (df + t * t))
# Update the result
result.update(value=top.name, gain=gain, p=p, type='cat')
# WARN if kind is complex, timestamp, datetime, etc
else:
warnings.warn(f'column {column}: unknown type {kind}')
results = pd.DataFrame(results).T
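            # Blank out value & gain for columns whose p-value exceeds max_p, then rank by gain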
results.loc[results['p'] > self.max_p, ('value', 'gain')] = np.nan
self.result_ = results.sort_values('gain', ascending=False)
return self