-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathstatistics.py
355 lines (297 loc) · 12.2 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
from ..utils import DataikuException
from .utils import DSSDatasetSelectionBuilder
from .future import DSSFuture
from ..utils import DSSInternalDict
import json
from .metrics import ComputedMetrics
from .discussion import DSSObjectDiscussions
class DSSStatisticsWorksheet(object):
    """
    Handle on a statistics worksheet attached to a dataset.
    """

    def __init__(self, client, project_key, dataset_name, worksheet_id):
        self.client = client
        self.project_key = project_key
        self.dataset_name = dataset_name
        self.worksheet_id = worksheet_id

    def delete(self):
        """
        Deletes the worksheet.

        :returns: whatever the client's ``_perform_empty`` call returns
        """
        url = "/projects/%s/datasets/%s/statistics/worksheets/%s" % (
            self.project_key, self.dataset_name, self.worksheet_id)
        return self.client._perform_empty("DELETE", url)

    def get_settings(self):
        """
        Fetches the settings of this worksheet.

        :return: an object to interact with the settings
        :rtype: :class:`DSSStatisticsWorksheetSettings`
        """
        url = "/projects/%s/datasets/%s/statistics/worksheets/%s" % (
            self.project_key, self.dataset_name, self.worksheet_id)
        definition = self.client._perform_json("GET", url)
        return DSSStatisticsWorksheetSettings(
            self.client, self.project_key, self.dataset_name,
            self.worksheet_id, definition)

    def run_worksheet(self, wait=True):
        """
        Computes the results of the whole worksheet, by running its root card.

        :param bool wait: forwarded to :meth:`run_card`
        :returns: same as :meth:`run_card`
        """
        settings = self.get_settings()
        return self.run_card(settings.get_raw()['rootCard'], wait=wait)

    def run_card(self, card, wait=True):
        """
        Runs a card in the context of the worksheet.

        Note: the card does not need to belong to the worksheet.

        :param card: a card to compute
        :type card: :class:`DSSStatisticsCardSettings` or dict (obtained from ``DSSStatisticsCardSettings.get_raw()``)
        :param bool wait: if True, wait for the task and return its result;
            otherwise return the future itself
        :returns: the result of the future's ``wait_for_result()`` when ``wait`` is True,
            else a :class:`~dataikuapi.dss.future.DSSFuture` handle to the task of computing card's results
        """
        card = DSSStatisticsCardSettings._from_card_or_dict(self.client, card)
        url = "/projects/%s/datasets/%s/statistics/worksheets/%s/actions/run-card" % (
            self.project_key, self.dataset_name, self.worksheet_id)
        response = self.client._perform_json("POST", url, body=card.get_raw())
        handle = DSSFuture(self.client, response.get("jobId", None), response)
        if wait:
            return handle.wait_for_result()
        return handle

    def run_computation(self, computation, wait=True):
        """
        Runs a computation in the context of the worksheet.

        :param computation: a computation to run
        :type computation: :class:`DSSStatisticsComputationSettings` or dict (obtained from ``DSSStatisticsComputationSettings.get_raw()``)
        :param bool wait: if True, wait for the task and return its result;
            otherwise return the future itself
        :returns: the result of the future's ``wait_for_result()`` when ``wait`` is True,
            else a :class:`~dataikuapi.dss.future.DSSFuture` handle to the task of computing computation's results
        """
        computation = DSSStatisticsComputationSettings._from_computation_or_dict(
            computation)
        url = "/projects/%s/datasets/%s/statistics/worksheets/%s/actions/run-computation" % (
            self.project_key, self.dataset_name, self.worksheet_id)
        response = self.client._perform_json(
            "POST", url, body=computation.get_raw())
        handle = DSSFuture(self.client, response.get("jobId", None), response)
        if wait:
            return handle.wait_for_result()
        return handle
class DSSStatisticsWorksheetSettings(DSSInternalDict):
    """
    Settings of a statistics worksheet, wrapping the raw worksheet definition.
    """

    def __init__(self, client, project_key, dataset_name, worksheet_id, worksheet_definition):
        super(DSSStatisticsWorksheetSettings, self).__init__(
            worksheet_definition)
        self.client = client
        self.project_key = project_key
        self.dataset_name = dataset_name
        self.worksheet_id = worksheet_id

    def add_card(self, card):
        """
        Adds a new card to the worksheet.

        The raw definition of the card is appended to the cards of the root card.

        :param card: card to be added
        :type card: :class:`DSSStatisticsCardSettings` or dict (obtained from ``DSSStatisticsCardSettings.get_raw()``)
        """
        wrapped = DSSStatisticsCardSettings._from_card_or_dict(self.client, card)
        root_cards = self._internal_dict['rootCard']['cards']
        root_cards.append(wrapped.get_raw())

    def list_cards(self):
        """
        Lists the cards of this worksheet (the cards of the root card).

        :rtype: list of :class:`DSSStatisticsCardSettings`
        """
        cards = []
        for definition in self._internal_dict['rootCard']['cards']:
            cards.append(DSSStatisticsCardSettings(self.client, definition))
        return cards

    def get_raw(self):
        """
        Gets a reference to the raw settings of the worksheet.

        :rtype: dict
        """
        return self._internal_dict

    def set_sampling_settings(self, selection):
        """
        Sets the sampling settings of the worksheet.

        :param selection: the new sampling selection; a builder is converted
            with its ``build()`` method, anything else is stored as is
        :type selection: :class:`DSSDatasetSelectionBuilder` or dict (obtained from ``get_raw_sampling_settings()``)
        """
        if isinstance(selection, DSSDatasetSelectionBuilder):
            raw_selection = selection.build()
        else:
            raw_selection = selection
        self._internal_dict['dataSpec']['datasetSelection'] = raw_selection

    def get_raw_sampling_settings(self):
        """
        Gets a reference to the raw sampling settings of the worksheet.

        :rtype: dict
        """
        data_spec = self._internal_dict['dataSpec']
        return data_spec['datasetSelection']

    def save(self):
        """
        Saves the settings to DSS.

        The raw settings held by this object are replaced by the response of the save call.
        """
        url = "/projects/%s/datasets/%s/statistics/worksheets/%s" % (
            self.project_key, self.dataset_name, self.worksheet_id)
        self._internal_dict = self.client._perform_json(
            "PUT", url, body=self._internal_dict)
class DSSStatisticsCardSettings(DSSInternalDict):
    """
    Settings of a statistics card, wrapping the raw card definition.
    """

    def __init__(self, client, card_definition):
        super(DSSStatisticsCardSettings, self).__init__(card_definition)
        self.client = client
        self._internal_dict = card_definition

    def get_raw(self):
        """
        Gets a reference to the raw settings of the card.

        :rtype: dict
        """
        return self._internal_dict

    def compile(self):
        """
        Gets the underlying computation used to compute the card results.

        The raw card settings are posted to the compile endpoint and the
        response is wrapped as computation settings.

        :rtype: DSSStatisticsComputationSettings
        """
        compiled = self.client._perform_json(
            "POST", "/statistics/cards/compile", body=self._internal_dict)
        return DSSStatisticsComputationSettings(compiled)

    @staticmethod
    def _from_card_or_dict(client, card_or_dict):
        # Unwrap an existing settings object so the result is always a fresh
        # wrapper around the raw definition.
        raw_card = (
            card_or_dict.get_raw()
            if isinstance(card_or_dict, DSSStatisticsCardSettings)
            else card_or_dict
        )
        return DSSStatisticsCardSettings(client, raw_card)
class DSSStatisticsComputationSettings(DSSInternalDict):
    """
    Settings of a statistics computation, wrapping the raw computation definition.
    """

    def __init__(self, computation_definition):
        super(DSSStatisticsComputationSettings, self).__init__(
            computation_definition)
        self._internal_dict = computation_definition

    def get_raw(self):
        """
        Gets the raw settings of the computation.

        :rtype: dict
        """
        return self._internal_dict

    @staticmethod
    def _from_computation_or_dict(computation_or_dict):
        definition = computation_or_dict
        # A computation builder is first converted to its model ...
        if isinstance(definition, ComputationBase):
            definition = definition.to_model()
        # ... and an existing settings object is unwrapped to its raw content.
        if isinstance(definition, DSSStatisticsComputationSettings):
            definition = definition.get_raw()
        return DSSStatisticsComputationSettings(definition)
class ComputationBase(object):
    """
    Base class of the computation builders.

    Provides helpers that wrap the computation into a :class:`GroupedComputation`.
    """

    def __init__(self):
        pass

    def grouped_by_alphanum(self, column, max_values=10, group_others=False):
        """
        Wraps this computation into a computation grouped by the values of a column.

        :param column: name of the grouping column
        :param max_values: sent as ``maxValues`` in the grouping definition
        :param group_others: sent as ``groupOthers`` in the grouping definition
        :rtype: :class:`GroupedComputation`
        """
        return GroupedComputation(self, {
            "type": "anum",
            "column": column,
            "maxValues": max_values,
            "groupOthers": group_others
        })

    def grouped_by_bins(self, column, nb_bins=None, bin_size=None, keep_na=False):
        """
        Wraps this computation into a computation grouped by bins of a column.

        Exactly one binning mode is used: ``nb_bins`` (mode ``FIXED_NB``) takes
        precedence over ``bin_size`` (mode ``FIXED_SIZE``) when both are given.

        :param column: name of the binned column
        :param nb_bins: sent as ``nbBins`` in the grouping definition
        :param bin_size: sent as ``binSize`` in the grouping definition
        :param keep_na: sent as ``keepNA`` in the grouping definition
        :rtype: :class:`GroupedComputation`
        :raises ValueError: if neither ``nb_bins`` nor ``bin_size`` is given
        """
        if nb_bins is not None:
            return GroupedComputation(self, {
                "type": "binned",
                "column": column,
                "mode": "FIXED_NB",
                "nbBins": nb_bins,
                "keepNA": keep_na
            })
        if bin_size is not None:
            return GroupedComputation(self, {
                "type": "binned",
                "column": column,
                "mode": "FIXED_SIZE",
                "binSize": bin_size,
                "keepNA": keep_na
            })
        # Previously fell through and returned None, which only failed later
        # (and obscurely) when the result was used as a computation.
        raise ValueError("Either nb_bins or bin_size must be specified")
class DescriptiveStatistics(ComputationBase):
    """
    Builder of a ``multi`` computation made of the selected statistics, for each column.
    """

    def __init__(self, columns, mean=False, sum=False, stddev=False, variance=False, skewness=False,kurtosis=False,sem=False):
        self.columns = columns
        self.mean = mean
        self.sum = sum
        self.stddev = stddev
        self.variance = variance
        self.skewness = skewness
        self.kurtosis = kurtosis
        self.sem = sem

    def to_model(self):
        """
        Builds the raw computation.

        For each column, one sub-computation is emitted per enabled statistic.

        :rtype: dict
        """
        # (flag, computation type) pairs, in the order they are emitted for each column
        toggles = (
            (self.mean, "mean"),
            (self.sum, "sum"),
            (self.stddev, "std_dev"),
            (self.variance, "variance"),
            (self.skewness, "skewness"),
            (self.kurtosis, "kurtosis"),
            (self.sem, "sem"),
        )
        computations = [
            {"type": kind, "column": col}
            for col in self.columns
            for enabled, kind in toggles
            if enabled
        ]
        return {"type": "multi", "computations": computations}
class Quantiles(ComputationBase):
    """
    Builder of a ``quantiles`` computation on a column.
    """

    def __init__(self, column, freqs=None, confidence=None):
        """
        :param column: name of the column
        :param freqs: list of frequencies sent as ``freqs``; when None, defaults to
            ``[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]``
        :param confidence: sent as ``confidence`` in the computation
        """
        # Build the default per instance: a list literal in the signature would be
        # shared by every instance, so mutating ``self.freqs`` on one of them would
        # silently change the default of all the others.
        if freqs is None:
            freqs = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
        self.column = column
        self.freqs = freqs
        self.confidence = confidence

    def to_model(self):
        """
        Builds the raw computation.

        :rtype: dict
        """
        return {
            "type": "quantiles",
            "column": self.column,
            "freqs": self.freqs,
            "confidence": self.confidence
        }
class TTest1Sample(ComputationBase):
    """
    Builder of a ``ttest_1samp`` computation on a column.
    """

    def __init__(self, column, hypothesized_mean):
        self.column = column
        self.hypothesized_mean = hypothesized_mean

    def to_model(self):
        """
        Builds the raw computation.

        :rtype: dict
        """
        model = {"type": "ttest_1samp"}
        model["column"] = self.column
        model["hypothesizedMean"] = self.hypothesized_mean
        return model
class DistributionFit(ComputationBase):
    """
    Builder of a ``fit_distribution`` computation on a column.

    Extra keyword arguments are added to the distribution definition.
    """

    def __init__(self, column, type="normal", test=True, **kwargs):
        self.column = column
        self.type = type
        self.test = test
        self.distribution_args = kwargs

    def to_model(self):
        """
        Builds the raw computation.

        :rtype: dict
        """
        # The type comes first, then the extra distribution arguments are merged in
        distribution = dict({"type": self.type}, **self.distribution_args)
        model = {"type": "fit_distribution", "column": self.column}
        model["distribution"] = distribution
        model["test"] = self.test
        return model
class _BasicBivariateComputation(ComputationBase):
    """
    Common builder of the computations defined by a type and two columns.
    """

    def __init__(self, type, column1, column2):
        self.type = type
        self.column1 = column1
        self.column2 = column2

    def to_model(self):
        """
        Builds the raw computation; the columns are sent as ``xColumn`` and ``yColumn``.

        :rtype: dict
        """
        model = {"type": self.type}
        model["xColumn"] = self.column1
        model["yColumn"] = self.column2
        return model
class Pearson(_BasicBivariateComputation):
    """
    Bivariate computation of type ``pearson`` over two columns.
    """

    def __init__(self, column1, column2):
        super(Pearson, self).__init__(
            type="pearson", column1=column1, column2=column2)
class Covariance(_BasicBivariateComputation):
    """
    Bivariate computation of type ``covariance`` over two columns.
    """

    def __init__(self, column1, column2):
        # Must name this class: super(Pearson, self) raises TypeError because a
        # Covariance instance is not an instance of Pearson.
        super(Covariance, self).__init__("covariance", column1, column2)
class Spearman(_BasicBivariateComputation):
    """
    Bivariate computation of type ``spearman`` over two columns.
    """

    def __init__(self, column1, column2):
        # Must name this class: super(Pearson, self) raises TypeError because a
        # Spearman instance is not an instance of Pearson.
        super(Spearman, self).__init__("spearman", column1, column2)
class GroupedComputation(ComputationBase):
    """
    Builder of a ``grouped`` computation: an inner computation plus a grouping definition.
    """

    def __init__(self, computation, grouping):
        self.computation = computation
        self.grouping = grouping

    def to_model(self):
        """
        Builds the raw computation, embedding the model of the inner computation.

        :rtype: dict
        """
        inner_model = self.computation.to_model()
        model = {"type": "grouped"}
        model["computation"] = inner_model
        model["grouping"] = self.grouping
        return model