from ..utils import DataikuUTF8CSVReader
from ..utils import DataikuStreamedHttpUTF8CSVReader
import json

from .metrics import ComputedMetrics
from .ml import DSSMLTask
from .utils import DSSDatasetSelectionBuilder


class DSSAnalysisStepBuilder(object):
    def __init__(self, step_type=None, step_name=None):
        self.step = {'metaType':'PROCESSOR', 'type':step_type, 'name':step_name, 'params':{}}

    def build(self):
        """Returns the built step dict"""
        return self.step

    def with_type(self, step_type):
        """Sets the step's type"""
        self.step["type"] = step_type
        return self

    def with_name(self, step_name):
        """Sets the step's name"""
        self.step["name"] = step_name
        return self
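
# Illustrative sketch (not part of the original module): assembling a raw script
# step with the generic builder. The step name is a placeholder, and the processor
# type reuses one referenced further down in this file; a real step would usually
# also need entries in 'params' (the subclass builders below show typical keys).
#
#   step = (DSSAnalysisStepBuilder()
#           .with_type('RemoveRowsOnEmpty')
#           .with_name('drop rows with empty values')
#           .build())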


class DSSAnalysisDefinition(object):
    """
    Object to manipulate the definition of a visual analysis
    """

    def __init__(self, analysis, acp):
        self.analysis = analysis
        self.acp = acp

    def get_raw(self):
        """
        Gets the raw dictionary of the visual analysis definition
        """
        return self.acp

    def get_raw_script(self):
        """
        Gets the raw dictionary of the visual analysis' script settings (including steps, sampling, ...)
        """
        acp = self.get_raw()
        if 'script' not in acp:
            acp['script'] = {'steps': []}
        return acp['script']

    def get_raw_script_steps(self):
        """
        Gets the raw list of the visual analysis' script steps
        """
        script = self.get_raw_script()
        if 'steps' not in script:
            script['steps'] = []
        return script['steps']

    def get_raw_script_sampling(self):
        """
        Gets the raw dictionary of the visual analysis' script sampling
        """
        script = self.get_raw_script()
        if 'explorationSampling' not in script:
            script['explorationSampling'] = {}
        return script['explorationSampling']

    def save(self):
        """
        Shortcut to :meth:`DSSAnalysis.set_definition()`
        """
        self.analysis.set_definition(self)

    def add_step(self, step):
        """
        Add a step to the script

        :param object step: A :class:`DSSAnalysisStepBuilder` to build the settings of the step, or a raw step dict
        """
        steps = self.get_raw_script_steps()
        if isinstance(step, DSSAnalysisStepBuilder):
            steps.append(step.build())
        else:
            steps.append(step)

    def set_script_sampling_selection(self, selection):
        """
        Sets the sampling for the script

        :param object selection: A :class:`DSSDatasetSelectionBuilder` to build the settings of the extract of the dataset, or a raw selection dict
        """
        sampling = self.get_raw_script_sampling()
        if isinstance(selection, DSSDatasetSelectionBuilder):
            sampling['selection'] = selection.build()
        else:
            sampling['selection'] = selection
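
# Illustrative usage sketch (not part of the original source). `client` is assumed
# to be an already-constructed DSSClient, the project key and analysis id are
# placeholders, and retrieving the analysis handle through the project is assumed;
# the step builder used is defined further down in this module.
#
#   analysis = client.get_project("MYPROJECT").get_analysis("some_analysis_id")
#   definition = analysis.get_definition()
#   definition.add_step(DSSFormulaStepBuilder(step_name="compute total")
#                       .with_output_column("total")
#                       .with_expression("price * quantity"))
#   definition.save()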


class DSSAnalysis(object):
    """A handle to interact with a DSS visual analysis"""

    def __init__(self, client, project_key, analysis_id):
        self.client = client
        self.project_key = project_key
        self.analysis_id = analysis_id

    ########################################################
    # Analysis deletion
    ########################################################

    def delete(self, drop_data=False):
        """
        Delete the analysis

        :param bool drop_data: Ignored by this call
        """
        return self.client._perform_empty("DELETE", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id))

    ########################################################
    # Analysis definition
    ########################################################

    def get_definition(self):
        """
        Get the definition of the analysis

        :return: a DSSAnalysisDefinition object to interact with the settings
        :rtype: :class:`dataikuapi.dss.analysis.DSSAnalysisDefinition`
        """
        acp = self.client._perform_json("GET", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id))
        return DSSAnalysisDefinition(self, acp)

    def set_definition(self, definition):
        """
        Set the definition of the analysis

        Args:
            definition: the definition, as a JSON object or a :class:`dataikuapi.dss.analysis.DSSAnalysisDefinition`.
                You should only set a definition object that has been retrieved using the get_definition call.
        """
        if isinstance(definition, DSSAnalysisDefinition):
            acp = definition.get_raw()
        else:
            acp = definition
        return self.client._perform_json("PUT", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id), body=acp)

    ########################################################
    # ML
    ########################################################

    def create_prediction_ml_task(self,
                                  target_variable,
                                  ml_backend_type="PY_MEMORY",
                                  guess_policy="DEFAULT",
                                  prediction_type=None,
                                  wait_guess_complete=True):
        """Creates a new prediction task in this visual analysis lab for a dataset.

        :param string target_variable: Variable to predict
        :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
        :param string guess_policy: Policy to use for setting the default parameters. Valid values are: DEFAULT, SIMPLE_FORMULA, DECISION_TREE, EXPLANATORY and PERFORMANCE
        :param string prediction_type: The type of prediction problem this is. If not provided the prediction type will be guessed. Valid values are: BINARY_CLASSIFICATION, REGRESSION, MULTICLASS
        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
            You should wait for the guessing to be completed by calling
            ``wait_guess_complete`` on the returned object before doing anything
            else (in particular calling ``train`` or ``get_settings``)
        :return: A :class:`dataikuapi.dss.ml.DSSMLTask` handle to the new ML task
        """
        obj = {
            "taskType": "PREDICTION",
            "targetVariable": target_variable,
            "backendType": ml_backend_type,
            "guessPolicy": guess_policy
        }
        if prediction_type is not None:
            obj["predictionType"] = prediction_type

        ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=obj)
        mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
        if wait_guess_complete:
            mltask.wait_guess_complete()
        return mltask
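
    # Illustrative usage sketch (not from the original source); the project key,
    # analysis id and target column are placeholders:
    #
    #   analysis = client.get_project("MYPROJECT").get_analysis("some_analysis_id")
    #   mltask = analysis.create_prediction_ml_task(target_variable="churned")
    #   settings = mltask.get_settings()   # tweak the guessed settings if needed
    #   mltask.train()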

    def create_clustering_ml_task(self,
                                  ml_backend_type="PY_MEMORY",
                                  guess_policy="KMEANS",
                                  wait_guess_complete=True):
        """Creates a new clustering task in this visual analysis lab for a dataset.

        :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
        :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
            You should wait for the guessing to be completed by calling
            ``wait_guess_complete`` on the returned object before doing anything
            else (in particular calling ``train`` or ``get_settings``)
        :return: A :class:`dataikuapi.dss.ml.DSSMLTask` handle to the new ML task
        """
        obj = {
            "taskType": "CLUSTERING",
            "backendType": ml_backend_type,
            "guessPolicy": guess_policy
        }

        ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=obj)
        mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
        if wait_guess_complete:
            mltask.wait_guess_complete()
        return mltask
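
    # Illustrative sketch (not from the original source), mirroring the prediction
    # example above; the analysis handle is assumed to already exist:
    #
    #   mltask = analysis.create_clustering_ml_task(guess_policy="ANOMALY_DETECTION")
    #   mltask.train()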

    def create_timeseries_forecasting_ml_task(self, target_variable,
                                              time_variable,
                                              timeseries_identifiers=None,
                                              guess_policy="TIMESERIES_DEFAULT",
                                              wait_guess_complete=True):
        """Creates a new time series forecasting task in this visual analysis lab for a dataset.

        :param string target_variable: The variable to forecast
        :param string time_variable: Column to be used as time variable. Should be a Date (parsed) column.
        :param list timeseries_identifiers: List of columns to be used as time series identifiers (when the dataset has multiple series)
        :param string guess_policy: Policy to use for setting the default parameters.
            Valid values are: TIMESERIES_DEFAULT, TIMESERIES_STATISTICAL, and TIMESERIES_DEEP_LEARNING
        :param boolean wait_guess_complete: If False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
            You should wait for the guessing to be completed by calling
            ``wait_guess_complete`` on the returned object before doing anything
            else (in particular calling ``train`` or ``get_settings``)
        :return: A :class:`dataikuapi.dss.ml.DSSMLTask` handle to the new ML task
        """
        obj = {
            "taskType": "PREDICTION",
            "targetVariable": target_variable,
            "timeVariable": time_variable,
            "timeseriesIdentifiers": timeseries_identifiers,
            "backendType": "PY_MEMORY",
            "guessPolicy": guess_policy,
            "predictionType": "TIMESERIES_FORECAST"
        }
        ref = self.client._perform_json(
            "POST",
            "/projects/{project_key}/lab/{analysis_id}/models/".format(project_key=self.project_key, analysis_id=self.analysis_id),
            body=obj
        )
        mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
        if wait_guess_complete:
            mltask.wait_guess_complete()
        return mltask
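
    # Illustrative sketch (not from the original source); the column names and the
    # identifier column are placeholders:
    #
    #   mltask = analysis.create_timeseries_forecasting_ml_task(
    #       target_variable="sales",
    #       time_variable="date",
    #       timeseries_identifiers=["store_id"])
    #   mltask.train()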

    def create_causal_prediction_ml_task(self,
                                         outcome_variable,
                                         treatment_variable,
                                         prediction_type=None,
                                         wait_guess_complete=True):
        """Creates a new causal prediction task in this visual analysis lab for a dataset.

        :param string outcome_variable: Variable to predict (the outcome)
        :param string treatment_variable: Treatment variable
        :param string prediction_type: The type of prediction problem, guessed if not provided. Valid values: CAUSAL_BINARY_CLASSIFICATION, CAUSAL_REGRESSION
        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
            You should wait for the guessing to be completed by calling
            ``wait_guess_complete`` on the returned object before doing anything
            else (in particular calling ``train`` or ``get_settings``)
        :return: A :class:`dataikuapi.dss.ml.DSSMLTask` handle to the new ML task
        """
        obj = {
            "taskType": "PREDICTION",
            "targetVariable": outcome_variable,
            "treatmentVariable": treatment_variable,
            "backendType": "PY_MEMORY",
            "guessPolicy": "CAUSAL_PREDICTION"
        }
        if prediction_type is not None:
            obj["predictionType"] = prediction_type

        ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=obj)
        mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
        if wait_guess_complete:
            mltask.wait_guess_complete()
        return mltask
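
    # Illustrative sketch (not from the original source); the column names are placeholders:
    #
    #   mltask = analysis.create_causal_prediction_ml_task(
    #       outcome_variable="converted",
    #       treatment_variable="received_coupon")
    #   mltask.train()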

    def list_ml_tasks(self):
        """
        List the ML tasks in this visual analysis

        Returns:
            the list of ML task summaries, each one as a JSON object
        """
        return self.client._perform_json("GET", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id))

    def get_ml_task(self, mltask_id):
        """
        Get a handle to interact with a specific ML task

        Args:
            mltask_id: the identifier of the desired ML task

        Returns:
            A :class:`dataikuapi.dss.ml.DSSMLTask` ML task handle
        """
        return DSSMLTask(self.client, self.project_key, self.analysis_id, mltask_id)
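
    # Illustrative sketch (not from the original source): retrieving an existing
    # ML task from its summary. The "mlTaskId" field is assumed to be present in
    # each summary (it is the field name used elsewhere in this module):
    #
    #   summaries = analysis.list_ml_tasks()
    #   mltask = analysis.get_ml_task(summaries[0]["mlTaskId"])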


# some basic steps
class DSSFormulaStepBuilder(DSSAnalysisStepBuilder):
    def __init__(self, step_name=None):
        super(DSSFormulaStepBuilder, self).__init__(step_type='CreateColumnWithGREL', step_name=step_name)

    def with_output_column(self, column_name):
        """Sets the step's output column's name"""
        self.step["params"]["column"] = column_name
        return self

    def with_error_column(self, column_name):
        """Sets the step's error column's name"""
        self.step["params"]["errorColumn"] = column_name
        return self

    def with_expression(self, expression):
        """Sets the step's expression"""
        self.step["params"]["expression"] = expression
        return self


class AppliesToStepBuilder(DSSAnalysisStepBuilder):
    def __init__(self, step_type=None, step_name=None):
        super(AppliesToStepBuilder, self).__init__(step_type=step_type, step_name=step_name)
        self.step["params"]["appliesTo"] = 'SINGLE_COLUMN'

    def with_column_selection_mode(self, column_selection_mode):
        """Sets the step's column selection mode (SINGLE_COLUMN, COLUMNS, PATTERN, ALL)"""
        self.step["params"]["appliesTo"] = column_selection_mode
        return self

    def with_columns(self, *column_names):
        """Sets the step's selected columns"""
        self.step["params"]["columns"] = [c for c in column_names]
        return self

    def with_column_regex(self, regex):
        """Sets the step's column selection regular expression"""
        self.step["params"]["appliesToPattern"] = regex
        return self

    def with_single_column_selection(self, column_name):
        """Sets the step as applying to a single column"""
        return self.with_column_selection_mode('SINGLE_COLUMN').with_columns(column_name)

    def with_multiple_column_selection(self, *column_names):
        """Sets the step as applying to several explicitly listed columns"""
        return self.with_column_selection_mode('COLUMNS').with_columns(*column_names)

    def with_regex_column_selection(self, regex):
        """Sets the step as applying to the columns matching a regular expression"""
        return self.with_column_selection_mode('PATTERN').with_column_regex(regex)

    def with_all_column_selection(self, column_name):
        """Sets the step as applying to all columns (column_name is not used)"""
        return self.with_column_selection_mode('ALL')


class FilterAndFlagStepBuilder(AppliesToStepBuilder):
    def __init__(self, step_type=None, step_name=None):
        super(FilterAndFlagStepBuilder, self).__init__(step_type=step_type, step_name=step_name)
        self.step["params"]["booleanMode"] = 'AND'
        self.step["params"]["action"] = 'REMOVE_ROW'

    def with_action(self, action):
        """Sets the step's action on match (KEEP_ROW, REMOVE_ROW, CLEAR_CELL, DONTCLEAR_CELL, FLAG)"""
        self.step["params"]["action"] = action
        return self

    def with_boolean_mode(self, boolean_mode):
        """Sets the step's mode for combining matches in different columns (AND, OR)"""
        self.step["params"]["booleanMode"] = boolean_mode
        return self

    def with_flag_column(self, column_name):
        """Sets the step's column for outputting the flag"""
        self.step["params"]["flagColumn"] = column_name
        return self


class FilterOnValueStepBuilder(FilterAndFlagStepBuilder):
    def __init__(self, step_name=None):
        super(FilterOnValueStepBuilder, self).__init__(step_type='FlagOnValue', step_name=step_name)

    def with_values(self, *values):
        """Sets the step's flagged values"""
        self.step["params"]["values"] = [v for v in values]
        return self

    def with_matching_mode(self, matching_mode):
        """Sets the step's matching mode (FULL_STRING, SUBSTRING, PATTERN)"""
        self.step["params"]["matchingMode"] = matching_mode
        return self

    def with_normalization_mode(self, normalization_mode):
        """Sets the step's normalization mode (EXACT, LOWERCASE, NORMALIZED)"""
        self.step["params"]["normalizationMode"] = normalization_mode
        return self


class FilterOnBadTypeStepBuilder(FilterAndFlagStepBuilder):
    def __init__(self, step_name=None):
        super(FilterOnBadTypeStepBuilder, self).__init__(step_type='FilterOnBadType', step_name=step_name)

    def with_meaning(self, meaning):
        """Sets the step's meaning to check values against"""
        self.step["params"]["type"] = meaning
        return self


class RemoveRowsStepBuilder(AppliesToStepBuilder):
    def __init__(self, step_name=None):
        super(RemoveRowsStepBuilder, self).__init__(step_type='RemoveRowsOnEmpty', step_name=step_name)

    def with_meaning(self, keep):
        """Sets the step's behavior when an empty value is found: True=keep the row, False=drop it (default)"""
        self.step["params"]["keep"] = keep
        return self
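

# Illustrative usage sketch (not part of the original source): chaining the step
# builders above into a visual analysis script. The analysis handle and column
# names are placeholders:
#
#   definition = analysis.get_definition()
#   definition.add_step(FilterOnValueStepBuilder(step_name="drop test rows")
#                       .with_single_column_selection("environment")
#                       .with_values("test")
#                       .with_matching_mode("FULL_STRING")
#                       .with_action("REMOVE_ROW"))
#   definition.add_step(RemoveRowsStepBuilder(step_name="drop rows with no id")
#                       .with_single_column_selection("customer_id"))
#   definition.save()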