-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathmanagedfolder.py
629 lines (472 loc) · 23 KB
/
managedfolder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
from ..utils import DataikuException
from ..utils import DataikuUTF8CSVReader
from ..utils import DataikuStreamedHttpUTF8CSVReader
from .utils import DSSTaggableObjectListItem, DSSTaggableObjectSettings
import json
import sys
import os
from requests import utils
from .metrics import ComputedMetrics
from .future import DSSFuture
from .discussion import DSSObjectDiscussions
from .dataset import DSSDataset
# Python 2/3 compatibility: ``basestring`` only exists on Python 2, so alias it to ``str``
# when the name cannot be resolved (i.e. on Python 3).
try:
    basestring
except NameError:
    basestring = str


class DSSManagedFolder(object):
    """
    A handle to interact with a managed folder on the DSS instance.

    .. important::

        Do not create this class directly, instead use :meth:`dataikuapi.dss.project.get_managed_folder`
    """
    def __init__(self, client, project_key, odb_id):
        self.client = client
        self.project = client.get_project(project_key)
        self.project_key = project_key
        self.odb_id = odb_id

    @property
    def id(self):
        """
        Returns the internal identifier of the managed folder, which is a 8-character random string, not
        to be confused with the managed folder's name.

        :rtype: string
        """
        return self.odb_id

    ########################################################
    # Managed folder deletion
    ########################################################
    def delete(self):
        """
        Delete the managed folder from the flow, and objects using it (recipes or labeling tasks)

        .. attention::

            This call doesn't delete the managed folder's contents
        """
        return self.client._perform_empty(
            "DELETE", "/projects/%s/managedfolders/%s" % (self.project_key, self.odb_id))

    ########################################################
    # Managed folder renaming
    ########################################################
    def rename(self, new_name):
        """
        Rename the managed folder

        :param str new_name: the new name of the managed folder

        .. note::

            The new name cannot be made of whitespaces only.
        """
        body = {
            "id": self.odb_id,
            "newName": new_name
        }
        return self.client._perform_empty(
            "POST",
            u"/projects/{}/actions/renameManagedFolder".format(self.project_key),
            body=body
        )

    ########################################################
    # Managed folder definition
    ########################################################
    def get_definition(self):
        """
        Get the definition of this managed folder. The definition contains name, description
        checklists, tags, connection and path parameters, metrics and checks setup.

        .. caution::

            Deprecated. Please use :meth:`get_settings`

        :returns: the managed folder definition.
        :rtype: dict
        """
        return self.client._perform_json(
            "GET", "/projects/%s/managedfolders/%s" % (self.project_key, self.odb_id))

    def set_definition(self, definition):
        """
        Set the definition of this managed folder.

        .. caution::

            Deprecated. Please use :meth:`get_settings` then :meth:`~DSSManagedFolderSettings.save()`

        .. note::

            the fields `id` and `projectKey` can't be modified

        Usage example:

        .. code-block:: python

            folder_definition = folder.get_definition()
            folder_definition['tags'] = ['tag1','tag2']
            folder.set_definition(folder_definition)

        :param dict definition: the new state of the definition for the folder. You should only set a definition object
                                that has been retrieved using the :meth:`get_definition` call

        :returns: a message upon successful completion of the definition update. Only contains one `msg` field
        :rtype: dict
        """
        return self.client._perform_json(
            "PUT", "/projects/%s/managedfolders/%s" % (self.project_key, self.odb_id),
            body=definition)

    def get_settings(self):
        """
        Returns the settings of this managed folder as a :class:`DSSManagedFolderSettings`.

        You must use :meth:`~DSSManagedFolderSettings.save()` on the returned object to make your changes effective
        on the managed folder.

        .. code-block:: python

            # Example: activating discrete partitioning
            folder = project.get_managed_folder("my_folder_id")
            settings = folder.get_settings()
            settings.add_discrete_partitioning_dimension("country")
            settings.save()

        :returns: the settings of the managed folder
        :rtype: :class:`DSSManagedFolderSettings`
        """
        data = self.client._perform_json("GET", "/projects/%s/managedfolders/%s" % (self.project_key, self.odb_id))
        return DSSManagedFolderSettings(self, data)

    ########################################################
    # Managed folder contents
    ########################################################
    def list_contents(self):
        """
        Get the list of files in the managed folder

        Usage example:

        .. code-block:: python

            for content in folder.list_contents()['items']:
                last_modified_seconds = content["lastModified"] / 1000
                last_modified_str = datetime.fromtimestamp(last_modified_seconds).strftime("%Y-%m-%d %H:%M:%S")
                print("size=%s mtime=%s %s" % (content["size"], last_modified_str, content["path"]))

        :returns: the list of files, in the `items` field. Each item has fields:

                    * **path** : path of the file inside the folder
                    * **size** : size of the file in bytes
                    * **lastModified** : last modification time, in milliseconds since epoch

        :rtype: dict
        """
        return self.client._perform_json(
            "GET", "/projects/%s/managedfolders/%s/contents" % (self.project_key, self.odb_id))

    def get_file(self, path):
        """
        Get a file from the managed folder

        Usage example:

        .. code-block:: python

            with folder.get_file("/kaggle_titanic_train.csv") as fd:
                df = pandas.read_csv(fd.raw)

        :param string path: the path of the file to read within the folder

        :returns: the HTTP request to stream the data from
        :rtype: :class:`requests.models.Response`
        """
        return self.client._perform_raw(
            "GET", "/projects/%s/managedfolders/%s/contents/%s" % (self.project_key, self.odb_id, utils.quote(path)))

    def delete_file(self, path):
        """
        Delete a file from the managed folder

        :param string path: the path of the file to delete within the folder

        .. note::

            No error is raised if the file doesn't exist
        """
        return self.client._perform_empty(
            "DELETE", "/projects/%s/managedfolders/%s/contents/%s" % (self.project_key, self.odb_id, utils.quote(path)))

    def put_file(self, path, f):
        """
        Upload the file to the managed folder. If the file already exists in the folder, it is overwritten.

        Usage example:

        .. code-block:: python

            with open("./some_local.csv", "rb") as fd:
                uploaded = folder.put_file("/target.csv", fd)
                print("Uploaded %s bytes" % uploaded["size"])

        :param string path: the path of the file to write within the folder
        :param file f: a file-like

        .. note::

            if using a string for the `f` parameter, the string itself is taken as the file content to upload

        :returns: information on the file uploaded to the folder, as a dict of:

                    * **path** : path of the file inside the folder
                    * **size** : size of the file in bytes
                    * **lastModified** : last modification time, in milliseconds since epoch

        :rtype: dict
        """
        # the upload call hands back the raw HTTP response; it is decoded here so that callers get a dict
        return self.client._perform_json_upload(
            "POST", "/projects/%s/managedfolders/%s/contents/%s" % (self.project_key, self.odb_id, utils.quote(path)),
            "", f).json()

    def upload_folder(self, path, folder):
        """
        Upload the content of a folder to a managed folder.

        .. note::

            `upload_folder("/some/target", "./a/source/")` will result in "target" containing the contents of "source",
            but not the "source" folder being a child of "target"

        :param str path: the destination path of the folder in the managed folder
        :param str folder: local path (absolute or relative) of the source folder to upload
        """
        for root, _, file_names in os.walk(folder):
            for file_name in file_names:
                local_path = os.path.join(root, file_name)
                # paths inside the managed folder always use "/", whatever the local OS separator is
                rel_posix_path = "/".join(os.path.relpath(local_path, folder).split(os.sep))
                with open(local_path, "rb") as f:
                    self.put_file("{}/{}".format(path, rel_posix_path), f)

    ########################################################
    # Managed folder actions
    ########################################################
    def compute_metrics(self, metric_ids=None, probes=None):
        """
        Compute metrics on this managed folder.

        Usage example:

        .. code-block:: python

            future_resp = folder.compute_metrics()
            future = DSSFuture(client, future_resp.get("jobId", None), future_resp)
            metrics = future.wait_for_result()
            print("Computed in %s ms" % (metrics["endTime"] - metrics["startTime"]))
            for computed in metrics["computed"]:
                print("Metric %s = %s" % (computed["metricId"], computed["value"]))

        :param metric_ids: (optional) identifiers of metrics to compute, among the metrics defined
                           on the folder. Takes precedence over `probes` when both are given
        :type metric_ids: list[string]
        :param probes: (optional) definition of metrics probes to use, in place of the ones defined
                       on the folder. The current set of probes on the folder is the `probes` field
                       in the dict returned by :meth:`get_definition`
        :type probes: dict

        :returns: a future as dict representing the task of computing the probes
        :rtype: dict
        """
        url = "/projects/%s/managedfolders/%s/actions" % (self.project_key, self.odb_id)
        if metric_ids is not None:
            return self.client._perform_json(
                "POST", "%s/computeMetricsFromIds" % url,
                body={"metricIds": metric_ids})
        if probes is not None:
            return self.client._perform_json(
                "POST", "%s/computeMetrics" % url,
                body=probes)
        return self.client._perform_json(
            "POST", "%s/computeMetrics" % url)

    ########################################################
    # Metrics
    ########################################################
    def get_last_metric_values(self):
        """
        Get the last values of the metrics on this managed folder.

        :returns: a handle on the values of the metrics
        :rtype: :class:`dataikuapi.dss.metrics.ComputedMetrics`
        """
        return ComputedMetrics(self.client._perform_json(
            "GET", "/projects/%s/managedfolders/%s/metrics/last" % (self.project_key, self.odb_id)))

    def get_metric_history(self, metric):
        """
        Get the history of the values of a metric on this managed folder.

        Usage example:

        .. code-block:: python

            history = folder.get_metric_history("basic:COUNT_FILES")
            for value in history["values"]:
                time_str = datetime.fromtimestamp(value["time"] / 1000).strftime("%Y-%m-%d %H:%M:%S")
                print("%s : %s" % (time_str, value["value"]))

        :param string metric: identifier of the metric to get values of. A value that is not a string
                              is sent JSON-encoded

        :returns: an object containing the values of the metric, cast to the appropriate type (double,
                  boolean,...). The identifier of the metric is in a **metricId** field.
        :rtype: dict
        """
        # `basestring` covers str and unicode on Python 2, and is aliased to str on Python 3
        # (the bare `unicode` name doesn't exist there)
        metric_lookup = metric if isinstance(metric, basestring) else json.dumps(metric)
        return self.client._perform_json(
            "GET", "/projects/%s/managedfolders/%s/metrics/history" % (self.project_key, self.odb_id),
            params={'metricLookup': metric_lookup})

    ########################################################
    # Misc
    ########################################################
    def get_zone(self):
        """
        Get the flow zone of this managed folder.

        :returns: a flow zone
        :rtype: :class:`dataikuapi.dss.flow.DSSFlowZone`
        """
        return self.project.get_flow().get_zone_of_object(self)

    def move_to_zone(self, zone):
        """
        Move this object to a flow zone.

        :param object zone: a :class:`dataikuapi.dss.flow.DSSFlowZone` where to move the object, or its identifier
        """
        if isinstance(zone, basestring):
            zone = self.project.get_flow().get_zone(zone)
        zone.add_item(self)

    def share_to_zone(self, zone):
        """
        Share this object to a flow zone.

        :param object zone: a :class:`dataikuapi.dss.flow.DSSFlowZone` where to share the object, or its identifier
        """
        if isinstance(zone, basestring):
            zone = self.project.get_flow().get_zone(zone)
        zone.add_shared(self)

    def unshare_from_zone(self, zone):
        """
        Unshare this object from a flow zone.

        :param object zone: a :class:`dataikuapi.dss.flow.DSSFlowZone` from where to unshare the object, or its identifier
        """
        if isinstance(zone, basestring):
            zone = self.project.get_flow().get_zone(zone)
        zone.remove_shared(self)

    def get_usages(self):
        """
        Get the recipes referencing this folder.

        Usage example:

        .. code-block:: python

            for usage in folder.get_usages():
                if usage["type"] == 'RECIPE_INPUT':
                    print("Used as input of %s" % usage["objectId"])

        :returns: a list of usages, each one a dict of:

                    * **type** : the type of usage, either "RECIPE_INPUT" or "RECIPE_OUTPUT"
                    * **objectId** : name of the recipe
                    * **objectProjectKey** : project of the recipe

        :rtype: list[dict]
        """
        return self.client._perform_json("GET", "/projects/%s/managedfolders/%s/usages" % (self.project_key, self.odb_id))

    def get_object_discussions(self):
        """
        Get a handle to manage discussions on the managed folder.

        :returns: the handle to manage discussions
        :rtype: :class:`dataikuapi.dss.discussion.DSSObjectDiscussions`
        """
        return DSSObjectDiscussions(self.client, self.project_key, "MANAGED_FOLDER", self.odb_id)

    ########################################################
    # utilities
    ########################################################
    def copy_to(self, target, write_mode="OVERWRITE"):
        """
        Copy the data of this folder to another folder.

        :param object target: a :class:`dataikuapi.dss.managedfolder.DSSManagedFolder` representing the target location of this copy
        :param str write_mode: value sent as the `writeMode` of the copy request (defaults to **OVERWRITE**)

        :returns: a DSSFuture representing the operation
        :rtype: :class:`dataikuapi.dss.future.DSSFuture`
        """
        dqr = {
            "targetProjectKey": target.project_key,
            "targetFolderId": target.odb_id,
            "writeMode": write_mode
        }
        future_resp = self.client._perform_json("POST", "/projects/%s/managedfolders/%s/actions/copyTo" % (self.project_key, self.odb_id), body=dqr)
        return DSSFuture(self.client, future_resp.get("jobId", None), future_resp)

    def create_dataset_from_files(self, dataset_name):
        """
        Create a new dataset of type 'FilesInFolder', taking its files from this managed folder, and
        return a handle to interact with it.

        The created dataset does not have its format and schema initialized, it is recommended to use
        :meth:`~dataikuapi.dss.dataset.DSSDataset.autodetect_settings` on the returned object

        :param str dataset_name: the name of the dataset to create. Must not already exist

        :returns: A dataset handle
        :rtype: :class:`dataikuapi.dss.dataset.DSSDataset`
        """
        obj = {
            "name": dataset_name,
            "projectKey": self.project_key,
            "type": "FilesInFolder",
            "params": {
                "folderSmartId": self.odb_id
            }
        }
        self.client._perform_json("POST", "/projects/%s/datasets/" % self.project_key, body=obj)
        return DSSDataset(self.client, self.project_key, dataset_name)
class DSSManagedFolderSettings(DSSTaggableObjectSettings):
    """
    Base settings class for a DSS managed folder.
    Do not instantiate this class directly, use :meth:`DSSManagedFolder.get_settings`

    Use :meth:`save` to save your changes
    """
    def __init__(self, folder, settings):
        super(DSSManagedFolderSettings, self).__init__(settings)
        self.folder = folder
        self.settings = settings

    def get_raw(self):
        """
        Get the managed folder settings.

        :returns: the settings, as a dict. The definition of the actual location of the files in the
                  managed folder is a **params** sub-dict.
        :rtype: dict
        """
        return self.settings

    def get_raw_params(self):
        """
        Get the type-specific (S3/ filesystem/ HDFS/ ...) params as a dict.

        :returns: the type-specific params. Each type defines a set of fields; commonly found fields are :

                    * **connection** : name of the connection used by the managed folder
                    * **path** : root of the managed folder within the connection
                    * **bucket** or **container** : the bucket/container name on cloud storages

        :rtype: dict
        """
        return self.settings["params"]

    @property
    def type(self):
        """
        Get the type of filesystem that the managed folder uses.

        :rtype: string
        """
        return self.settings["type"]

    def save(self):
        """
        Save the changes to the settings on the managed folder.

        Usage example:

        .. code-block:: python

            folder = project.get_managed_folder("my_folder_id")
            settings = folder.get_settings()
            settings.set_connection_and_path("some_S3_connection", None)
            settings.get_raw_params()["bucket"] = "some_S3_bucket"
            settings.save()
        """
        self.folder.client._perform_empty(
            "PUT", "/projects/%s/managedfolders/%s" % (self.folder.project_key, self.folder.odb_id),
            body=self.settings)

    ########################################################
    # Partitioning
    ########################################################
    def _get_partitioning(self):
        """
        Get the **partitioning** sub-dict of the settings, creating it (and its list of
        dimensions) when it is missing or null, so that the partitioning helpers don't
        fail on settings lacking a partitioning block.

        :rtype: dict
        """
        partitioning = self.settings.get("partitioning")
        if partitioning is None:
            partitioning = {}
            self.settings["partitioning"] = partitioning
        if partitioning.get("dimensions") is None:
            partitioning["dimensions"] = []
        return partitioning

    def remove_partitioning(self):
        """
        Make the managed folder non-partitioned.
        """
        self.settings["partitioning"] = {"dimensions": []}

    def add_discrete_partitioning_dimension(self, dim_name):
        """
        Add a discrete partitioning dimension.

        :param string dim_name: name of the partitioning dimension
        """
        dimension = {"name": dim_name, "type": "value"}
        self._get_partitioning()["dimensions"].append(dimension)

    def add_time_partitioning_dimension(self, dim_name, period="DAY"):
        """
        Add a time partitioning dimension.

        :param string dim_name: name of the partitioning dimension
        :param string period: granularity of the partitioning dimension (YEAR, MONTH, DAY (default), HOUR)
        """
        dimension = {"name": dim_name, "type": "time", "params": {"period": period}}
        self._get_partitioning()["dimensions"].append(dimension)

    def set_partitioning_file_pattern(self, pattern):
        """
        Set the partitioning pattern of the folder. The pattern indicates which paths inside the folder belong to
        which partition. Partition dimensions are written with:

        * `%{dim_name}` for discrete dimensions
        * `%Y` (=year), `%M` (=month), `%D` (=day) and `%H` (=hour) for time dimensions

        Besides the `%...` variables for injecting the partition dimensions, the pattern is a regular expression.

        Usage example:

        .. code-block:: python

            # partition a managed folder by month
            folder = project.get_managed_folder("my_folder_id")
            settings = folder.get_settings()
            settings.add_time_partitioning_dimension("my_date", "MONTH")
            settings.set_partitioning_file_pattern("/year=%Y/month=%M/.*")
            settings.save()

        :param string pattern: the partitioning pattern
        """
        self._get_partitioning()["filePathPattern"] = pattern

    ########################################################
    # Basic
    ########################################################
    def set_connection_and_path(self, connection, path):
        """
        Change the managed folder connection and/or path.

        .. note::

            When changing the connection or path, the folder's files aren't moved or copied to the new location

        .. attention::

            When changing the connection for a connection with a different type, for example going from a S3 connection
            to an Azure Blob Storage connection, only the managed folder type is changed. Type-specific fields are not
            converted. In the example of a S3 to Azure conversion, the S3 bucket isn't converted to a storage account
            container.

        :param string connection: the name of a file-based connection. If `None`, the connection of the managed folder is
                                  left unchanged
        :param string path: a path relative to the connection root. If `None`, the path of the managed folder is left
                            unchanged
        """
        params = self.settings["params"]
        if connection is not None:
            # .get(): params may not hold a connection yet, which counts as a connection change
            if connection != params.get("connection"):
                # get the actual connection type (and check that it exists)
                connection_info = self.folder.client.get_connection(connection).get_info(self.folder.project_key)
                connection_type = connection_info["type"]
                if connection_type == 'EC2':
                    self.settings["type"] = 'S3'  # the fsprovider type is different
                elif connection_type == 'SSH':
                    # can be SCP or SFTP, default to SCP if connection type changed
                    if self.settings["type"] not in ['SCP', 'SFTP']:
                        self.settings["type"] = 'SCP'
                else:
                    self.settings["type"] = connection_type
            params["connection"] = connection
        if path is not None:
            params["path"] = path