Skip to content

[Experimental] EDA typed computation definitions #61

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: release/7.0
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 143 additions & 0 deletions dataikuapi/dss/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,149 @@ def get_raw(self):

@staticmethod
def _from_computation_or_dict(computation_or_dict):
    """Wrap a computation definition into a DSSStatisticsComputationSettings.

    :param computation_or_dict: a ComputationBase (converted through its
        ``to_model()``), a DSSStatisticsComputationSettings (unwrapped through
        its ``get_raw()``), or any other value, which is passed through as-is
        (presumably a raw dict definition -- confirm against callers)
    :returns: a new DSSStatisticsComputationSettings built on the raw definition
    """
    raw = computation_or_dict
    # The two checks are deliberately sequential (not exclusive): the result of
    # to_model() goes through the second check as well.
    if isinstance(raw, ComputationBase):
        raw = raw.to_model()
    if isinstance(raw, DSSStatisticsComputationSettings):
        raw = raw.get_raw()
    return DSSStatisticsComputationSettings(raw)

class ComputationBase(object):
    """Base class of the typed computation definitions.

    Provides helpers wrapping the current computation into a
    GroupedComputation with a given grouping definition.
    """

    def __init__(self):
        pass

    def grouped_by_alphanum(self, column, max_values=10, group_others=False):
        """Group this computation using an "anum" grouping on a column.

        :param column: name of the grouping column
        :param max_values: value emitted as "maxValues" in the grouping
        :param group_others: value emitted as "groupOthers" in the grouping
        :returns: a GroupedComputation wrapping this computation
        """
        return GroupedComputation(self, {
            "type": "anum",
            "column": column,
            "maxValues": max_values,
            "groupOthers": group_others,
        })

    def grouped_by_bins(self, column, nb_bins=None, bin_size=None, keep_na=False):
        """Group this computation using a "binned" grouping on a column.

        Exactly one binning mode is used: ``nb_bins`` ("FIXED_NB" mode) takes
        precedence over ``bin_size`` ("FIXED_SIZE" mode) when both are given.

        :param column: name of the grouping column
        :param nb_bins: value emitted as "nbBins" ("FIXED_NB" mode)
        :param bin_size: value emitted as "binSize" ("FIXED_SIZE" mode)
        :param keep_na: value emitted as "keepNA" in the grouping
        :returns: a GroupedComputation wrapping this computation
        :raises ValueError: if neither ``nb_bins`` nor ``bin_size`` is given
        """
        if nb_bins is not None:
            grouping = {
                "type": "binned",
                "column": column,
                "mode": "FIXED_NB",
                "nbBins": nb_bins,
                "keepNA": keep_na,
            }
        elif bin_size is not None:
            grouping = {
                "type": "binned",
                "column": column,
                "mode": "FIXED_SIZE",
                "binSize": bin_size,
                "keepNA": keep_na,
            }
        else:
            # Fail early: silently returning None only postpones the error to
            # the place where the (missing) computation is eventually used.
            raise ValueError("Either nb_bins or bin_size must be specified")
        return GroupedComputation(self, grouping)

class DescriptiveStatistics(ComputationBase):
    """Definition of a set of per-column statistics.

    Each boolean flag enables one statistic, which is then emitted for every
    column of ``columns``.
    """

    def __init__(self, columns, mean=False, sum=False, stddev=False, variance=False,
                 skewness=False, kurtosis=False, sem=False):
        self.columns = columns
        self.mean, self.sum = mean, sum
        self.stddev, self.variance = stddev, variance
        self.skewness, self.kurtosis = skewness, kurtosis
        self.sem = sem

    def to_model(self):
        """Build the "multi" computation holding one entry per (column, statistic).

        Entries are ordered by column first, then by statistic in the fixed
        order: mean, sum, std_dev, variance, skewness, kurtosis, sem.
        """
        flag_by_type = (
            ("mean", self.mean),
            ("sum", self.sum),
            ("std_dev", self.stddev),
            ("variance", self.variance),
            ("skewness", self.skewness),
            ("kurtosis", self.kurtosis),
            ("sem", self.sem),
        )
        enabled_types = [stat_type for stat_type, enabled in flag_by_type if enabled]
        per_column = [
            {"type": stat_type, "column": col}
            for col in self.columns
            for stat_type in enabled_types
        ]
        return {"type": "multi", "computations": per_column}

class Quantiles(ComputationBase):
    """Definition of a "quantiles" computation on a single column."""

    # Frequencies used when the caller does not provide any (immutable on purpose).
    _DEFAULT_FREQS = (0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99)

    def __init__(self, column, freqs=None, confidence=None):
        """
        :param column: name of the column
        :param freqs: list of frequencies emitted as "freqs"; defaults to
            [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
        :param confidence: value emitted as-is as "confidence"
        """
        self.column = column
        # Give each instance its own list: a list literal used as default
        # argument is created once and would be shared (and mutable) across
        # every instance relying on the default.
        self.freqs = list(self._DEFAULT_FREQS) if freqs is None else freqs
        self.confidence = confidence

    def to_model(self):
        """Return the raw dict definition of this computation."""
        return {
            "type": "quantiles",
            "column": self.column,
            "freqs": self.freqs,
            "confidence": self.confidence,
        }

class TTest1Sample(ComputationBase):
    """Definition of a "ttest_1samp" computation on a single column."""

    def __init__(self, column, hypothesized_mean):
        self.hypothesized_mean = hypothesized_mean
        self.column = column

    def to_model(self):
        """Return the raw dict definition of this computation."""
        model = {"type": "ttest_1samp"}
        model["column"] = self.column
        model["hypothesizedMean"] = self.hypothesized_mean
        return model

class DistributionFit(ComputationBase):
    """Definition of a "fit_distribution" computation on a single column.

    Extra keyword arguments are merged into the emitted distribution
    definition, next to its "type".
    """

    def __init__(self, column, type="normal", test=True, **kwargs):
        self.column = column
        self.type = type
        self.test = test
        self.distribution_args = kwargs

    def _distribution_model(self):
        # "type" is inserted first so it stays ahead of the extra arguments.
        model = {"type": self.type}
        for arg_name, arg_value in self.distribution_args.items():
            model[arg_name] = arg_value
        return model

    def to_model(self):
        """Return the raw dict definition of this computation."""
        model = {"type": "fit_distribution"}
        model["column"] = self.column
        model["distribution"] = self._distribution_model()
        model["test"] = self.test
        return model

class _BasicBivariateComputation(ComputationBase):
    """Shared implementation of the computations taking a pair of columns.

    The computation type is provided by the caller and emitted as "type".
    """

    def __init__(self, type, column1, column2):
        self.type = type
        self.column1, self.column2 = column1, column2

    def to_model(self):
        """Return the raw dict definition: type, "xColumn" and "yColumn"."""
        model = {"type": self.type}
        model["xColumn"] = self.column1
        model["yColumn"] = self.column2
        return model


class Pearson(_BasicBivariateComputation):
    """Bivariate computation of type "pearson" between two columns."""

    def __init__(self, column1, column2):
        super(Pearson, self).__init__(
            type="pearson",
            column1=column1,
            column2=column2,
        )
class Covariance(_BasicBivariateComputation):
    """Bivariate computation of type "covariance" between two columns."""

    def __init__(self, column1, column2):
        # super() must be given this very class: passing an unrelated class
        # raises TypeError because self is not an instance of it.
        super(Covariance, self).__init__("covariance", column1, column2)
class Spearman(_BasicBivariateComputation):
    """Bivariate computation of type "spearman" between two columns."""

    def __init__(self, column1, column2):
        # super() must be given this very class: passing an unrelated class
        # raises TypeError because self is not an instance of it.
        super(Spearman, self).__init__("spearman", column1, column2)


class GroupedComputation(ComputationBase):
    """Definition of a computation applied with a grouping.

    Wraps another computation (anything exposing ``to_model()``) together
    with a raw grouping definition.
    """

    def __init__(self, computation, grouping):
        self.grouping = grouping
        self.computation = computation

    def to_model(self):
        """Return the "grouped" definition embedding the wrapped computation's model."""
        inner_model = self.computation.to_model()
        return {
            "type": "grouped",
            "computation": inner_model,
            "grouping": self.grouping,
        }