sklearn/tests/test_common.py

"""
General tests for all estimators in sklearn.
"""

# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
#          Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause
from __future__ import print_function

import os
import warnings
import sys
import pkgutil

from sklearn.externals.six import PY3
from sklearn.externals.six.moves import zip
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import all_estimators
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import SkipTest

import sklearn
from sklearn.base import (ClassifierMixin, RegressorMixin,
                          TransformerMixin, ClusterMixin)
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

from sklearn.cross_validation import train_test_split
from sklearn.linear_model.base import LinearClassifierMixin
from sklearn.utils.estimator_checks import (
    check_parameters_default_constructible,
    check_regressors_classifiers_sparse_data,
    check_transformer,
    check_clustering,
    check_regressors_int,
    check_regressors_train,
    check_regressors_pickle,
    check_transformer_sparse_data,
    check_transformer_pickle,
    check_estimators_nan_inf,
    check_classifiers_one_label,
    check_classifiers_train,
    check_classifiers_classes,
    check_classifiers_input_shapes,
    check_classifiers_pickle,
    check_class_weight_classifiers,
    check_class_weight_auto_classifiers,
    check_class_weight_auto_linear_classifier,
    check_estimators_overwrite_params,
    check_cluster_overwrite_params,
    check_sparsify_binary_classifier,
    check_sparsify_multiclass_classifier,
    check_classifier_data_not_an_array,
    check_regressor_data_not_an_array,
    check_transformer_data_not_an_array,
    check_transformer_n_iter,
    check_non_transformer_estimators_n_iter,
    CROSS_DECOMPOSITION)


def test_all_estimator_no_base_class():
    # test that all_estimators doesn't find abstract classes.
    for name, Estimator in all_estimators():
        msg = ("Base estimators such as {0} should not be included"
               " in all_estimators").format(name)
        assert_false(name.lower().startswith('base'), msg=msg)


def test_all_estimators():
    # Test that estimators are default-constructible, clonable
    # and have working repr.
    estimators = all_estimators(include_meta_estimators=True)

    # Meta sanity-check to make sure that the estimator introspection runs
    # properly
    assert_greater(len(estimators), 0)

    for name, Estimator in estimators:
        # some can just not be sensibly default constructed
        yield check_parameters_default_constructible, name, Estimator


def test_estimators_sparse_data():
    # All estimators should either deal with sparse data or raise an
    # exception with type TypeError and an intelligible error message
    estimators = all_estimators()
    estimators = [(name, Estimator) for name, Estimator in estimators
                  if issubclass(Estimator, (ClassifierMixin, RegressorMixin))]
    for name, Estimator in estimators:
        yield check_regressors_classifiers_sparse_data, name, Estimator


def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    for name, Transformer in transformers:
        # All transformers should either deal with sparse data or raise an
        # exception with type TypeError and an intelligible error message
        yield check_transformer_sparse_data, name, Transformer
        yield check_transformer_pickle, name, Transformer
        if name not in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer',
                        'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']:
            yield check_transformer_data_not_an_array, name, Transformer
        # these don't actually fit the data, so don't raise errors
        if name not in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            # basic tests
            yield check_transformer, name, Transformer


def test_estimators_nan_inf():
    # Test that all estimators check their input for NaN's and infs
    estimators = all_estimators()
    estimators = [(name, E) for name, E in estimators
                  if (issubclass(E, ClassifierMixin) or
                      issubclass(E, RegressorMixin) or
                      issubclass(E, TransformerMixin) or
                      issubclass(E, ClusterMixin))]
    for name, Estimator in estimators:
        if name not in CROSS_DECOMPOSITION + ['Imputer']:
            yield check_estimators_nan_inf, name, Estimator


def test_clustering():
    # test if clustering algorithms do something sensible
    # also test all shapes / shape errors
    clustering = all_estimators(type_filter='cluster')
    for name, Alg in clustering:
        # test whether any classifier overwrites his init parameters during fit
        yield check_cluster_overwrite_params, name, Alg
        if name not in ('WardAgglomeration', "FeatureAgglomeration"):
            # this is clustering on the features
            # let's not test that here.
            yield check_clustering, name, Alg


def test_classifiers():
    # test if classifiers can cope with non-consecutive classes
    classifiers = all_estimators(type_filter='classifier')
    for name, Classifier in classifiers:
        # test classfiers can handle non-array data
        yield check_classifier_data_not_an_array, name, Classifier
        # test classifiers trained on a single label always return this label
        yield check_classifiers_one_label, name, Classifier
        yield check_classifiers_classes, name, Classifier
        yield check_classifiers_pickle, name, Classifier
        # basic consistency testing
        yield check_classifiers_train, name, Classifier
        if (name not in ["MultinomialNB", "LabelPropagation", "LabelSpreading"]
            # TODO some complication with -1 label
                and name not in ["DecisionTreeClassifier", "ExtraTreeClassifier"]):
                # We don't raise a warning in these classifiers, as
                # the column y interface is used by the forests.

            # test if classifiers can cope with y.shape = (n_samples, 1)
            yield check_classifiers_input_shapes, name, Classifier


def test_regressors():
    regressors = all_estimators(type_filter='regressor')
    # TODO: test with intercept
    # TODO: test with multiple responses
    for name, Regressor in regressors:
        # basic testing
        yield check_regressors_train, name, Regressor
        yield check_regressor_data_not_an_array, name, Regressor
        # Test that estimators can be pickled, and once pickled
        # give the same answer as before.
        yield check_regressors_pickle, name, Regressor
        if name != 'CCA':
            # check that the regressor handles int input
            yield check_regressors_int, name, Regressor


def test_configure():
    # Smoke test the 'configure' step of setup, this tests all the
    # 'configure' functions in the setup.pys in the scikit
    cwd = os.getcwd()
    setup_path = os.path.abspath(os.path.join(sklearn.__path__[0], '..'))
    setup_filename = os.path.join(setup_path, 'setup.py')
    if not os.path.exists(setup_filename):
        return
    try:
        os.chdir(setup_path)
        old_argv = sys.argv
        sys.argv = ['setup.py', 'config']
        with warnings.catch_warnings():
            # The configuration spits out warnings when not finding
            # Blas/Atlas development headers
            warnings.simplefilter('ignore', UserWarning)
            if PY3:
                exec(open('setup.py').read(), dict(__name__='__main__'))
            else:
                execfile('setup.py', dict(__name__='__main__'))
    finally:
        sys.argv = old_argv
        os.chdir(cwd)


def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if 'class_weight' in c[1]().get_params().keys()]

    for name, Classifier in classifiers:
        if name == "NuSVC":
            # the sparse version has a parameter that doesn't do anything
            continue
        if name.endswith("NB"):
            # NaiveBayes classifiers have a somewhat different interface.
            # FIXME SOON!
            continue
        yield check_class_weight_classifiers, name, Classifier


def test_class_weight_auto_classifiers():
    """Test that class_weight="auto" improves f1-score"""

    # This test is broken; its success depends on:
    # * a rare fortuitous RNG seed for make_classification; and
    # * the use of binary F1 over a seemingly arbitrary positive class for two
    #   datasets, and weighted average F1 for the third.
    # Its expectations need to be clarified and reimplemented.
    raise SkipTest('This test requires redefinition')

    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if 'class_weight' in c[1]().get_params().keys()]

    for n_classes, weights in zip([2, 3], [[.8, .2], [.8, .1, .1]]):
        # create unbalanced dataset
        X, y = make_classification(n_classes=n_classes, n_samples=200,
                                   n_features=10, weights=weights,
                                   random_state=0, n_informative=n_classes)
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                            random_state=0)
        for name, Classifier in classifiers:
            if (name != "NuSVC"
                # the sparse version has a parameter that doesn't do anything
                    and not name.startswith("RidgeClassifier")
                    # RidgeClassifier behaves unexpected
                    # FIXME!
                    and not name.endswith("NB")):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                yield (check_class_weight_auto_classifiers, name, Classifier,
                       X_train, y_train, X_test, y_test, weights)


def test_class_weight_auto_linear_classifiers():
    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        linear_classifiers = [
            (name, clazz)
            for name, clazz in classifiers
            if 'class_weight' in clazz().get_params().keys()
               and issubclass(clazz, LinearClassifierMixin)]

    for name, Classifier in linear_classifiers:
        if name == "LogisticRegressionCV":
            # Contrary to RidgeClassifierCV, LogisticRegressionCV use actual
            # CV folds and fit a model for each CV iteration before averaging
            # the coef. Therefore it is expected to not behave exactly as the
            # other linear model.
            continue
        yield check_class_weight_auto_linear_classifier, name, Classifier


def test_estimators_overwrite_params():
    # test whether any classifier overwrites his init parameters during fit
    for est_type in ["classifier", "regressor", "transformer"]:
        estimators = all_estimators(type_filter=est_type)
        for name, Estimator in estimators:
            if (name not in ['CCA', '_CCA', 'PLSCanonical', 'PLSRegression',
                             'PLSSVD', 'GaussianProcess']):
                # FIXME!
                # in particular GaussianProcess!
                yield check_estimators_overwrite_params, name, Estimator


def test_import_all_consistency():
    # Smoke test to check that any name in a __all__ list is actually defined
    # in the namespace of the module or package.
    pkgs = pkgutil.walk_packages(path=sklearn.__path__, prefix='sklearn.',
                                 onerror=lambda _: None)
    for importer, modname, ispkg in pkgs:
        if ".tests." in modname:
            continue
        package = __import__(modname, fromlist="dummy")
        for name in getattr(package, '__all__', ()):
            if getattr(package, name, None) is None:
                raise AttributeError(
                    "Module '{0}' has no attribute '{1}'".format(
                        modname, name))


def test_sparsify_estimators():
    #Test if predict with sparsified estimators works.
    #Tests regression, binary classification, and multi-class classification.
    estimators = all_estimators()

    # test regression and binary classification
    for name, Estimator in estimators:
        try:
            Estimator.sparsify
            yield check_sparsify_binary_classifier, name, Estimator
        except:
            pass

    # test multiclass classification
    classifiers = all_estimators(type_filter='classifier')
    for name, Classifier in classifiers:
        try:
            Classifier.sparsify
            yield check_sparsify_multiclass_classifier, name, Classifier
        except:
            pass


def test_non_transformer_estimators_n_iter():
    # Test that all estimators of type which are non-transformer
    # and which have an attribute of max_iter, return the attribute
    # of n_iter atleast 1.
    for est_type in ['regressor', 'classifier', 'cluster']:
        regressors = all_estimators(type_filter=est_type)
        for name, Estimator in regressors:
            # LassoLars stops early for the default alpha=1.0 for
            # the iris dataset.
            if name == 'LassoLars':
                estimator = Estimator(alpha=0.)
            else:
                estimator = Estimator()
            if hasattr(estimator, "max_iter"):
                # These models are dependent on external solvers like
                # libsvm and accessing the iter parameter is non-trivial.
                if name in (['Ridge', 'SVR', 'NuSVR', 'NuSVC',
                             'RidgeClassifier', 'SVC', 'RandomizedLasso',
                             'LogisticRegressionCV']):
                    continue

                # Tested in test_transformer_n_iter below
                elif name in CROSS_DECOMPOSITION or (
                    name in ['LinearSVC', 'LogisticRegression']
                    ):
                    continue

                else:
                    # Multitask models related to ENet cannot handle
                    # if y is mono-output.
                    yield (check_non_transformer_estimators_n_iter,
                           name, estimator, 'Multi' in name)


def test_transformer_n_iter():
    transformers = all_estimators(type_filter='transformer')
    for name, Estimator in transformers:
        estimator = Estimator()
        # Dependent on external solvers and hence accessing the iter
        # param is non-trivial.
        external_solver = ['Isomap', 'KernelPCA', 'LocallyLinearEmbedding',
                           'RandomizedLasso', 'LogisticRegressionCV']

        if hasattr(estimator, "max_iter") and name not in external_solver:
            yield check_transformer_n_iter, name, estimator