MAINT Update fetch_openml to use the auto parser by default (scikit-learn#27802)

ogrisel · web-flow · commit 3ab6c8cf0cf6 · 2023-11-20T11:20:20.000+01:00
diff --git a/asv_benchmarks/benchmarks/datasets.py b/asv_benchmarks/benchmarks/datasets.py
@@ -60,9 +60,7 @@ def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), dtype=np.float
 
 @M.cache
 def _mnist_dataset(dtype=np.float32):
-    X, y = fetch_openml(
-        "mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
-    )
+    X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
     X = X.astype(dtype, copy=False)
     X = MaxAbsScaler().fit_transform(X)
 
diff --git a/benchmarks/bench_hist_gradient_boosting_adult.py b/benchmarks/bench_hist_gradient_boosting_adult.py
@@ -49,7 +49,7 @@ def predict(est, data_test, target_test):
     print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
 
 
-data = fetch_openml(data_id=179, as_frame=True, parser="pandas")  # adult dataset
+data = fetch_openml(data_id=179, as_frame=True)  # adult dataset
 X, y = data.data, data.target
 
 # Ordinal encode the categories to use the native support available in HGBDT
diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py
@@ -64,7 +64,7 @@ def print_outlier_ratio(y):
         y = dataset.target
 
     if dat == "shuttle":
-        dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
+        dataset = fetch_openml("shuttle", as_frame=False)
         X = dataset.data
         y = dataset.target.astype(np.int64)
         X, y = sh(X, y, random_state=random_state)
diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py
@@ -46,7 +46,7 @@
         y = dataset.target
 
     if dataset_name == "shuttle":
-        dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
+        dataset = fetch_openml("shuttle", as_frame=False)
         X = dataset.data
         y = dataset.target.astype(np.int64)
         # we remove data with label 4
diff --git a/benchmarks/bench_mnist.py b/benchmarks/bench_mnist.py
@@ -60,7 +60,7 @@ def load_data(dtype=np.float32, order="F"):
     ######################################################################
     # Load dataset
     print("Loading dataset...")
-    data = fetch_openml("mnist_784", as_frame=True, parser="pandas")
+    data = fetch_openml("mnist_784", as_frame=True)
     X = check_array(data["data"], dtype=dtype, order=order)
     y = data["target"]
 
diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py
@@ -192,7 +192,7 @@ def get_data(dataset_name):
         del row
         del col
     else:
-        X = fetch_openml(dataset_name, parser="auto").data
+        X = fetch_openml(dataset_name).data
     return X
 
 
diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py
@@ -36,7 +36,7 @@
 def load_data(dtype=np.float32, order="C", shuffle=True, seed=0):
     """Load the data, then cache and memmap the train/test split"""
     print("Loading dataset...")
-    data = fetch_openml("mnist_784", as_frame=True, parser="pandas")
+    data = fetch_openml("mnist_784", as_frame=True)
 
     X = check_array(data["data"], dtype=dtype, order=order)
     y = data["target"]
diff --git a/doc/datasets/loading_other_datasets.rst b/doc/datasets/loading_other_datasets.rst
@@ -99,7 +99,7 @@ from the repository using the function
 For example, to download a dataset of gene expressions in mice brains::
 
   >>> from sklearn.datasets import fetch_openml
-  >>> mice = fetch_openml(name='miceprotein', version=4, parser="auto")
+  >>> mice = fetch_openml(name='miceprotein', version=4)
 
 To fully specify a dataset, you need to provide a name and a version, though
 the version is optional, see :ref:`openml_versions` below.
@@ -147,7 +147,7 @@ dataset on the openml website::
 
 The ``data_id`` also uniquely identifies a dataset from OpenML::
 
-  >>> mice = fetch_openml(data_id=40966, parser="auto")
+  >>> mice = fetch_openml(data_id=40966)
   >>> mice.details # doctest: +SKIP
   {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
   'creator': ...,
@@ -171,7 +171,7 @@ which can contain entirely different datasets.
 If a particular version of a dataset has been found to contain significant
 issues, it might be deactivated. Using a name to specify a dataset will yield
 the earliest version of a dataset that is still active. That means that
-``fetch_openml(name="miceprotein", parser="auto")`` can yield different results
+``fetch_openml(name="miceprotein")`` can yield different results
 at different times if earlier versions become inactive.
 You can see that the dataset with ``data_id`` 40966 that we fetched above is
 the first version of the "miceprotein" dataset::
@@ -182,19 +182,19 @@ the first version of the "miceprotein" dataset::
 In fact, this dataset only has one version. The iris dataset on the other hand
 has multiple versions::
 
-  >>> iris = fetch_openml(name="iris", parser="auto")
+  >>> iris = fetch_openml(name="iris")
   >>> iris.details['version']  #doctest: +SKIP
   '1'
   >>> iris.details['id']  #doctest: +SKIP
   '61'
 
-  >>> iris_61 = fetch_openml(data_id=61, parser="auto")
+  >>> iris_61 = fetch_openml(data_id=61)
   >>> iris_61.details['version']
   '1'
   >>> iris_61.details['id']
   '61'
 
-  >>> iris_969 = fetch_openml(data_id=969, parser="auto")
+  >>> iris_969 = fetch_openml(data_id=969)
   >>> iris_969.details['version']
   '3'
   >>> iris_969.details['id']
@@ -212,7 +212,7 @@ binarized version of the data::
 You can also specify both the name and the version, which also uniquely
 identifies the dataset::
 
-  >>> iris_version_3 = fetch_openml(name="iris", version=3, parser="auto")
+  >>> iris_version_3 = fetch_openml(name="iris", version=3)
   >>> iris_version_3.details['version']
   '3'
   >>> iris_version_3.details['id']
diff --git a/examples/applications/plot_cyclical_feature_engineering.py b/examples/applications/plot_cyclical_feature_engineering.py
@@ -20,9 +20,7 @@
 # We start by loading the data from the OpenML repository.
 from sklearn.datasets import fetch_openml
 
-bike_sharing = fetch_openml(
-    "Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
-)
+bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
 df = bike_sharing.frame
 
 # %%
diff --git a/examples/applications/plot_digits_denoising.py b/examples/applications/plot_digits_denoising.py
@@ -37,7 +37,7 @@
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import MinMaxScaler
 
-X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True, parser="pandas")
+X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True)
 X = MinMaxScaler().fit_transform(X)
 
 # %%
diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py
@@ -45,9 +45,7 @@
 
 # %%
 # Load data from https://www.openml.org/d/40945
-X, y = fetch_openml(
-    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
-)
+X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
 
 # Alternatively X and y can be obtained directly from the frame attribute:
 # X = titanic.frame.drop('survived', axis=1)
diff --git a/examples/compose/plot_transformed_target.py b/examples/compose/plot_transformed_target.py
@@ -131,7 +131,7 @@ def compute_score(y_true, y_pred):
 from sklearn.datasets import fetch_openml
 from sklearn.preprocessing import quantile_transform
 
-ames = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
+ames = fetch_openml(name="house_prices", as_frame=True)
 # Keep only numeric columns
 X = ames.data.select_dtypes(np.number)
 # Remove columns with NaN or Inf values
diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py
@@ -30,7 +30,7 @@
 # are either categorical or numerical:
 from sklearn.datasets import fetch_openml
 
-X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True, parser="pandas")
+X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
 
 # Select only a subset of features of X to make the example faster to run
 categorical_columns_subset = [
diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py
@@ -45,7 +45,7 @@
 
 
 def load_ames_housing():
-    df = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
+    df = fetch_openml(name="house_prices", as_frame=True)
     X = df.data
     y = df.target
 
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
@@ -36,7 +36,7 @@
 # in OpenML.
 from sklearn.datasets import fetch_openml
 
-co2 = fetch_openml(data_id=41187, as_frame=True, parser="pandas")
+co2 = fetch_openml(data_id=41187, as_frame=True)
 co2.frame.head()
 
 # %%
diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py
@@ -55,7 +55,7 @@
 # as a pandas dataframe.
 from sklearn.datasets import fetch_openml
 
-survey = fetch_openml(data_id=534, as_frame=True, parser="pandas")
+survey = fetch_openml(data_id=534, as_frame=True)
 
 # %%
 # Then, we identify features `X` and targets `y`: the column WAGE is our
diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py
@@ -42,7 +42,7 @@
 # rentals using weather and season data as well as the datetime information.
 from sklearn.datasets import fetch_openml
 
-bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas")
+bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
 # Make an explicit copy to avoid "SettingWithCopyWarning" from pandas
 X, y = bikes.data.copy(), bikes.target
 
diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py
@@ -43,9 +43,7 @@
 from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split
 
-X, y = fetch_openml(
-    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
-)
+X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
 rng = np.random.RandomState(seed=42)
 X["random_cat"] = rng.randint(3, size=X.shape[0])
 X["random_num"] = rng.randn(X.shape[0])
diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -53,7 +53,7 @@
 # https://www.openml.org/d/41214
 from sklearn.datasets import fetch_openml
 
-df = fetch_openml(data_id=41214, as_frame=True, parser="pandas").frame
+df = fetch_openml(data_id=41214, as_frame=True).frame
 df
 
 # %%
diff --git a/examples/linear_model/plot_sgd_early_stopping.py b/examples/linear_model/plot_sgd_early_stopping.py
@@ -59,7 +59,7 @@
 def load_mnist(n_samples=None, class_0="0", class_1="8"):
     """Load MNIST, select two classes, shuffle and return only n_samples."""
     # Load data from http://openml.org/d/554
-    mnist = fetch_openml("mnist_784", version=1, as_frame=False, parser="pandas")
+    mnist = fetch_openml("mnist_784", version=1, as_frame=False)
 
     # take only two classes for binary classification
     mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)
diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py
@@ -36,9 +36,7 @@
 train_samples = 5000
 
 # Load data from https://www.openml.org/d/554
-X, y = fetch_openml(
-    "mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
-)
+X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
 
 random_state = check_random_state(0)
 permutation = random_state.permutation(X.shape[0])
diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
@@ -68,12 +68,12 @@ def load_mtpl2(n_samples=None):
       678013 samples.
     """
     # freMTPL2freq dataset from https://www.openml.org/d/41214
-    df_freq = fetch_openml(data_id=41214, as_frame=True, parser="pandas").data
+    df_freq = fetch_openml(data_id=41214, as_frame=True).data
     df_freq["IDpol"] = df_freq["IDpol"].astype(int)
     df_freq.set_index("IDpol", inplace=True)
 
     # freMTPL2sev dataset from https://www.openml.org/d/41215
-    df_sev = fetch_openml(data_id=41215, as_frame=True, parser="pandas").data
+    df_sev = fetch_openml(data_id=41215, as_frame=True).data
 
     # sum ClaimAmount over identical IDs
     df_sev = df_sev.groupby("IDpol").sum()
diff --git a/examples/miscellaneous/plot_display_object_visualization.py b/examples/miscellaneous/plot_display_object_visualization.py
@@ -29,7 +29,7 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 
-X, y = fetch_openml(data_id=1464, return_X_y=True, parser="pandas")
+X, y = fetch_openml(data_id=1464, return_X_y=True)
 X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
 
 clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
diff --git a/examples/miscellaneous/plot_outlier_detection_bench.py b/examples/miscellaneous/plot_outlier_detection_bench.py
@@ -201,9 +201,7 @@ def fit_predict(estimator, X):
 
 from sklearn.datasets import fetch_openml
 
-X, y = fetch_openml(
-    name="ames_housing", version=1, return_X_y=True, as_frame=True, parser="pandas"
-)
+X, y = fetch_openml(name="ames_housing", version=1, return_X_y=True, as_frame=True)
 y = y.div(X["Lot_Area"])
 
 # None values in pandas 1.5.1 were mapped to np.nan in pandas 2.0.1
@@ -256,9 +254,7 @@ def fit_predict(estimator, X):
 # which are binary encoded and some are continuous.
 
 # %%
-X, y = fetch_openml(
-    name="cardiotocography", version=1, return_X_y=True, as_frame=False, parser="pandas"
-)
+X, y = fetch_openml(name="cardiotocography", version=1, return_X_y=True, as_frame=False)
 X_cardiotocography = X  # save X for later use
 s = y == "3"
 y = s.astype(np.int32)
diff --git a/examples/miscellaneous/plot_set_output.py b/examples/miscellaneous/plot_set_output.py
@@ -68,9 +68,7 @@
 # :class:`compose.ColumnTransformer` and heterogeneous data.
 from sklearn.datasets import fetch_openml
 
-X, y = fetch_openml(
-    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
-)
+X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
 X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
 
 # %%
diff --git a/examples/multiclass/plot_multiclass_overview.py b/examples/multiclass/plot_multiclass_overview.py
@@ -29,7 +29,7 @@
 # the dataset from OpenML.
 from sklearn.datasets import fetch_openml
 
-X, y = fetch_openml(data_id=181, as_frame=True, return_X_y=True, parser="pandas")
+X, y = fetch_openml(data_id=181, as_frame=True, return_X_y=True)
 
 # %%
 # To know the type of data science problem we are dealing with, we can check
diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py
@@ -41,7 +41,7 @@
 from sklearn.model_selection import train_test_split
 
 # Load a multi-label dataset from https://www.openml.org/d/40597
-X, Y = fetch_openml("yeast", version=4, return_X_y=True, parser="pandas")
+X, Y = fetch_openml("yeast", version=4, return_X_y=True)
 Y = Y == "TRUE"
 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
 
diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py
@@ -103,7 +103,7 @@ def transform(self, X):
 
 def load_mnist(n_samples):
     """Load MNIST, shuffle the data, and return only n_samples."""
-    mnist = fetch_openml("mnist_784", as_frame=False, parser="pandas")
+    mnist = fetch_openml("mnist_784", as_frame=False)
     X, y = shuffle(mnist.data, mnist.target, random_state=2)
     return X[:n_samples] / 255, y[:n_samples]
 
diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py
@@ -34,9 +34,7 @@
 from sklearn.neural_network import MLPClassifier
 
 # Load data from https://www.openml.org/d/554
-X, y = fetch_openml(
-    "mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
-)
+X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
 X = X / 255.0
 
 # Split data into train partition and test partition
diff --git a/examples/preprocessing/plot_target_encoder.py b/examples/preprocessing/plot_target_encoder.py
@@ -23,7 +23,7 @@
 # be a reviewer:
 from sklearn.datasets import fetch_openml
 
-wine_reviews = fetch_openml(data_id=42074, as_frame=True, parser="pandas")
+wine_reviews = fetch_openml(data_id=42074, as_frame=True)
 
 df = wine_reviews.frame
 df.head()
diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py
diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py