Skip to content

Commit 3ab6c8c

Browse files
authored
MAINT Update fetch_openml to use the auto parser by default (scikit-learn#27802)
1 parent 7ceb4d8 commit 3ab6c8c

32 files changed

+45
-98
lines changed

asv_benchmarks/benchmarks/datasets.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,7 @@ def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), dtype=np.float
6060

6161
@M.cache
6262
def _mnist_dataset(dtype=np.float32):
63-
X, y = fetch_openml(
64-
"mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
65-
)
63+
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
6664
X = X.astype(dtype, copy=False)
6765
X = MaxAbsScaler().fit_transform(X)
6866

benchmarks/bench_hist_gradient_boosting_adult.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def predict(est, data_test, target_test):
4949
print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
5050

5151

52-
data = fetch_openml(data_id=179, as_frame=True, parser="pandas") # adult dataset
52+
data = fetch_openml(data_id=179, as_frame=True) # adult dataset
5353
X, y = data.data, data.target
5454

5555
# Ordinal encode the categories to use the native support available in HGBDT

benchmarks/bench_isolation_forest.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def print_outlier_ratio(y):
6464
y = dataset.target
6565

6666
if dat == "shuttle":
67-
dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
67+
dataset = fetch_openml("shuttle", as_frame=False)
6868
X = dataset.data
6969
y = dataset.target.astype(np.int64)
7070
X, y = sh(X, y, random_state=random_state)

benchmarks/bench_lof.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
y = dataset.target
4747

4848
if dataset_name == "shuttle":
49-
dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
49+
dataset = fetch_openml("shuttle", as_frame=False)
5050
X = dataset.data
5151
y = dataset.target.astype(np.int64)
5252
# we remove data with label 4

benchmarks/bench_mnist.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def load_data(dtype=np.float32, order="F"):
6060
######################################################################
6161
# Load dataset
6262
print("Loading dataset...")
63-
data = fetch_openml("mnist_784", as_frame=True, parser="pandas")
63+
data = fetch_openml("mnist_784", as_frame=True)
6464
X = check_array(data["data"], dtype=dtype, order=order)
6565
y = data["target"]
6666

benchmarks/bench_plot_randomized_svd.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def get_data(dataset_name):
192192
del row
193193
del col
194194
else:
195-
X = fetch_openml(dataset_name, parser="auto").data
195+
X = fetch_openml(dataset_name).data
196196
return X
197197

198198

benchmarks/bench_tsne_mnist.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
def load_data(dtype=np.float32, order="C", shuffle=True, seed=0):
3737
"""Load the data, then cache and memmap the train/test split"""
3838
print("Loading dataset...")
39-
data = fetch_openml("mnist_784", as_frame=True, parser="pandas")
39+
data = fetch_openml("mnist_784", as_frame=True)
4040

4141
X = check_array(data["data"], dtype=dtype, order=order)
4242
y = data["target"]

doc/datasets/loading_other_datasets.rst

+7-7
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ from the repository using the function
9999
For example, to download a dataset of gene expressions in mice brains::
100100

101101
>>> from sklearn.datasets import fetch_openml
102-
>>> mice = fetch_openml(name='miceprotein', version=4, parser="auto")
102+
>>> mice = fetch_openml(name='miceprotein', version=4)
103103

104104
To fully specify a dataset, you need to provide a name and a version, though
105105
the version is optional, see :ref:`openml_versions` below.
@@ -147,7 +147,7 @@ dataset on the openml website::
147147

148148
The ``data_id`` also uniquely identifies a dataset from OpenML::
149149

150-
>>> mice = fetch_openml(data_id=40966, parser="auto")
150+
>>> mice = fetch_openml(data_id=40966)
151151
>>> mice.details # doctest: +SKIP
152152
{'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
153153
'creator': ...,
@@ -171,7 +171,7 @@ which can contain entirely different datasets.
171171
If a particular version of a dataset has been found to contain significant
172172
issues, it might be deactivated. Using a name to specify a dataset will yield
173173
the earliest version of a dataset that is still active. That means that
174-
``fetch_openml(name="miceprotein", parser="auto")`` can yield different results
174+
``fetch_openml(name="miceprotein")`` can yield different results
175175
at different times if earlier versions become inactive.
176176
You can see that the dataset with ``data_id`` 40966 that we fetched above is
177177
the first version of the "miceprotein" dataset::
@@ -182,19 +182,19 @@ the first version of the "miceprotein" dataset::
182182
In fact, this dataset only has one version. The iris dataset on the other hand
183183
has multiple versions::
184184

185-
>>> iris = fetch_openml(name="iris", parser="auto")
185+
>>> iris = fetch_openml(name="iris")
186186
>>> iris.details['version'] #doctest: +SKIP
187187
'1'
188188
>>> iris.details['id'] #doctest: +SKIP
189189
'61'
190190

191-
>>> iris_61 = fetch_openml(data_id=61, parser="auto")
191+
>>> iris_61 = fetch_openml(data_id=61)
192192
>>> iris_61.details['version']
193193
'1'
194194
>>> iris_61.details['id']
195195
'61'
196196

197-
>>> iris_969 = fetch_openml(data_id=969, parser="auto")
197+
>>> iris_969 = fetch_openml(data_id=969)
198198
>>> iris_969.details['version']
199199
'3'
200200
>>> iris_969.details['id']
@@ -212,7 +212,7 @@ binarized version of the data::
212212
You can also specify both the name and the version, which also uniquely
213213
identifies the dataset::
214214

215-
>>> iris_version_3 = fetch_openml(name="iris", version=3, parser="auto")
215+
>>> iris_version_3 = fetch_openml(name="iris", version=3)
216216
>>> iris_version_3.details['version']
217217
'3'
218218
>>> iris_version_3.details['id']

examples/applications/plot_cyclical_feature_engineering.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@
2020
# We start by loading the data from the OpenML repository.
2121
from sklearn.datasets import fetch_openml
2222

23-
bike_sharing = fetch_openml(
24-
"Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
25-
)
23+
bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
2624
df = bike_sharing.frame
2725

2826
# %%

examples/applications/plot_digits_denoising.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from sklearn.model_selection import train_test_split
3838
from sklearn.preprocessing import MinMaxScaler
3939

40-
X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True, parser="pandas")
40+
X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True)
4141
X = MinMaxScaler().fit_transform(X)
4242

4343
# %%

examples/compose/plot_column_transformer_mixed_types.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,7 @@
4545

4646
# %%
4747
# Load data from https://www.openml.org/d/40945
48-
X, y = fetch_openml(
49-
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
50-
)
48+
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
5149

5250
# Alternatively X and y can be obtained directly from the frame attribute:
5351
# X = titanic.frame.drop('survived', axis=1)

examples/compose/plot_transformed_target.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def compute_score(y_true, y_pred):
131131
from sklearn.datasets import fetch_openml
132132
from sklearn.preprocessing import quantile_transform
133133

134-
ames = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
134+
ames = fetch_openml(name="house_prices", as_frame=True)
135135
# Keep only numeric columns
136136
X = ames.data.select_dtypes(np.number)
137137
# Remove columns with NaN or Inf values

examples/ensemble/plot_gradient_boosting_categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
# are either categorical or numerical:
3131
from sklearn.datasets import fetch_openml
3232

33-
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True, parser="pandas")
33+
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
3434

3535
# Select only a subset of features of X to make the example faster to run
3636
categorical_columns_subset = [

examples/ensemble/plot_stack_predictors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646

4747
def load_ames_housing():
48-
df = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
48+
df = fetch_openml(name="house_prices", as_frame=True)
4949
X = df.data
5050
y = df.target
5151

examples/gaussian_process/plot_gpr_co2.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
# in OpenML.
3737
from sklearn.datasets import fetch_openml
3838

39-
co2 = fetch_openml(data_id=41187, as_frame=True, parser="pandas")
39+
co2 = fetch_openml(data_id=41187, as_frame=True)
4040
co2.frame.head()
4141

4242
# %%

examples/inspection/plot_linear_model_coefficient_interpretation.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
# as a pandas dataframe.
5656
from sklearn.datasets import fetch_openml
5757

58-
survey = fetch_openml(data_id=534, as_frame=True, parser="pandas")
58+
survey = fetch_openml(data_id=534, as_frame=True)
5959

6060
# %%
6161
# Then, we identify features `X` and targets `y`: the column WAGE is our

examples/inspection/plot_partial_dependence.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
# rentals using weather and season data as well as the datetime information.
4343
from sklearn.datasets import fetch_openml
4444

45-
bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas")
45+
bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
4646
# Make an explicit copy to avoid "SettingWithCopyWarning" from pandas
4747
X, y = bikes.data.copy(), bikes.target
4848

examples/inspection/plot_permutation_importance.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,7 @@
4343
from sklearn.datasets import fetch_openml
4444
from sklearn.model_selection import train_test_split
4545

46-
X, y = fetch_openml(
47-
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
48-
)
46+
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
4947
rng = np.random.RandomState(seed=42)
5048
X["random_cat"] = rng.randint(3, size=X.shape[0])
5149
X["random_num"] = rng.randn(X.shape[0])

examples/linear_model/plot_poisson_regression_non_normal_loss.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
# https://www.openml.org/d/41214
5454
from sklearn.datasets import fetch_openml
5555

56-
df = fetch_openml(data_id=41214, as_frame=True, parser="pandas").frame
56+
df = fetch_openml(data_id=41214, as_frame=True).frame
5757
df
5858

5959
# %%

examples/linear_model/plot_sgd_early_stopping.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
def load_mnist(n_samples=None, class_0="0", class_1="8"):
6060
"""Load MNIST, select two classes, shuffle and return only n_samples."""
6161
# Load data from http://openml.org/d/554
62-
mnist = fetch_openml("mnist_784", version=1, as_frame=False, parser="pandas")
62+
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
6363

6464
# take only two classes for binary classification
6565
mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)

examples/linear_model/plot_sparse_logistic_regression_mnist.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,7 @@
3636
train_samples = 5000
3737

3838
# Load data from https://www.openml.org/d/554
39-
X, y = fetch_openml(
40-
"mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
41-
)
39+
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
4240

4341
random_state = check_random_state(0)
4442
permutation = random_state.permutation(X.shape[0])

examples/linear_model/plot_tweedie_regression_insurance_claims.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,12 @@ def load_mtpl2(n_samples=None):
6868
678013 samples.
6969
"""
7070
# freMTPL2freq dataset from https://www.openml.org/d/41214
71-
df_freq = fetch_openml(data_id=41214, as_frame=True, parser="pandas").data
71+
df_freq = fetch_openml(data_id=41214, as_frame=True).data
7272
df_freq["IDpol"] = df_freq["IDpol"].astype(int)
7373
df_freq.set_index("IDpol", inplace=True)
7474

7575
# freMTPL2sev dataset from https://www.openml.org/d/41215
76-
df_sev = fetch_openml(data_id=41215, as_frame=True, parser="pandas").data
76+
df_sev = fetch_openml(data_id=41215, as_frame=True).data
7777

7878
# sum ClaimAmount over identical IDs
7979
df_sev = df_sev.groupby("IDpol").sum()

examples/miscellaneous/plot_display_object_visualization.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from sklearn.pipeline import make_pipeline
3030
from sklearn.preprocessing import StandardScaler
3131

32-
X, y = fetch_openml(data_id=1464, return_X_y=True, parser="pandas")
32+
X, y = fetch_openml(data_id=1464, return_X_y=True)
3333
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
3434

3535
clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))

examples/miscellaneous/plot_outlier_detection_bench.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,7 @@ def fit_predict(estimator, X):
201201

202202
from sklearn.datasets import fetch_openml
203203

204-
X, y = fetch_openml(
205-
name="ames_housing", version=1, return_X_y=True, as_frame=True, parser="pandas"
206-
)
204+
X, y = fetch_openml(name="ames_housing", version=1, return_X_y=True, as_frame=True)
207205
y = y.div(X["Lot_Area"])
208206

209207
# None values in pandas 1.5.1 were mapped to np.nan in pandas 2.0.1
@@ -256,9 +254,7 @@ def fit_predict(estimator, X):
256254
# which are binary encoded and some are continuous.
257255

258256
# %%
259-
X, y = fetch_openml(
260-
name="cardiotocography", version=1, return_X_y=True, as_frame=False, parser="pandas"
261-
)
257+
X, y = fetch_openml(name="cardiotocography", version=1, return_X_y=True, as_frame=False)
262258
X_cardiotocography = X # save X for later use
263259
s = y == "3"
264260
y = s.astype(np.int32)

examples/miscellaneous/plot_set_output.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,7 @@
6868
# :class:`compose.ColumnTransformer` and heterogeneous data.
6969
from sklearn.datasets import fetch_openml
7070

71-
X, y = fetch_openml(
72-
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
73-
)
71+
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
7472
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
7573

7674
# %%

examples/multiclass/plot_multiclass_overview.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
# the dataset from OpenML.
3030
from sklearn.datasets import fetch_openml
3131

32-
X, y = fetch_openml(data_id=181, as_frame=True, return_X_y=True, parser="pandas")
32+
X, y = fetch_openml(data_id=181, as_frame=True, return_X_y=True)
3333

3434
# %%
3535
# To know the type of data science problem we are dealing with, we can check

examples/multioutput/plot_classifier_chain_yeast.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from sklearn.model_selection import train_test_split
4242

4343
# Load a multi-label dataset from https://www.openml.org/d/40597
44-
X, Y = fetch_openml("yeast", version=4, return_X_y=True, parser="pandas")
44+
X, Y = fetch_openml("yeast", version=4, return_X_y=True)
4545
Y = Y == "TRUE"
4646
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
4747

examples/neighbors/approximate_nearest_neighbors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def transform(self, X):
103103

104104
def load_mnist(n_samples):
105105
"""Load MNIST, shuffle the data, and return only n_samples."""
106-
mnist = fetch_openml("mnist_784", as_frame=False, parser="pandas")
106+
mnist = fetch_openml("mnist_784", as_frame=False)
107107
X, y = shuffle(mnist.data, mnist.target, random_state=2)
108108
return X[:n_samples] / 255, y[:n_samples]
109109

examples/neural_networks/plot_mnist_filters.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,7 @@
3434
from sklearn.neural_network import MLPClassifier
3535

3636
# Load data from https://www.openml.org/d/554
37-
X, y = fetch_openml(
38-
"mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
39-
)
37+
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
4038
X = X / 255.0
4139

4240
# Split data into train partition and test partition

examples/preprocessing/plot_target_encoder.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
# be a reviewer:
2424
from sklearn.datasets import fetch_openml
2525

26-
wine_reviews = fetch_openml(data_id=42074, as_frame=True, parser="pandas")
26+
wine_reviews = fetch_openml(data_id=42074, as_frame=True)
2727

2828
df = wine_reviews.frame
2929
df.head()

0 commit comments

Comments
 (0)