"""Benchmark scikit-learn's HistGradientBoostingClassifier on the HIGGS dataset.

Optionally fits equivalent LightGBM, XGBoost and CatBoost models for comparison
(see the --lightgbm, --xgboost and --catboost flags below).
"""
import argparse
import os
from gzip import GzipFile
from time import time
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
from joblib import Memory

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

parser = argparse.ArgumentParser()
parser.add_argument("--n-leaf-nodes", type=int, default=31)
parser.add_argument("--n-trees", type=int, default=10)
parser.add_argument("--lightgbm", action="store_true", default=False)
parser.add_argument("--xgboost", action="store_true", default=False)
parser.add_argument("--catboost", action="store_true", default=False)
parser.add_argument("--learning-rate", type=float, default=1.0)
parser.add_argument("--subsample", type=int, default=None)
parser.add_argument("--max-bins", type=int, default=255)
parser.add_argument("--no-predict", action="store_true", default=False)
parser.add_argument("--cache-loc", type=str, default="/tmp")
# store_true instead of type=bool: argparse applies bool() to the raw string,
# so "--no-interactions False" would still evaluate to True.
parser.add_argument("--no-interactions", action="store_true", default=False)
args = parser.parse_args()
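
# Example invocations (illustrative only: timings depend on hardware, and the
# comparison flags assume the corresponding packages are installed):
#
#   python bench_hist_gradient_boosting_higgsboson.py --n-trees 100
#   python bench_hist_gradient_boosting_higgsboson.py --lightgbm --subsample 1000000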

HERE = os.path.dirname(__file__)
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
m = Memory(location=args.cache_loc, mmap_mode="r")

n_leaf_nodes = args.n_leaf_nodes
n_trees = args.n_trees
subsample = args.subsample
lr = args.learning_rate
max_bins = args.max_bins


@m.cache
def load_data():
    # Download the dataset once, then cache the parsed dataframe on disk with
    # joblib.Memory so later runs skip both the download and the CSV parse.
    filename = os.path.join(HERE, URL.rsplit("/", 1)[-1])
    if not os.path.exists(filename):
        print(f"Downloading {URL} to {filename} (2.6 GB)...")
        urlretrieve(URL, filename)
        print("done.")
    print(f"Parsing {filename}...")
    tic = time()
    with GzipFile(filename) as f:
        df = pd.read_csv(f, header=None, dtype=np.float32)
    toc = time()
    print(f"Loaded {df.values.nbytes / 1e9:0.3f} GB in {toc - tic:0.3f}s")
    return df


def fit(est, data_train, target_train, libname):
    print(f"Fitting a {libname} model...")
    tic = time()
    est.fit(data_train, target_train)
    toc = time()
    print(f"fitted in {toc - tic:.3f}s")


def predict(est, data_test, target_test):
    if args.no_predict:
        return
    tic = time()
    predicted_test = est.predict(data_test)
    predicted_proba_test = est.predict_proba(data_test)
    toc = time()
    # ROC AUC is computed from the positive-class probability column.
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
    print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")


df = load_data()
# Column 0 is the binary target; the remaining 28 columns are the features.
target = df.values[:, 0]
data = np.ascontiguousarray(df.values[:, 1:])
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.2, random_state=0
)
n_classes = len(np.unique(target))

if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records and {n_features} features.")

if args.no_interactions:
    # One singleton group per feature: splits within a tree may never combine
    # two different features, which disables all feature interactions.
    interaction_cst = [[i] for i in range(n_features)]
else:
    interaction_cst = None

est = HistGradientBoostingClassifier(
    loss="log_loss",
    learning_rate=lr,
    max_iter=n_trees,
    max_bins=max_bins,
    max_leaf_nodes=n_leaf_nodes,
    early_stopping=False,
    random_state=0,
    verbose=1,
    interaction_cst=interaction_cst,
)

fit(est, data_train, target_train, "sklearn")
predict(est, data_test, target_test)

if args.lightgbm:
    est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes)
    fit(est, data_train, target_train, "lightgbm")
    predict(est, data_test, target_test)

if args.xgboost:
    est = get_equivalent_estimator(est, lib="xgboost", n_classes=n_classes)
    fit(est, data_train, target_train, "xgboost")
    predict(est, data_test, target_test)

if args.catboost:
    est = get_equivalent_estimator(est, lib="catboost", n_classes=n_classes)
    fit(est, data_train, target_train, "catboost")
    predict(est, data_test, target_test)
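
# A minimal end-to-end comparison sketch (assumptions: lightgbm and xgboost are
# installed; the subsample size is arbitrary and only keeps the run short):
#
#   python bench_hist_gradient_boosting_higgsboson.py \
#       --n-trees 50 --subsample 1000000 --lightgbm --xgboost
#
# get_equivalent_estimator maps the scikit-learn hyperparameters onto each
# library, so the printed fit/predict timings and ROC AUC/ACC scores can be
# compared across implementations.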