-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathcheck_file_size.py
127 lines (95 loc) · 4 KB
/
check_file_size.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Check that the file size of skops files is not too large.

Load each (fitted) estimator and persist it with pickle and with skops. Measure
the file size of the resulting files. Report the results and raise an error if
any skops file is larger than MAX_ALLOWED_SIZE (given in kb).

Zip compression is applied. This is because we can assume that if a user really
cares about file size, they will compress the file.
"""
from __future__ import annotations
import io
import os
import pickle
import warnings
from tempfile import mkstemp
from typing import Any
from zipfile import ZIP_DEFLATED, ZipFile
import pandas as pd
from sklearn.utils._testing import set_random_state
import skops.io as sio
from skops.io.tests.test_persist import (
_get_check_estimator_ids,
_tested_estimators,
get_input,
)
from skops.utils._fixes import get_tags
# Number of largest estimators reported; passed to format_result as ``topk``.
TOPK = 10
# Maximum allowed file size in kb; format_result compares the "skops (kb)"
# column against this value and raises RuntimeError when it is exceeded.
MAX_ALLOWED_SIZE = 1024
def check_file_size() -> None:
    """Measure the persisted size of every tested estimator and report it.

    Each estimator yielded by ``_tested_estimators`` gets a fixed random state
    and, if its tags say it requires fitting, is fitted on the data returned by
    ``get_input``. Its pickle and skops file sizes are then collected via
    ``run_check`` and the complete table is handed to ``format_result``, which
    prints the rankings by absolute and by relative difference.
    """
    names: list[Any] = []
    pickle_sizes: list[Any] = []
    skops_sizes: list[Any] = []

    for est in _tested_estimators():
        set_random_state(est, random_state=0)
        X, y = get_input(est)

        if get_tags(est).requires_fit:
            # Leave out the target entirely when no target is provided.
            fit_args = (X,) if y is None else (X, y)
            with warnings.catch_warnings():
                # Warnings coming from sklearn modules are silenced during fit.
                warnings.filterwarnings("ignore", module="sklearn")
                est.fit(*fit_args)

        # Keep only the part of the id before the first "(".
        cls_name = _get_check_estimator_ids(est).partition("(")[0]
        pickle_kb, skops_kb = run_check(est)

        names.append(cls_name)
        pickle_sizes.append(pickle_kb)
        skops_sizes.append(skops_kb)

    report: dict[str, list[Any]] = {
        "name": names,
        "pickle (kb)": pickle_sizes,
        "skops (kb)": skops_sizes,
    }
    format_result(report, topk=TOPK)
def run_check(estimator) -> tuple[float, float]:
    """Run file size check with the given estimator for pickle and skops.

    The estimator is pickled into memory and stored in a deflate-compressed zip
    archive, and separately dumped with skops using the same compression
    settings. All temporary files are removed again before returning.

    Parameters
    ----------
    estimator
        The (fitted) estimator to persist.

    Returns
    -------
    size_pickle, size_skops : tuple of float
        Size in kb of the zipped pickle file and of the skops file.
    """
    # mkstemp returns an *open* OS-level file descriptor along with the path.
    # Only the path is used here (as a unique prefix), so close the descriptor
    # immediately instead of leaking one per estimator.
    fd, name = mkstemp(prefix="skops")
    os.close(fd)

    pickle_name = name + ".pickle"
    pickle_zip = pickle_name + ".zip"
    skops_name = name + ".skops"

    def run_pickle() -> float:
        buffer = io.BytesIO()
        pickle.dump(estimator, buffer)
        with ZipFile(
            pickle_zip, mode="w", compression=ZIP_DEFLATED, compresslevel=9
        ) as zipf:
            zipf.writestr(pickle_name, buffer.getvalue())
        # return size in kb
        return os.stat(pickle_zip).st_size / 1024

    def run_skops() -> float:
        sio.dump(estimator, skops_name, compression=ZIP_DEFLATED, compresslevel=9)
        # return size in kb
        return os.stat(skops_name).st_size / 1024

    try:
        size_pickle = run_pickle()
        size_skops = run_skops()
    finally:
        # Clean up the placeholder created by mkstemp and both output files,
        # even when dumping failed part-way through.
        for path in (name, pickle_zip, skops_name):
            try:
                os.remove(path)
            except FileNotFoundError:
                pass

    return size_pickle, size_skops
def format_result(results: dict[str, list[Any]], topk: int) -> None:
"""Report results from performance checks.
Print the largest file size differences between pickle and skops, once for
absolute, once for relative differences.
"""
df = pd.DataFrame(results)
df = df.assign(
abs_diff=df["skops (kb)"] - df["pickle (kb)"],
rel_diff=df["skops (kb)"] / df["pickle (kb)"],
)
dfs = df.sort_values(["abs_diff"], ascending=False).reset_index(drop=True)
print(f"{topk} largest absolute differences:")
print(dfs[["name", "pickle (kb)", "skops (kb)", "abs_diff"]].head(10))
print(f"{topk} largest relative differences:")
dfs = df.sort_values(["rel_diff"], ascending=False).reset_index(drop=True)
print(dfs[["name", "pickle (kb)", "skops (kb)", "rel_diff"]].head(10))
df_large = df[df["skops (kb)"] > MAX_ALLOWED_SIZE]
if df_large.empty:
print("No file was found to be unacceptably large.")
return
print(f"Found {len(df_large)} skops file(s) larger than {MAX_ALLOWED_SIZE} kb:")
print(", ".join(df_large["name"].tolist()))
raise RuntimeError("Found unacceptably large skops files.")
# Run all checks when executed as a script; the RuntimeError raised for
# unacceptably large skops files is not caught here and propagates.
if __name__ == "__main__":
    check_file_size()