# microbench.py
import os
from dataclasses import dataclass

from torch.profiler import profile, ProfilerActivity, schedule

# Profiler schedule: skip WAIT steps, warm up for WARMUP steps, then record
# ACTIVE steps. The profiled function must therefore be stepped
# WAIT + WARMUP + ACTIVE times for the profiler to collect a full trace.
WAIT = 2
WARMUP = 5
ACTIVE = 3
DEFAULT_SCHEDULE = schedule(wait=WAIT, warmup=WARMUP, active=ACTIVE)
PROF_ITERS = WAIT + WARMUP + ACTIVE


@dataclass
class MicrobenchResults:
  test_name: str
  testing_speedup: float
  baseline_wall_ms: float
  testing_wall_ms: float

  def __str__(self):
    RED = "\u001B[31m"
    GREEN = "\u001B[32m"
    RESET = "\u001B[0m"

    def wrap_in_color(text, color=None):
      if color is None:
        return text
      return f"{color}{text}{RESET}"

    formatted_baseline_wall_ms = f"{self.baseline_wall_ms:.02f}ms"
    formatted_testing_wall_ms = f"{self.testing_wall_ms:.02f}ms"
    formatted_speedup = f"{self.testing_speedup:.02f}x"
    formatted_test_name = self.test_name

    # Highlight regressions (more than 5% slower) in red and improvements
    # (more than 5% faster) in green; results within the 0.95-1.05 band
    # stay uncolored.
    color = None
    if self.testing_speedup < 0.95:
      color = RED
    if self.testing_speedup > 1.05:
      color = GREEN

    formatted_test_name = wrap_in_color(formatted_test_name, color)
    formatted_baseline_wall_ms = wrap_in_color(formatted_baseline_wall_ms,
                                               color)
    formatted_testing_wall_ms = wrap_in_color(formatted_testing_wall_ms, color)
    formatted_speedup = wrap_in_color(formatted_speedup, color)

    return (f"{formatted_test_name}: speedup={formatted_speedup}; "
            f"base={formatted_baseline_wall_ms}; "
            f"test={formatted_testing_wall_ms}")


def microbench(test_name,
               baseline_fn,
               testing_fn,
               baseline_bench_fn,
               testing_bench_fn,
               baseline_sync_fn,
               testing_sync_fn,
               save_profile_to_dir=None):
  """Benchmarks the testing function against the baseline function.

  Args:
    test_name (str): Name of the microbenchmark run.
    baseline_fn (fn): Baseline function to benchmark against.
    testing_fn (fn): Test function to be benchmarked.
    baseline_bench_fn (fn): Benchmarking function for the baseline.
    testing_bench_fn (fn): Benchmarking function for the test.
    baseline_sync_fn (fn): Profiling sync function for the baseline.
    testing_sync_fn (fn): Profiling sync function for the test.
    save_profile_to_dir (str, optional): If not None, saves the profiling
      results of regressed runs under `save_profile_to_dir`. Defaults to None.

  Returns:
    MicrobenchResults: A structure holding `test_name`, the speedup, and the
      measured wall times of both functions.
  """
  assert baseline_bench_fn is not None, "Expect baseline_bench_fn to be defined"
  assert testing_bench_fn is not None, "Expect testing_bench_fn to be defined"

  baseline_wall_ms, _ = baseline_bench_fn(baseline_fn)
  testing_wall_ms, _ = testing_bench_fn(testing_fn)

  # Only capture detailed profiles when the testing function regressed by more
  # than 5% relative to the baseline.
  if save_profile_to_dir and baseline_wall_ms / testing_wall_ms < 0.95:
    filebase_path = save_profile_to_dir
    _perf_prof(
        _get_filepath(filebase_path, test_name, "baseline"), baseline_fn,
        baseline_sync_fn)
    _perf_prof(
        _get_filepath(filebase_path, test_name, "testing"), testing_fn,
        testing_sync_fn)

  return MicrobenchResults(
      test_name=test_name,
      testing_speedup=baseline_wall_ms / testing_wall_ms,
      baseline_wall_ms=baseline_wall_ms,
      testing_wall_ms=testing_wall_ms,
  )


def _dump_flame(filepath, prof):
  """Exports profiler stacks for flame graph visualization.

  The exported data can be rendered by running:
    ./flamegraph.pl --title "CUDA time" --countname "us." /tmp/profiler_stacks.txt > perf_viz.svg
  More about flame graphs, and the flamegraph.pl script, can be found at
  https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#visualizing-data-as-a-flame-graph.
  """
  os.makedirs(filepath, exist_ok=True)
  filedest = os.path.join(filepath, "flame.txt")
  prof.export_stacks(filedest)


def _dump_kernels(filepath, prof):
  os.makedirs(filepath, exist_ok=True)
  filedest = os.path.join(filepath, "kernel.txt")
  # Summarize the top kernels by total CUDA time, grouped by input shape.
  summary = prof.key_averages(group_by_input_shape=True).table(
      sort_by="cuda_time_total", row_limit=100)
  with open(filedest, mode='w', encoding='utf-8') as f:
    f.write(summary)


def _dump_trace(filepath, prof):
  os.makedirs(filepath, exist_ok=True)
  filedest = os.path.join(filepath, "trace.json")
  prof.export_chrome_trace(filedest)


def _perf_prof(filepath, fn, sync_fn):
  assert sync_fn is not None, "Expect sync_fn to be defined"
  with profile(
      activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
      record_shapes=True,
      with_flops=True,
      profile_memory=True,
      with_stack=True,
      schedule=DEFAULT_SCHEDULE) as prof:
    # Step the profiler through the full wait/warmup/active schedule.
    for _ in range(PROF_ITERS):
      fn()
      prof.step()
    sync_fn()
  _dump_flame(filepath, prof)
  _dump_trace(filepath, prof)
  _dump_kernels(filepath, prof)


def _get_filepath(filebase, test_name, suffix):
  return os.path.join(filebase, test_name, suffix)
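

# A minimal, self-contained usage sketch (an illustrative assumption, not part
# of the original harness): `simple_bench_fn` is a hypothetical timing helper
# returning a `(wall_ms, extra)` tuple, matching the contract that
# `baseline_bench_fn`/`testing_bench_fn` are unpacked with above. A real
# harness would plug in its own timing and sync functions (e.g.
# `torch.cuda.synchronize` for CUDA workloads).
if __name__ == "__main__":
  import time

  def simple_bench_fn(fn, iters=10):
    # Average wall time over `iters` calls, in milliseconds.
    start = time.perf_counter()
    for _ in range(iters):
      fn()
    wall_ms = (time.perf_counter() - start) * 1000.0 / iters
    return wall_ms, None

  result = microbench(
      "sorted_list",
      baseline_fn=lambda: sorted(range(100_000, 0, -1)),
      testing_fn=lambda: sorted(range(100_000)),
      baseline_bench_fn=simple_bench_fn,
      testing_bench_fn=simple_bench_fn,
      baseline_sync_fn=lambda: None,
      testing_sync_fn=lambda: None)
  print(result)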