Skip to content

Commit 96c21f0

Browse files
authored
Merge pull request numpy#15648 from xiegengxin/avx512-exp-float64
MAINT: AVX512 implementation with intrinsic for float64 input np.exp()
2 parents 517f53d + 6bd0cb9 commit 96c21f0

File tree

11 files changed

+666
-59
lines changed

11 files changed

+666
-59
lines changed

benchmarks/benchmarks/bench_avx.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ def time_mandel(self):
131131
self.mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
132132

133133
class LogisticRegression(Benchmark):
134+
param_names = ['dtype']
135+
params = [np.float32, np.float64]
134136

135137
timeout = 1000
136138
def train(self, max_epoch):
@@ -142,16 +144,16 @@ def train(self, max_epoch):
142144
dw = (1/self.size) * np.matmul(self.X_train.T, dz)
143145
self.W = self.W - self.alpha*dw
144146

145-
def setup(self):
147+
def setup(self, dtype):
146148
np.random.seed(42)
147149
self.size = 250
148150
features = 16
149-
self.X_train = np.float32(np.random.rand(self.size,features))
150-
self.Y_train = np.float32(np.random.choice(2,self.size))
151+
self.X_train = np.random.rand(self.size,features).astype(dtype)
152+
self.Y_train = np.random.choice(2,self.size).astype(dtype)
151153
# Initialize weights
152-
self.W = np.zeros((features,1), dtype=np.float32)
153-
self.b = np.zeros((1,1), dtype=np.float32)
154+
self.W = np.zeros((features,1), dtype=dtype)
155+
self.b = np.zeros((1,1), dtype=dtype)
154156
self.alpha = 0.1
155157

156-
def time_train(self):
158+
def time_train(self, dtype):
157159
self.train(1000)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Use AVX512 intrinsic to implement ``np.exp`` when input is ``np.float64``
2+
--------------------------------------------------------------------------
3+
Use AVX512 intrinsics to implement ``np.exp`` when input is ``np.float64``,
4+
which makes ``np.exp`` with ``np.float64`` input 5-7x
5+
faster than before. The ``_multiarray_umath.so`` module has grown by about 63 KB on
6+
linux64.
7+

numpy/core/code_generators/generate_umath.py

+1
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,7 @@ def english_upper(s):
702702
None,
703703
TD('e', f='exp', astype={'e':'f'}),
704704
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
705+
TD('d', simd=[('avx512f', 'd')]),
705706
TD('fdg' + cmplx, f='exp'),
706707
TD(P, f='exp'),
707708
),

numpy/core/include/numpy/npy_math.h

-47
Original file line numberDiff line numberDiff line change
@@ -113,53 +113,6 @@ NPY_INLINE static float __npy_nzerof(void)
113113
#define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */
114114
#define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */
115115

116-
/*
117-
* Constants used in vector implementation of exp(x)
118-
*/
119-
#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f
120-
#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f
121-
#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f
122-
#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f
123-
#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f
124-
#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f
125-
#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f
126-
#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f
127-
#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f
128-
#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f
129-
#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f
130-
#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f
131-
132-
/*
133-
* Constants used in vector implementation of log(x)
134-
*/
135-
#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f
136-
#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f
137-
#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f
138-
#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f
139-
#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f
140-
#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f
141-
#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f
142-
#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f
143-
#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f
144-
#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
145-
#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
146-
#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
147-
/*
148-
* Constants used in vector implementation of sinf/cosf(x)
149-
*/
150-
#define NPY_TWO_O_PIf 0x1.45f306p-1f
151-
#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
152-
#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
153-
#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
154-
#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
155-
#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
156-
#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
157-
#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
158-
#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
159-
#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
160-
#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
161-
#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
162-
#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
163116
/*
164117
* Integer functions.
165118
*/

numpy/core/src/umath/loops.c.src

+20
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,15 @@ FLOAT_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, voi
15581558

15591559
/**end repeat**/
15601560

1561+
NPY_NO_EXPORT NPY_GCC_OPT_3 void
1562+
DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
1563+
{
1564+
UNARY_LOOP {
1565+
const npy_double in1 = *(npy_double *)ip1;
1566+
*(npy_double *)op1 = npy_exp(in1);
1567+
}
1568+
}
1569+
15611570
/**begin repeat
15621571
* #isa = avx512f, fma#
15631572
* #ISA = AVX512F, FMA#
@@ -1688,6 +1697,17 @@ FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *step
16881697
/**end repeat1**/
16891698
/**end repeat**/
16901699

1700+
NPY_NO_EXPORT NPY_GCC_OPT_3 void
1701+
DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
1702+
{
1703+
if (!run_unary_avx512f_exp_DOUBLE(args, dimensions, steps)) {
1704+
UNARY_LOOP {
1705+
const npy_double in1 = *(npy_double *)ip1;
1706+
*(npy_double *)op1 = npy_exp(in1);
1707+
}
1708+
}
1709+
}
1710+
16911711

16921712
/**begin repeat
16931713
* Float types

numpy/core/src/umath/loops.h.src

+6
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,12 @@ NPY_NO_EXPORT void
196196
/**end repeat1**/
197197
/**end repeat**/
198198

199+
NPY_NO_EXPORT void
200+
DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
201+
202+
NPY_NO_EXPORT void
203+
DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
204+
199205
/**begin repeat
200206
* #func = sin, cos, exp, log#
201207
*/

numpy/core/src/umath/npy_simd_data.h

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#ifndef __NPY_SIMD_DATA_H_
2+
#define __NPY_SIMD_DATA_H_
3+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
4+
/*
5+
* Constants used in vector implementation of float64 exp(x)
6+
*/
7+
#define NPY_RINT_CVT_MAGIC 0x1.8p52
8+
#define NPY_INV_LN2_MUL_32 0x1.71547652b82fep+5
9+
#define NPY_TANG_NEG_L1 -0x1.62e42fefp-6
10+
#define NPY_TANG_NEG_L2 -0x1.473de6af278edp-39
11+
#define NPY_TANG_A1 0x1p-1
12+
#define NPY_TANG_A2 0x1.5555555548f7cp-3
13+
#define NPY_TANG_A3 0x1.5555555545d4ep-5
14+
#define NPY_TANG_A4 0x1.11115b7aa905ep-7
15+
#define NPY_TANG_A5 0x1.6c1728d739765p-10
16+
17+
/* Lookup table for 2^(j/32) */
18+
static npy_uint64 EXP_Table_top[32] = {
19+
0x3FF0000000000000,
20+
0x3FF059B0D3158540,
21+
0x3FF0B5586CF98900,
22+
0x3FF11301D0125B40,
23+
0x3FF172B83C7D5140,
24+
0x3FF1D4873168B980,
25+
0x3FF2387A6E756200,
26+
0x3FF29E9DF51FDEC0,
27+
0x3FF306FE0A31B700,
28+
0x3FF371A7373AA9C0,
29+
0x3FF3DEA64C123400,
30+
0x3FF44E0860618900,
31+
0x3FF4BFDAD5362A00,
32+
0x3FF5342B569D4F80,
33+
0x3FF5AB07DD485400,
34+
0x3FF6247EB03A5580,
35+
0x3FF6A09E667F3BC0,
36+
0x3FF71F75E8EC5F40,
37+
0x3FF7A11473EB0180,
38+
0x3FF82589994CCE00,
39+
0x3FF8ACE5422AA0C0,
40+
0x3FF93737B0CDC5C0,
41+
0x3FF9C49182A3F080,
42+
0x3FFA5503B23E2540,
43+
0x3FFAE89F995AD380,
44+
0x3FFB7F76F2FB5E40,
45+
0x3FFC199BDD855280,
46+
0x3FFCB720DCEF9040,
47+
0x3FFD5818DCFBA480,
48+
0x3FFDFC97337B9B40,
49+
0x3FFEA4AFA2A490C0,
50+
0x3FFF50765B6E4540,
51+
};
52+
53+
static npy_uint64 EXP_Table_tail[32] = {
54+
0x0000000000000000,
55+
0x3D0A1D73E2A475B4,
56+
0x3CEEC5317256E308,
57+
0x3CF0A4EBBF1AED93,
58+
0x3D0D6E6FBE462876,
59+
0x3D053C02DC0144C8,
60+
0x3D0C3360FD6D8E0B,
61+
0x3D009612E8AFAD12,
62+
0x3CF52DE8D5A46306,
63+
0x3CE54E28AA05E8A9,
64+
0x3D011ADA0911F09F,
65+
0x3D068189B7A04EF8,
66+
0x3D038EA1CBD7F621,
67+
0x3CBDF0A83C49D86A,
68+
0x3D04AC64980A8C8F,
69+
0x3CD2C7C3E81BF4B7,
70+
0x3CE921165F626CDD,
71+
0x3D09EE91B8797785,
72+
0x3CDB5F54408FDB37,
73+
0x3CF28ACF88AFAB35,
74+
0x3CFB5BA7C55A192D,
75+
0x3D027A280E1F92A0,
76+
0x3CF01C7C46B071F3,
77+
0x3CFC8B424491CAF8,
78+
0x3D06AF439A68BB99,
79+
0x3CDBAA9EC206AD4F,
80+
0x3CFC2220CB12A092,
81+
0x3D048A81E5E8F4A5,
82+
0x3CDC976816BAD9B8,
83+
0x3CFEB968CAC39ED3,
84+
0x3CF9858F73A18F5E,
85+
0x3C99D3E12DD8A18B,
86+
};
87+
#endif
88+
89+
/*
90+
* Constants used in vector implementation of float32 exp(x)
91+
*/
92+
#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f
93+
#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f
94+
#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f
95+
#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f
96+
#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f
97+
#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f
98+
#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f
99+
#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f
100+
#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f
101+
#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f
102+
#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f
103+
#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f
104+
105+
/*
106+
* Constants used in vector implementation of log(x)
107+
*/
108+
#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f
109+
#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f
110+
#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f
111+
#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f
112+
#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f
113+
#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f
114+
#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f
115+
#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f
116+
#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f
117+
#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
118+
#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
119+
#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
120+
/*
121+
* Constants used in vector implementation of sinf/cosf(x)
122+
*/
123+
#define NPY_TWO_O_PIf 0x1.45f306p-1f
124+
#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
125+
#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
126+
#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
127+
#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
128+
#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
129+
#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
130+
#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
131+
#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
132+
#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
133+
#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
134+
#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
135+
#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
136+
137+
#endif

0 commit comments

Comments
 (0)