0% found this document useful (0 votes)

9 views · 395 pages

CUDA Math API

Uploaded by

alan88w

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

9 views · 395 pages

CUDA Math API

Uploaded by

alan88w

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1 / 395

CUDA Math API

API Reference Manual

vRelease Version | January 2022

Table of Contents
Chapter 1. Modules.............................................................................................................. 1
1.1. Half Precision Intrinsics........................................................................................................... 2
Half Arithmetic Functions............................................................................................................2
Half2 Arithmetic Functions.......................................................................................................... 2
Half Comparison Functions......................................................................................................... 2
Half2 Comparison Functions....................................................................................................... 2
Half Precision Conversion and Data Movement.........................................................................2
Half Math Functions.....................................................................................................................2
Half2 Math Functions................................................................................................................... 2
1.1.1. Half Arithmetic Functions..................................................................................................2
__habs........................................................................................................................................2
__hadd....................................................................................................................................... 3
__hadd_rn..................................................................................................................................3
__hadd_sat................................................................................................................................ 3
__hdiv......................................................................................................................................... 4
__hfma....................................................................................................................................... 4
__hfma_relu.............................................................................................................................. 4
__hfma_sat................................................................................................................................ 5
__hmul....................................................................................................................................... 5
__hmul_rn................................................................................................................................. 5
__hmul_sat................................................................................................................................ 6
__hneg....................................................................................................................................... 6
__hsub........................................................................................................................................6
__hsub_rn.................................................................................................................................. 6
__hsub_sat................................................................................................................................ 7
atomicAdd..................................................................................................................................7
1.1.2. Half2 Arithmetic Functions................................................................................................8
__h2div....................................................................................................................................... 8
__habs2......................................................................................................................................8
__hadd2......................................................................................................................................9
__hadd2_rn................................................................................................................................ 9
__hadd2_sat.............................................................................................................................. 9
__hcmadd................................................................................................................................ 10
__hfma2................................................................................................................................... 10
__hfma2_relu...........................................................................................................................10
__hfma2_sat............................................................................................................................ 11

CUDA Math API vRelease Version | ii

__hmul2................................................................................................................................... 12
__hmul2_rn..............................................................................................................................12
__hmul2_sat............................................................................................................................ 12
__hneg2....................................................................................................................................13
__hsub2....................................................................................................................................13
__hsub2_rn.............................................................................................................................. 13
__hsub2_sat.............................................................................................................................13
atomicAdd................................................................................................................................ 14
1.1.3. Half Comparison Functions............................................................................................. 14
__heq........................................................................................................................................15
__hequ..................................................................................................................................... 15
__hge........................................................................................................................................15
__hgeu..................................................................................................................................... 16
__hgt........................................................................................................................................ 16
__hgtu...................................................................................................................................... 17
__hisinf.....................................................................................................................................17
__hisnan...................................................................................................................................18
__hle.........................................................................................................................................18
__hleu...................................................................................................................................... 18
__hlt......................................................................................................................................... 19
__hltu....................................................................................................................................... 19
__hmax.....................................................................................................................................20
__hmax_nan............................................................................................................................ 20
__hmin..................................................................................................................................... 20
__hmin_nan............................................................................................................................. 20
__hne....................................................................................................................................... 21
__hneu..................................................................................................................................... 21
1.1.4. Half2 Comparison Functions........................................................................................... 21
__hbeq2....................................................................................................................................22
__hbequ2................................................................................................................................. 22
__hbge2....................................................................................................................................23
__hbgeu2................................................................................................................................. 23
__hbgt2.................................................................................................................................... 24
__hbgtu2.................................................................................................................................. 24
__hble2.....................................................................................................................................25
__hbleu2.................................................................................................................................. 25
__hblt2..................................................................................................................................... 26
__hbltu2................................................................................................................................... 27

CUDA Math API vRelease Version | iii

__hbne2....................................................................................................................................27
__hbneu2................................................................................................................................. 28
__heq2......................................................................................................................................28
__hequ2....................................................................................................................................29
__hge2......................................................................................................................................29
__hgeu2....................................................................................................................................30
__hgt2...................................................................................................................................... 30
__hgtu2.................................................................................................................................... 31
__hisnan2.................................................................................................................................31
__hle2.......................................................................................................................................31
__hleu2.................................................................................................................................... 32
__hlt2....................................................................................................................................... 32
__hltu2..................................................................................................................................... 33
__hmax2...................................................................................................................................33
__hmax2_nan.......................................................................................................................... 34
__hmin2................................................................................................................................... 34
__hmin2_nan........................................................................................................................... 34
__hne2......................................................................................................................................35
__hneu2................................................................................................................................... 35
1.1.5. Half Precision Conversion and Data Movement............................................................. 36
__double2half.......................................................................................................................... 36
__float22half2_rn.................................................................................................................... 36
__float2half.............................................................................................................................. 37
__float2half2_rn...................................................................................................................... 37
__float2half_rd........................................................................................................................ 37
__float2half_rn........................................................................................................................ 38
__float2half_ru........................................................................................................................ 38
__float2half_rz.........................................................................................................................39
__floats2half2_rn.....................................................................................................................39
__half22float2.......................................................................................................................... 40
__half2float.............................................................................................................................. 40
__half2half2............................................................................................................................. 40
__half2int_rd............................................................................................................................41
__half2int_rn............................................................................................................................41
__half2int_ru............................................................................................................................42
__half2int_rz............................................................................................................................ 42
__half2ll_rd..............................................................................................................................42
__half2ll_rn..............................................................................................................................43

CUDA Math API vRelease Version | iv

__half2ll_ru..............................................................................................................................43
__half2ll_rz.............................................................................................................................. 44
__half2short_rd....................................................................................................................... 44
__half2short_rn....................................................................................................................... 44
__half2short_ru....................................................................................................................... 45
__half2short_rz....................................................................................................................... 45
__half2uint_rd......................................................................................................................... 46
__half2uint_rn......................................................................................................................... 46
__half2uint_ru......................................................................................................................... 47
__half2uint_rz..........................................................................................................................47
__half2ull_rd............................................................................................................................47
__half2ull_rn............................................................................................................................48
__half2ull_ru............................................................................................................................48
__half2ull_rz............................................................................................................................ 49
__half2ushort_rd..................................................................................................................... 49
__half2ushort_rn..................................................................................................................... 50
__half2ushort_ru..................................................................................................................... 50
__half2ushort_rz..................................................................................................................... 51
__half_as_short.......................................................................................................................51
__half_as_ushort.....................................................................................................................51
__halves2half2.........................................................................................................................52
__high2float............................................................................................................................. 52
__high2half.............................................................................................................................. 53
__high2half2............................................................................................................................ 53
__highs2half2.......................................................................................................................... 54
__int2half_rd............................................................................................................................54
__int2half_rn............................................................................................................................55
__int2half_ru............................................................................................................................55
__int2half_rz............................................................................................................................ 55
__ldca.......................................................................................................................................56
__ldca.......................................................................................................................................56
__ldcg.......................................................................................................................................56
__ldcg.......................................................................................................................................57
__ldcs.......................................................................................................................................57
__ldcs.......................................................................................................................................57
__ldcv....................................................................................................................................... 57
__ldcv....................................................................................................................................... 58
__ldg.........................................................................................................................................58

CUDA Math API vRelease Version | v

__ldg.........................................................................................................................................58
__ldlu....................................................................................................................................... 58
__ldlu....................................................................................................................................... 59
__ll2half_rd..............................................................................................................................59
__ll2half_rn..............................................................................................................................59
__ll2half_ru..............................................................................................................................60
__ll2half_rz.............................................................................................................................. 60
__low2float...............................................................................................................................61
__low2half................................................................................................................................61
__low2half2..............................................................................................................................61
__lowhigh2highlow..................................................................................................................62
__lows2half2............................................................................................................................62
__shfl_down_sync................................................................................................................... 63
__shfl_down_sync................................................................................................................... 64
__shfl_sync.............................................................................................................................. 65
__shfl_sync.............................................................................................................................. 65
__shfl_up_sync........................................................................................................................ 66
__shfl_up_sync........................................................................................................................ 67
__shfl_xor_sync.......................................................................................................................68
__shfl_xor_sync.......................................................................................................................69
__short2half_rd....................................................................................................................... 69
__short2half_rn....................................................................................................................... 70
__short2half_ru....................................................................................................................... 70
__short2half_rz....................................................................................................................... 71
__short_as_half.......................................................................................................................71
__stcg.......................................................................................................................................72
__stcg.......................................................................................................................................72
__stcs.......................................................................................................................................72
__stcs.......................................................................................................................................72
__stwb......................................................................................................................................73
__stwb......................................................................................................................................73
__stwt.......................................................................................................................................73
__stwt.......................................................................................................................................73
__uint2half_rd......................................................................................................................... 74
__uint2half_rn......................................................................................................................... 74
__uint2half_ru......................................................................................................................... 74
__uint2half_rz..........................................................................................................................75
__ull2half_rd............................................................................................................................75

CUDA Math API vRelease Version | vi

__ull2half_rn............................................................................................................................76
__ull2half_ru............................................................................................................................76
__ull2half_rz............................................................................................................................ 77
__ushort2half_rd..................................................................................................................... 77
__ushort2half_rn..................................................................................................................... 78
__ushort2half_ru..................................................................................................................... 78
__ushort2half_rz..................................................................................................................... 79
__ushort_as_half.....................................................................................................................79
1.1.6. Half Math Functions.........................................................................................................79
hceil..........................................................................................................................................80
hcos..........................................................................................................................................80
hexp..........................................................................................................................................80
hexp10......................................................................................................................................81
hexp2........................................................................................................................................81
hfloor........................................................................................................................................81
hlog.......................................................................................................................................... 82
hlog10...................................................................................................................................... 82
hlog2........................................................................................................................................ 83
hrcp.......................................................................................................................................... 83
hrint..........................................................................................................................................83
hrsqrt....................................................................................................................................... 84
hsin...........................................................................................................................................84
hsqrt.........................................................................................................................................85
htrunc.......................................................................................................................................85
1.1.7. Half2 Math Functions.......................................................................................................85
h2ceil........................................................................................................................................86
h2cos........................................................................................................................................86
h2exp........................................................................................................................................86
h2exp10....................................................................................................................................87
h2exp2......................................................................................................................................87
h2floor......................................................................................................................................88
h2log........................................................................................................................................ 88
h2log10.....................................................................................................................................88
h2log2...................................................................................................................................... 89
h2rcp........................................................................................................................................ 89
h2rint........................................................................................................................................90
h2rsqrt..................................................................................................................................... 90
h2sin.........................................................................................................................................90

CUDA Math API vRelease Version | vii

h2sqrt.......................................................................................................................................91
h2trunc.....................................................................................................................................91
1.2. Bfloat16 Precision Intrinsics.................................................................................................. 91
Bfloat16 Arithmetic Functions................................................................................................... 92
Bfloat162 Arithmetic Functions................................................................................................. 92
Bfloat16 Comparison Functions................................................................................................ 92
Bfloat162 Comparison Functions.............................................................................................. 92
Bfloat16 Precision Conversion and Data Movement................................................................ 92
Bfloat16 Math Functions............................................................................................................ 92
Bfloat162 Math Functions.......................................................................................................... 92
1.2.1. Bfloat16 Arithmetic Functions......................................................................................... 92
__h2div..................................................................................................................................... 92
__habs......................................................................................................................................92
__hadd......................................................................................................................................93
__hadd_rn................................................................................................................................ 93
__hadd_sat.............................................................................................................................. 93
__hdiv....................................................................................................................................... 94
__hfma..................................................................................................................................... 94
__hfma_relu............................................................................................................................ 94
__hfma_sat.............................................................................................................................. 95
__hmul..................................................................................................................................... 95
__hmul_rn............................................................................................................................... 95
__hmul_sat.............................................................................................................................. 96
__hneg..................................................................................................................................... 96
__hsub......................................................................................................................................96
__hsub_rn................................................................................................................................ 97
__hsub_sat...............................................................................................................................97
atomicAdd................................................................................................................................ 97
1.2.2. Bfloat162 Arithmetic Functions....................................................................................... 98
__habs2....................................................................................................................................98
__hadd2....................................................................................................................................99
__hadd2_rn.............................................................................................................................. 99
__hadd2_sat.............................................................................................................................99
__hcmadd.............................................................................................................................. 100
__hfma2................................................................................................................................. 100
__hfma2_relu.........................................................................................................................100
__hfma2_sat.......................................................................................................................... 101
__hmul2................................................................................................................................. 102

CUDA Math API vRelease Version | viii

__hmul2_rn............................................................................................................................102
__hmul2_sat.......................................................................................................................... 102
__hneg2..................................................................................................................................103
__hsub2..................................................................................................................................103
__hsub2_rn............................................................................................................................ 103
__hsub2_sat...........................................................................................................................103
atomicAdd.............................................................................................................................. 104
1.2.3. Bfloat16 Comparison Functions.................................................................................... 104
__heq......................................................................................................................................105
__hequ....................................................................................................................................105
__hge......................................................................................................................................106
__hgeu....................................................................................................................................106
__hgt...................................................................................................................................... 107
__hgtu.................................................................................................................................... 107
__hisinf...................................................................................................................................108
__hisnan.................................................................................................................................108
__hle.......................................................................................................................................108
__hleu.................................................................................................................................... 109
__hlt....................................................................................................................................... 109
__hltu..................................................................................................................................... 110
__hmax...................................................................................................................................110
__hmax_nan.......................................................................................................................... 111
__hmin................................................................................................................................... 111
__hmin_nan........................................................................................................................... 111
__hne......................................................................................................................................111
__hneu................................................................................................................................... 112
1.2.4. Bfloat162 Comparison Functions.................................................................................. 112
__hbeq2..................................................................................................................................113
__hbequ2................................................................................................................................113
__hbge2..................................................................................................................................114
__hbgeu2................................................................................................................................114
__hbgt2.................................................................................................................................. 115
__hbgtu2................................................................................................................................ 116
__hble2...................................................................................................................................116
__hbleu2................................................................................................................................ 117
__hblt2................................................................................................................................... 117
__hbltu2................................................................................................................................. 118
__hbne2..................................................................................................................................119

CUDA Math API vRelease Version | ix

__hbneu2............................................................................................................................... 119
__heq2....................................................................................................................................120
__hequ2..................................................................................................................................120
__hge2....................................................................................................................................121
__hgeu2..................................................................................................................................121
__hgt2.................................................................................................................................... 122
__hgtu2.................................................................................................................................. 122
__hisnan2...............................................................................................................................123
__hle2.....................................................................................................................................123
__hleu2...................................................................................................................................124
__hlt2..................................................................................................................................... 124
__hltu2................................................................................................................................... 125
__hmax2.................................................................................................................................125
__hmax2_nan........................................................................................................................ 126
__hmin2................................................................................................................................. 126
__hmin2_nan......................................................................................................................... 126
__hne2....................................................................................................................................127
__hneu2................................................................................................................................. 127
1.2.5. Bfloat16 Precision Conversion and Data Movement.................................................... 128
__bfloat1622float2................................................................................................................. 128
__bfloat162bfloat162............................................................................................................. 128
__bfloat162float..................................................................................................................... 129
__bfloat162int_rd...................................................................................................................129
__bfloat162int_rn...................................................................................................................129
__bfloat162int_ru...................................................................................................................130
__bfloat162int_rz................................................................................................................... 130
__bfloat162ll_rd.....................................................................................................................131
__bfloat162ll_rn.....................................................................................................................131
__bfloat162ll_ru.....................................................................................................................132
__bfloat162ll_rz..................................................................................................................... 132
__bfloat162short_rd.............................................................................................................. 133
__bfloat162short_rn.............................................................................................................. 133
__bfloat162short_ru.............................................................................................................. 134
__bfloat162short_rz.............................................................................................................. 134
__bfloat162uint_rd................................................................................................................ 135
__bfloat162uint_rn................................................................................................................ 135
__bfloat162uint_ru................................................................................................................ 136
__bfloat162uint_rz.................................................................................................................136

CUDA Math API vRelease Version | x

__bfloat162ull_rd...................................................................................................................137
__bfloat162ull_rn...................................................................................................................137
__bfloat162ull_ru...................................................................................................................138
__bfloat162ull_rz................................................................................................................... 138
__bfloat162ushort_rd............................................................................................................ 139
__bfloat162ushort_rn............................................................................................................ 139
__bfloat162ushort_ru............................................................................................................ 140
__bfloat162ushort_rz............................................................................................................ 140
__bfloat16_as_short..............................................................................................................141
__bfloat16_as_ushort............................................................................................................141
__double2bfloat16................................................................................................................. 142
__float22bfloat162_rn........................................................................................................... 142
__float2bfloat16..................................................................................................................... 143
__float2bfloat162_rn............................................................................................................. 143
__float2bfloat16_rd............................................................................................................... 144
__float2bfloat16_rn............................................................................................................... 144
__float2bfloat16_ru............................................................................................................... 145
__float2bfloat16_rz................................................................................................................145
__floats2bfloat162_rn............................................................................................................146
__halves2bfloat162................................................................................................................146
__high2bfloat16..................................................................................................................... 147
__high2bfloat162................................................................................................................... 147
__high2float........................................................................................................................... 148
__highs2bfloat162................................................................................................................. 148
__int2bfloat16_rd...................................................................................................................149
__int2bfloat16_rn...................................................................................................................149
__int2bfloat16_ru...................................................................................................................149
__int2bfloat16_rz................................................................................................................... 150
__ldca.....................................................................................................................................150
__ldca.....................................................................................................................................151
__ldcg.....................................................................................................................................151
__ldcg.....................................................................................................................................151
__ldcs..................................................................................................................................... 151
__ldcs..................................................................................................................................... 152
__ldcv..................................................................................................................................... 152
__ldcv..................................................................................................................................... 152
__ldg.......................................................................................................................................153
__ldg.......................................................................................................................................153

CUDA Math API vRelease Version | xi

__ldlu..................................................................................................................................... 153
__ldlu..................................................................................................................................... 153
__ll2bfloat16_rd.....................................................................................................................154
__ll2bfloat16_rn.....................................................................................................................154
__ll2bfloat16_ru.....................................................................................................................155
__ll2bfloat16_rz..................................................................................................................... 155
__low2bfloat16.......................................................................................................................156
__low2bfloat162.....................................................................................................................156
__low2float.............................................................................................................................157
__lowhigh2highlow................................................................................................................157
__lows2bfloat162...................................................................................................................158
__shfl_down_sync................................................................................................................. 158
__shfl_down_sync................................................................................................................. 159
__shfl_sync............................................................................................................................ 160
__shfl_sync............................................................................................................................ 161
__shfl_up_sync...................................................................................................................... 162
__shfl_up_sync...................................................................................................................... 163
__shfl_xor_sync.....................................................................................................................164
__shfl_xor_sync.....................................................................................................................165
__short2bfloat16_rd.............................................................................................................. 166
__short2bfloat16_rn.............................................................................................................. 166
__short2bfloat16_ru.............................................................................................................. 167
__short2bfloat16_rz.............................................................................................................. 167
__short_as_bfloat16..............................................................................................................168
__stcg.....................................................................................................................................168
__stcg.....................................................................................................................................168
__stcs..................................................................................................................................... 169
__stcs..................................................................................................................................... 169
__stwb.................................................................................................................................... 169
__stwb.................................................................................................................................... 169
__stwt.....................................................................................................................................170
__stwt.....................................................................................................................................170
__uint2bfloat16_rd................................................................................................................ 170
__uint2bfloat16_rn................................................................................................................ 171
__uint2bfloat16_ru................................................................................................................ 171
__uint2bfloat16_rz.................................................................................................................172
__ull2bfloat16_rd...................................................................................................................172
__ull2bfloat16_rn...................................................................................................................173

CUDA Math API vRelease Version | xii

__ull2bfloat16_ru...................................................................................................................173
__ull2bfloat16_rz................................................................................................................... 174
__ushort2bfloat16_rd............................................................................................................ 174
__ushort2bfloat16_rn............................................................................................................ 175
__ushort2bfloat16_ru............................................................................................................ 175
__ushort2bfloat16_rz............................................................................................................ 176
__ushort_as_bfloat16............................................................................................................176
1.2.6. Bfloat16 Math Functions................................................................................................ 176
hceil........................................................................................................................................177
hcos........................................................................................................................................177
hexp........................................................................................................................................177
hexp10....................................................................................................................................178
hexp2......................................................................................................................................178
hfloor......................................................................................................................................179
hlog........................................................................................................................................ 179
hlog10.....................................................................................................................................179
hlog2...................................................................................................................................... 180
hrcp........................................................................................................................................ 180
hrint........................................................................................................................................180
hrsqrt..................................................................................................................................... 181
hsin.........................................................................................................................................181
hsqrt.......................................................................................................................................182
htrunc.....................................................................................................................................182
1.2.7. Bfloat162 Math Functions.............................................................................................. 182
h2ceil......................................................................................................................................183
h2cos......................................................................................................................................183
h2exp......................................................................................................................................183
h2exp10.................................................................................................................................. 184
h2exp2....................................................................................................................................184
h2floor....................................................................................................................................185
h2log...................................................................................................................................... 185
h2log10...................................................................................................................................185
h2log2.....................................................................................................................................186
h2rcp...................................................................................................................................... 186
h2rint......................................................................................................................................187
h2rsqrt................................................................................................................................... 187
h2sin.......................................................................................................................................187
h2sqrt..................................................................................................................................... 188

CUDA Math API vRelease Version | xiii

h2trunc...................................................................................................................................188
1.3. Mathematical Functions....................................................................................................... 189
1.4. Single Precision Mathematical Functions........................................................................... 189
acosf...........................................................................................................................................189
acoshf........................................................................................................................................ 190
asinf........................................................................................................................................... 190
asinhf......................................................................................................................................... 191
atan2f......................................................................................................................................... 191
atanf........................................................................................................................................... 192
atanhf.........................................................................................................................................192
cbrtf............................................................................................................................................193
ceilf............................................................................................................................................ 193
copysignf....................................................................................................................................194
cosf............................................................................................................................................ 194
coshf.......................................................................................................................................... 194
cospif..........................................................................................................................................195
cyl_bessel_i0f............................................................................................................................ 195
cyl_bessel_i1f............................................................................................................................ 196
erfcf............................................................................................................................................196
erfcinvf....................................................................................................................................... 196
erfcxf.......................................................................................................................................... 197
erff..............................................................................................................................................197
erfinvf.........................................................................................................................................198
exp10f.........................................................................................................................................198
exp2f...........................................................................................................................................199
expf............................................................................................................................................ 199
expm1f....................................................................................................................................... 200
fabsf........................................................................................................................................... 200
fdimf...........................................................................................................................................201
fdividef........................................................................................................................................201
floorf.......................................................................................................................................... 202
fmaf............................................................................................................................................202
fmaxf.......................................................................................................................................... 203
fminf...........................................................................................................................................203
fmodf..........................................................................................................................................204
frexpf..........................................................................................................................................204
hypotf......................................................................................................................................... 205
ilogbf.......................................................................................................................................... 205

CUDA Math API vRelease Version | xiv

isfinite........................................................................................................................................ 206
isinf............................................................................................................................................ 206
isnan.......................................................................................................................................... 207
j0f............................................................................................................................................... 207
j1f............................................................................................................................................... 207
jnf............................................................................................................................................... 208
ldexpf......................................................................................................................................... 208
lgammaf.................................................................................................................................... 209
llrintf.......................................................................................................................................... 209
llroundf...................................................................................................................................... 210
log10f......................................................................................................................................... 210
log1pf......................................................................................................................................... 211
log2f........................................................................................................................................... 211
logbf........................................................................................................................................... 212
logf............................................................................................................................................. 212
lrintf........................................................................................................................................... 213
lroundf....................................................................................................................................... 213
max............................................................................................................................................ 213
min............................................................................................................................................. 214
modff..........................................................................................................................................214
nanf............................................................................................................................................ 214
nearbyintf...................................................................................................................................215
nextafterf................................................................................................................................... 215
norm3df..................................................................................................................................... 216
norm4df..................................................................................................................................... 216
normcdff.................................................................................................................................... 217
normcdfinvf................................................................................................................................217
normf......................................................................................................................................... 218
powf........................................................................................................................................... 218
rcbrtf.......................................................................................................................................... 219
remainderf.................................................................................................................................220
remquof..................................................................................................................................... 220
rhypotf........................................................................................................................................221
rintf............................................................................................................................................ 221
rnorm3df....................................................................................................................................222
rnorm4df....................................................................................................................................222
rnormf........................................................................................................................................223
roundf........................................................................................................................................ 223

CUDA Math API vRelease Version | xv

rsqrtf.......................................................................................................................................... 224
scalblnf...................................................................................................................................... 224
scalbnf....................................................................................................................................... 224
signbit........................................................................................................................................ 225
sincosf........................................................................................................................................225
sincospif.....................................................................................................................................226
sinf............................................................................................................................................. 226
sinhf........................................................................................................................................... 227
sinpif.......................................................................................................................................... 227
sqrtf............................................................................................................................................228
tanf............................................................................................................................................. 228
tanhf...........................................................................................................................................229
tgammaf.................................................................................................................................... 229
truncf......................................................................................................................................... 230
y0f...............................................................................................................................................230
y1f...............................................................................................................................................230
ynf.............................................................................................................................................. 231
1.5. Double Precision Mathematical Functions..........................................................................232
acos............................................................................................................................................232
acosh..........................................................................................................................................232
asin............................................................................................................................................ 233
asinh.......................................................................................................................................... 233
atan............................................................................................................................................ 234
atan2.......................................................................................................................................... 234
atanh.......................................................................................................................................... 235
cbrt.............................................................................................................................................235
ceil..............................................................................................................................................236
copysign..................................................................................................................................... 236
cos..............................................................................................................................................236
cosh............................................................................................................................................237
cospi...........................................................................................................................................237
cyl_bessel_i0............................................................................................................................. 238
cyl_bessel_i1............................................................................................................................. 238
erf...............................................................................................................................................238
erfc.............................................................................................................................................239
erfcinv........................................................................................................................................ 239
erfcx........................................................................................................................................... 240
erfinv.......................................................................................................................................... 240

CUDA Math API vRelease Version | xvi

exp..............................................................................................................................................241
exp10..........................................................................................................................................241
exp2............................................................................................................................................242
expm1........................................................................................................................................ 242
fabs............................................................................................................................................ 243
fdim............................................................................................................................................ 243
floor............................................................................................................................................244
fma............................................................................................................................................. 244
fmax........................................................................................................................................... 245
fmin............................................................................................................................................245
fmod...........................................................................................................................................246
frexp........................................................................................................................................... 246
hypot.......................................................................................................................................... 247
ilogb........................................................................................................................................... 247
isfinite........................................................................................................................................ 248
isinf............................................................................................................................................ 248
isnan.......................................................................................................................................... 249
j0.................................................................................................................................................249
j1.................................................................................................................................................249
jn................................................................................................................................................ 250
ldexp.......................................................................................................................................... 250
lgamma......................................................................................................................................251
llrint........................................................................................................................................... 251
llround....................................................................................................................................... 252
log.............................................................................................................................................. 252
log10.......................................................................................................................................... 252
log1p.......................................................................................................................................... 253
log2............................................................................................................................................ 253
logb............................................................................................................................................ 254
lrint............................................................................................................................................ 254
lround........................................................................................................................................ 255
max............................................................................................................................................ 255
max............................................................................................................................................ 255
max............................................................................................................................................ 255
min............................................................................................................................................. 256
min............................................................................................................................................. 256
min............................................................................................................................................. 256
modf...........................................................................................................................................256

CUDA Math API vRelease Version | xvii

nan............................................................................................................................................. 257
nearbyint....................................................................................................................................257
nextafter.................................................................................................................................... 258
norm.......................................................................................................................................... 258
norm3d...................................................................................................................................... 259
norm4d...................................................................................................................................... 259
normcdf..................................................................................................................................... 260
normcdfinv.................................................................................................................................260
pow.............................................................................................................................................261
rcbrt........................................................................................................................................... 262
remainder.................................................................................................................................. 262
remquo...................................................................................................................................... 263
rhypot.........................................................................................................................................263
rint..............................................................................................................................................264
rnorm.........................................................................................................................................264
rnorm3d.....................................................................................................................................265
rnorm4d.....................................................................................................................................265
round..........................................................................................................................................266
rsqrt........................................................................................................................................... 266
scalbln....................................................................................................................................... 267
scalbn........................................................................................................................................ 267
signbit........................................................................................................................................ 267
sin.............................................................................................................................................. 268
sincos.........................................................................................................................................268
sincospi......................................................................................................................................269
sinh............................................................................................................................................ 269
sinpi........................................................................................................................................... 270
sqrt.............................................................................................................................................270
tan.............................................................................................................................................. 271
tanh............................................................................................................................................ 271
tgamma..................................................................................................................................... 271
trunc.......................................................................................................................................... 272
y0................................................................................................................................................272
y1................................................................................................................................................273
yn................................................................................................................................................273
1.6. Integer Mathematical Functions.......................................................................................... 274
abs............................................................................................................................................. 274
labs............................................................................................................................................ 274

CUDA Math API vRelease Version | xviii

llabs........................................................................................................................................... 274
llmax.......................................................................................................................................... 275
llmin...........................................................................................................................................275
max............................................................................................................................................ 275
max............................................................................................................................................ 275
max............................................................................................................................................ 276
max............................................................................................................................................ 276
max............................................................................................................................................ 276
max............................................................................................................................................ 276
max............................................................................................................................................ 276
max............................................................................................................................................ 277
max............................................................................................................................................ 277
max............................................................................................................................................ 277
max............................................................................................................................................ 277
max............................................................................................................................................ 277
min............................................................................................................................................. 278
min............................................................................................................................................. 278
min............................................................................................................................................. 278
min............................................................................................................................................. 278
min............................................................................................................................................. 279
min............................................................................................................................................. 279
min............................................................................................................................................. 279
min............................................................................................................................................. 279
min............................................................................................................................................. 279
min............................................................................................................................................. 280
min............................................................................................................................................. 280
min............................................................................................................................................. 280
ullmax........................................................................................................................................ 280
ullmin.........................................................................................................................................280
umax.......................................................................................................................................... 281
umin........................................................................................................................................... 281
1.7. Single Precision Intrinsics.................................................................................................... 281
__cosf.........................................................................................................................................281
__exp10f.....................................................................................................................................282
__expf.........................................................................................................................................282
__fadd_rd...................................................................................................................................282
__fadd_rn...................................................................................................................................283
__fadd_ru...................................................................................................................................283

CUDA Math API vRelease Version | xix

__fadd_rz................................................................................................................................... 284
__fdiv_rd.................................................................................................................................... 284
__fdiv_rn.................................................................................................................................... 284
__fdiv_ru.................................................................................................................................... 285
__fdiv_rz.....................................................................................................................................285
__fdividef....................................................................................................................................285
__fmaf_ieee_rd..........................................................................................................................286
__fmaf_ieee_rn......................................................................................................................... 286
__fmaf_ieee_ru......................................................................................................................... 286
__fmaf_ieee_rz..........................................................................................................................287
__fmaf_rd.................................................................................................................................. 287
__fmaf_rn.................................................................................................................................. 287
__fmaf_ru.................................................................................................................................. 288
__fmaf_rz...................................................................................................................................288
__fmul_rd.................................................................................................................................. 289
__fmul_rn.................................................................................................................................. 289
__fmul_ru.................................................................................................................................. 290
__fmul_rz...................................................................................................................................290
__frcp_rd................................................................................................................................... 291
__frcp_rn................................................................................................................................... 291
__frcp_ru................................................................................................................................... 291
__frcp_rz....................................................................................................................................292
__frsqrt_rn................................................................................................................................ 292
__fsqrt_rd.................................................................................................................................. 293
__fsqrt_rn.................................................................................................................................. 293
__fsqrt_ru.................................................................................................................................. 293
__fsqrt_rz.................................................................................................................................. 294
__fsub_rd...................................................................................................................................294
__fsub_rn...................................................................................................................................295
__fsub_ru...................................................................................................................................295
__fsub_rz................................................................................................................................... 295
__log10f..................................................................................................................................... 296
__log2f....................................................................................................................................... 296
__logf......................................................................................................................................... 297
__powf........................................................................................................................................297
__saturatef................................................................................................................................ 297
__sincosf....................................................................................................................................298
__sinf......................................................................................................................................... 298

CUDA Math API vRelease Version | xx

__tanf......................................................................................................................................... 299
1.8. Double Precision Intrinsics.................................................................................................. 299
__dadd_rd..................................................................................................................................299
__dadd_rn..................................................................................................................................300
__dadd_ru..................................................................................................................................300
__dadd_rz.................................................................................................................................. 300
__ddiv_rd................................................................................................................................... 301
__ddiv_rn................................................................................................................................... 301
__ddiv_ru................................................................................................................................... 302
__ddiv_rz....................................................................................................................................302
__dmul_rd................................................................................................................................. 302
__dmul_rn................................................................................................................................. 303
__dmul_ru................................................................................................................................. 303
__dmul_rz..................................................................................................................................304
__drcp_rd.................................................................................................................................. 304
__drcp_rn.................................................................................................................................. 305
__drcp_ru.................................................................................................................................. 305
__drcp_rz...................................................................................................................................305
__dsqrt_rd................................................................................................................................. 306
__dsqrt_rn................................................................................................................................. 306
__dsqrt_ru................................................................................................................................. 307
__dsqrt_rz..................................................................................................................................307
__dsub_rd..................................................................................................................................308
__dsub_rn..................................................................................................................................308
__dsub_ru..................................................................................................................................308
__dsub_rz.................................................................................................................................. 309
__fma_rd................................................................................................................................... 309
__fma_rn................................................................................................................................... 310
__fma_ru................................................................................................................................... 310
__fma_rz....................................................................................................................................311
1.9. Integer Intrinsics................................................................................................................... 312
__brev........................................................................................................................................ 312
__brevll...................................................................................................................................... 312
__byte_perm..............................................................................................................................312
__clz...........................................................................................................................................313
__clzll.........................................................................................................................................313
__ffs........................................................................................................................................... 313
__ffsll......................................................................................................................................... 314

CUDA Math API vRelease Version | xxi

__funnelshift_l...........................................................................................................................314
__funnelshift_lc.........................................................................................................................314
__funnelshift_r.......................................................................................................................... 315
__funnelshift_rc........................................................................................................................ 315
__hadd....................................................................................................................................... 315
__mul24..................................................................................................................................... 316
__mul64hi.................................................................................................................................. 316
__mulhi...................................................................................................................................... 316
__popc........................................................................................................................................316
__popcll..................................................................................................................................... 317
__rhadd......................................................................................................................................317
__sad..........................................................................................................................................317
__uhadd..................................................................................................................................... 318
__umul24................................................................................................................................... 318
__umul64hi................................................................................................................................ 318
__umulhi....................................................................................................................................319
__urhadd....................................................................................................................................319
__usad....................................................................................................................................... 319
1.10. Type Casting Intrinsics....................................................................................................... 320
__double2float_rd..................................................................................................................... 320
__double2float_rn..................................................................................................................... 320
__double2float_ru..................................................................................................................... 320
__double2float_rz......................................................................................................................321
__double2hiint........................................................................................................................... 321
__double2int_rd........................................................................................................................ 321
__double2int_rn........................................................................................................................ 321
__double2int_ru........................................................................................................................ 322
__double2int_rz.........................................................................................................................322
__double2ll_rd...........................................................................................................................322
__double2ll_rn.......................................................................................................................... 323
__double2ll_ru.......................................................................................................................... 323
__double2ll_rz........................................................................................................................... 323
__double2loint........................................................................................................................... 323
__double2uint_rd...................................................................................................................... 324
__double2uint_rn...................................................................................................................... 324
__double2uint_ru...................................................................................................................... 324
__double2uint_rz.......................................................................................................................325
__double2ull_rd........................................................................................................................ 325

CUDA Math API vRelease Version | xxii

__double2ull_rn........................................................................................................................ 325
__double2ull_ru........................................................................................................................ 326
__double2ull_rz.........................................................................................................................326
__double_as_longlong.............................................................................................................. 326
__float2int_rd............................................................................................................................ 327
__float2int_rn............................................................................................................................ 327
__float2int_ru............................................................................................................................ 327
__float2int_rz.............................................................................................................................327
__float2ll_rd.............................................................................................................................. 328
__float2ll_rn.............................................................................................................................. 328
__float2ll_ru.............................................................................................................................. 328
__float2ll_rz...............................................................................................................................329
__float2uint_rd.......................................................................................................................... 329
__float2uint_rn.......................................................................................................................... 329
__float2uint_ru.......................................................................................................................... 329
__float2uint_rz.......................................................................................................................... 330
__float2ull_rd............................................................................................................................ 330
__float2ull_rn............................................................................................................................ 330
__float2ull_ru............................................................................................................................ 331
__float2ull_rz.............................................................................................................................331
__float_as_int............................................................................................................................ 331
__float_as_uint.......................................................................................................................... 331
__hiloint2double........................................................................................................................332
__int2double_rn........................................................................................................................ 332
__int2float_rd............................................................................................................................ 332
__int2float_rn............................................................................................................................ 333
__int2float_ru............................................................................................................................ 333
__int2float_rz.............................................................................................................................333
__int_as_float............................................................................................................................ 333
__ll2double_rd...........................................................................................................................334
__ll2double_rn.......................................................................................................................... 334
__ll2double_ru.......................................................................................................................... 334
__ll2double_rz........................................................................................................................... 335
__ll2float_rd.............................................................................................................................. 335
__ll2float_rn.............................................................................................................................. 335
__ll2float_ru.............................................................................................................................. 335
__ll2float_rz...............................................................................................................................336
__longlong_as_double.............................................................................................................. 336

CUDA Math API vRelease Version | xxiii

__uint2double_rn...................................................................................................................... 336
__uint2float_rd.......................................................................................................................... 337
__uint2float_rn.......................................................................................................................... 337
__uint2float_ru.......................................................................................................................... 337
__uint2float_rz.......................................................................................................................... 337
__uint_as_float.......................................................................................................................... 338
__ull2double_rd........................................................................................................................ 338
__ull2double_rn........................................................................................................................ 338
__ull2double_ru........................................................................................................................ 339
__ull2double_rz.........................................................................................................................339
__ull2float_rd............................................................................................................................ 339
__ull2float_rn............................................................................................................................ 340
__ull2float_ru............................................................................................................................ 340
__ull2float_rz.............................................................................................................................340
1.11. SIMD Intrinsics.................................................................................................................... 341
__vabs2...................................................................................................................................... 341
__vabs4...................................................................................................................................... 341
__vabsdiffs2...............................................................................................................................341
__vabsdiffs4...............................................................................................................................342
__vabsdiffu2...............................................................................................................................342
__vabsdiffu4...............................................................................................................................342
__vabsss2.................................................................................................................................. 343
__vabsss4.................................................................................................................................. 343
__vadd2......................................................................................................................................343
__vadd4......................................................................................................................................344
__vaddss2.................................................................................................................................. 344
__vaddss4.................................................................................................................................. 344
__vaddus2..................................................................................................................................345
__vaddus4..................................................................................................................................345
__vavgs2.................................................................................................................................... 345
__vavgs4.................................................................................................................................... 346
__vavgu2.................................................................................................................................... 346
__vavgu4.................................................................................................................................... 346
__vcmpeq2.................................................................................................................................347
__vcmpeq4.................................................................................................................................347
__vcmpges2...............................................................................................................................347
__vcmpges4...............................................................................................................................348
__vcmpgeu2...............................................................................................................................348

CUDA Math API vRelease Version | xxiv

__vcmpgeu4...............................................................................................................................348
__vcmpgts2................................................................................................................................349
__vcmpgts4................................................................................................................................349
__vcmpgtu2............................................................................................................................... 349
__vcmpgtu4............................................................................................................................... 350
__vcmples2................................................................................................................................350
__vcmples4................................................................................................................................350
__vcmpleu2............................................................................................................................... 351
__vcmpleu4............................................................................................................................... 351
__vcmplts2................................................................................................................................ 351
__vcmplts4................................................................................................................................ 352
__vcmpltu2................................................................................................................................ 352
__vcmpltu4................................................................................................................................ 352
__vcmpne2.................................................................................................................................353
__vcmpne4.................................................................................................................................353
__vhaddu2..................................................................................................................................353
__vhaddu4..................................................................................................................................354
__vmaxs2................................................................................................................................... 354
__vmaxs4................................................................................................................................... 354
__vmaxu2...................................................................................................................................355
__vmaxu4...................................................................................................................................355
__vmins2....................................................................................................................................355
__vmins4....................................................................................................................................356
__vminu2................................................................................................................................... 356
__vminu4................................................................................................................................... 356
__vneg2......................................................................................................................................357
__vneg4......................................................................................................................................357
__vnegss2.................................................................................................................................. 357
__vnegss4.................................................................................................................................. 357
__vsads2.................................................................................................................................... 358
__vsads4.................................................................................................................................... 358
__vsadu2.................................................................................................................................... 358
__vsadu4.................................................................................................................................... 359
__vseteq2...................................................................................................................................359
__vseteq4...................................................................................................................................359
__vsetges2................................................................................................................................. 360
__vsetges4................................................................................................................................. 360
__vsetgeu2.................................................................................................................................360

CUDA Math API vRelease Version | xxv

__vsetgeu4.................................................................................................................................361
__vsetgts2..................................................................................................................................361
__vsetgts4..................................................................................................................................361
__vsetgtu2................................................................................................................................. 362
__vsetgtu4................................................................................................................................. 362
__vsetles2.................................................................................................................................. 362
__vsetles4.................................................................................................................................. 363
__vsetleu2..................................................................................................................................363
__vsetleu4..................................................................................................................................363
__vsetlts2...................................................................................................................................364
__vsetlts4...................................................................................................................................364
__vsetltu2.................................................................................................................................. 364
__vsetltu4.................................................................................................................................. 365
__vsetne2...................................................................................................................................365
__vsetne4...................................................................................................................................365
__vsub2......................................................................................................................................366
__vsub4......................................................................................................................................366
__vsubss2.................................................................................................................................. 366
__vsubss4.................................................................................................................................. 367
__vsubus2.................................................................................................................................. 367
__vsubus4.................................................................................................................................. 367

CUDA Math API vRelease Version | xxvi

Chapter 1. Modules

Here is a list of all modules:

‣ Half Precision Intrinsics

‣ Half Arithmetic Functions
‣ Half2 Arithmetic Functions
‣ Half Comparison Functions
‣ Half2 Comparison Functions
‣ Half Precision Conversion and Data Movement
‣ Half Math Functions
‣ Half2 Math Functions
‣ Bfloat16 Precision Intrinsics
‣ Bfloat16 Arithmetic Functions
‣ Bfloat162 Arithmetic Functions
‣ Bfloat16 Comparison Functions
‣ Bfloat162 Comparison Functions
‣ Bfloat16 Precision Conversion and Data Movement
‣ Bfloat16 Math Functions
‣ Bfloat162 Math Functions
‣ Mathematical Functions
‣ Single Precision Mathematical Functions
‣ Double Precision Mathematical Functions
‣ Integer Mathematical Functions
‣ Single Precision Intrinsics
‣ Double Precision Intrinsics

CUDA Math API vRelease Version | 1

Modules

‣ Integer Intrinsics
‣ Type Casting Intrinsics
‣ SIMD Intrinsics

1.1. Half Precision Intrinsics

This section describes half precision intrinsic functions that are only supported in device code.
To use these functions, include the header file cuda_fp16.h in your program.

Half Arithmetic Functions

Half2 Arithmetic Functions
Half Comparison Functions
Half2 Comparison Functions
Half Precision Conversion and Data Movement
Half Math Functions
Half2 Math Functions
1.1.1. Half Arithmetic Functions
Half Precision Intrinsics
To use these functions, include the header file cuda_fp16.h in your program.

__device__ __half __habs (const __half a)

Calculates the absolute value of input half number and returns the result.

Parameters
a
- half. Is only being read.

CUDA Math API vRelease Version | 2

Modules

Returns
half

‣ The absolute value of a.

Description
Calculates the absolute value of input half number and returns the result.

__device__ __half __hadd (const __half a, const __half b)

Performs half addition in round-to-nearest-even mode.

Description
Performs half addition of inputs a and b, in round-to-nearest-even mode.

__device__ __half __hadd_rn (const __half a, const __half b)

Performs half addition in round-to-nearest-even mode.

Description
Performs half addition of inputs a and b, in round-to-nearest-even mode. Prevents floating-
point contractions of mul+add into fma.

__device__ __half __hadd_sat (const __half a, const __half b)
Performs half addition in round-to-nearest-even mode, with saturation to [0.0, 1.0].

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
half

‣ The sum of a and b, with respect to saturation.

Description
Performs half add of inputs a and b, in round-to-nearest-even mode, and clamps the result
to range [0.0, 1.0]. NaN results are flushed to +0.0.

CUDA Math API vRelease Version | 3

Modules

__device__ __half __hdiv (const __half a, const __half b)

Performs half division in round-to-nearest-even mode.

Description
Divides half input a by input b in round-to-nearest-even mode.

__device__ __half __hfma (const __half a, const __half b, const __half c)
Performs half fused multiply-add in round-to-nearest-even mode.

Description
Performs half multiply on inputs a and b, then performs a half add of the result with c,
rounding the result once in round-to-nearest-even mode.

__device__ __half __hfma_relu (const __half a, const __half b, const __half c)
Performs half fused multiply-add in round-to-nearest-even mode with relu saturation.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.
c
- half. Is only being read.

Returns
half

‣ The result of fused multiply-add operation on a, b, and c with relu saturation.

Description
Performs half multiply on inputs a and b, then performs a half add of the result with c,
rounding the result once in round-to-nearest-even mode. A negative result is then clamped to
0; a NaN result is converted to canonical NaN.

CUDA Math API vRelease Version | 4

Modules

__device__ __half __hfma_sat (const __half a, const __half b, const __half c)
Performs half fused multiply-add in round-to-nearest-even mode, with saturation to [0.0,
1.0].

Parameters
a
- half. Is only being read.
b
- half. Is only being read.
c
- half. Is only being read.

Returns
half

‣ The result of fused multiply-add operation on a, b, and c, with respect to saturation.

Description
Performs half multiply on inputs a and b, then performs a half add of the result with c,
rounding the result once in round-to-nearest-even mode, and clamps the result to range [0.0,
1.0]. NaN results are flushed to +0.0.

__device__ __half __hmul (const __half a, const __half b)

Performs half multiplication in round-to-nearest-even mode.

Description
Performs half multiplication of inputs a and b, in round-to-nearest-even mode.

__device__ __half __hmul_rn (const __half a, const __half b)
Performs half multiplication in round-to-nearest-even mode.

Description
Performs half multiplication of inputs a and b, in round-to-nearest-even mode. Prevents
floating-point contractions of mul+add or sub into fma.

CUDA Math API vRelease Version | 5

Modules

__device__ __half __hmul_sat (const __half a, const __half b)
Performs half multiplication in round-to-nearest-even mode, with saturation to [0.0, 1.0].

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
half

‣ The result of multiplying a and b, with respect to saturation.

Description
Performs half multiplication of inputs a and b, in round-to-nearest-even mode, and clamps the
result to range [0.0, 1.0]. NaN results are flushed to +0.0.

__device__ __half __hneg (const __half a)

Negates input half number and returns the result.

Description
Negates input half number and returns the result.

__device__ __half __hsub (const __half a, const __half b)

Performs half subtraction in round-to-nearest-even mode.

Description
Subtracts half input b from input a in round-to-nearest-even mode.

__device__ __half __hsub_rn (const __half a, const __half b)

Performs half subtraction in round-to-nearest-even mode.

Description
Subtracts half input b from input a in round-to-nearest-even mode. Prevents floating-point
contractions of mul+sub into fma.

CUDA Math API vRelease Version | 6

Modules

__device__ __half __hsub_sat (const __half a, const __half b)
Performs half subtraction in round-to-nearest-even mode, with saturation to [0.0, 1.0].

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
half

‣ The result of subtraction of b from a, with respect to saturation.

Description
Subtracts half input b from input a in round-to-nearest-even mode, and clamps the result to
range [0.0, 1.0]. NaN results are flushed to +0.0.

__device__ __half atomicAdd (const __half *address, const __half val)
Adds val to the value stored at address in global or shared memory, and writes the sum
back to address. The read-modify-write is performed as one atomic operation.

Parameters
address
- half*. An address in global or shared memory.
val
- half. The value to be added.

Returns
half

‣ The old value read from address.

Description
The location of address must be in global or shared memory. This operation has undefined
behavior otherwise. This operation is only supported by devices of compute capability 7.x and
higher.

CUDA Math API vRelease Version | 7

Modules

Note:

For more details on this function, see the Atomic Functions section in the CUDA C++
Programming Guide.

1.1.2. Half2 Arithmetic Functions

Half Precision Intrinsics
To use these functions, include the header file cuda_fp16.h in your program.

__device__ __half2 __h2div (const __half2 a, const __half2 b)
Performs half2 vector division in round-to-nearest-even mode.

Description
Divides half2 input vector a by input vector b in round-to-nearest-even mode.

__device__ __half2 __habs2 (const __half2 a)

Calculates the absolute value of both halves of the input half2 number and returns the
result.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ Returns a with the absolute value of both halves.

Description
Calculates the absolute value of both halves of the input half2 number and returns the
result.

CUDA Math API vRelease Version | 8

Modules

__device__ __half2 __hadd2 (const __half2 a, const __half2 b)
Performs half2 vector addition in round-to-nearest-even mode.

Description
Performs half2 vector add of inputs a and b, in round-to-nearest-even mode.

__device__ __half2 __hadd2_rn (const __half2 a, const __half2 b)
Performs half2 vector addition in round-to-nearest-even mode.

Description
Performs half2 vector add of inputs a and b, in round-to-nearest-even mode. Prevents
floating-point contractions of mul+add into fma.

__device__ __half2 __hadd2_sat (const __half2 a, const __half2 b)
Performs half2 vector addition in round-to-nearest-even mode, with saturation to [0.0, 1.0].

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The sum of a and b, with respect to saturation.

Description
Performs half2 vector add of inputs a and b, in round-to-nearest-even mode, and clamps the
results to range [0.0, 1.0]. NaN results are flushed to +0.0.

CUDA Math API vRelease Version | 9

Modules

__device__ __half2 __hcmadd (const __half2 a, const __half2 b, const __half2 c)
Performs fast complex multiply-accumulate.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.
c
- half2. Is only being read.

Returns
half2

‣ The result of complex multiply-accumulate operation on complex numbers a, b, and c

Description
Interprets vector half2 input pairs a, b, and c as complex numbers in half precision and
performs complex multiply-accumulate operation: a*b + c

__device__ __half2 __hfma2 (const __half2 a, const __half2 b, const __half2 c)
Performs half2 vector fused multiply-add in round-to-nearest-even mode.

Description
Performs half2 vector multiply on inputs a and b, then performs a half2 vector add of the
result with c, rounding the result once in round-to-nearest-even mode.

__device__ __half2 __hfma2_relu (const __half2 a, const __half2 b, const __half2 c)
Performs half2 vector fused multiply-add in round-to-nearest-even mode with relu
saturation.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

CUDA Math API vRelease Version | 10

Modules

c
- half2. Is only being read.

Returns
half2

‣ The result of elementwise fused multiply-add operation on vectors a, b, and c with relu
saturation.

Description
Performs half2 vector multiply on inputs a and b, then performs a half2 vector add of the
result with c, rounding the result once in round-to-nearest-even mode. A negative result is
then clamped to 0; a NaN result is converted to canonical NaN.

__device__ __half2 __hfma2_sat (const __half2 a, const __half2 b, const __half2 c)
Performs half2 vector fused multiply-add in round-to-nearest-even mode, with saturation to
[0.0, 1.0].

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.
c
- half2. Is only being read.

Returns
half2

‣ The result of elementwise fused multiply-add operation on vectors a, b, and c, with
respect to saturation.

Description
Performs half2 vector multiply on inputs a and b, then performs a half2 vector add of
the result with c, rounding the result once in round-to-nearest-even mode, and clamps the
results to range [0.0, 1.0]. NaN results are flushed to +0.0.

CUDA Math API vRelease Version | 11

Modules

__device__ __half2 __hmul2 (const __half2 a, const __half2 b)
Performs half2 vector multiplication in round-to-nearest-even mode.

Description
Performs half2 vector multiplication of inputs a and b, in round-to-nearest-even mode.

__device__ __half2 __hmul2_rn (const __half2 a, const __half2 b)
Performs half2 vector multiplication in round-to-nearest-even mode.

Description
Performs half2 vector multiplication of inputs a and b, in round-to-nearest-even mode.
Prevents floating-point contractions of mul+add or sub into fma.

__device__ __half2 __hmul2_sat (const __half2 a, const __half2 b)
Performs half2 vector multiplication in round-to-nearest-even mode, with saturation to [0.0,
1.0].

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The result of elementwise multiplication of vectors a and b, with respect to saturation.

Description
Performs half2 vector multiplication of inputs a and b, in round-to-nearest-even mode, and
clamps the results to range [0.0, 1.0]. NaN results are flushed to +0.0.

CUDA Math API vRelease Version | 12

Modules

__device__ __half2 __hneg2 (const __half2 a)

Negates both halves of the input half2 number and returns the result.

Description
Negates both halves of the input half2 number a and returns the result.

__device__ __half2 __hsub2 (const __half2 a, const __half2 b)
Performs half2 vector subtraction in round-to-nearest-even mode.

Description
Subtracts half2 input vector b from input vector a in round-to-nearest-even mode.

__device__ __half2 __hsub2_rn (const __half2 a, const __half2 b)
Performs half2 vector subtraction in round-to-nearest-even mode.

Description
Subtracts half2 input vector b from input vector a in round-to-nearest-even mode. Prevents
floating-point contractions of mul+sub into fma.

__device__ __half2 __hsub2_sat (const __half2 a, const __half2 b)
Performs half2 vector subtraction in round-to-nearest-even mode, with saturation to [0.0,
1.0].

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The subtraction of vector b from a, with respect to saturation.

CUDA Math API vRelease Version | 13

Modules

Description
Subtracts half2 input vector b from input vector a in round-to-nearest-even mode, and
clamps the results to range [0.0, 1.0]. NaN results are flushed to +0.0.

__device__ __half2 atomicAdd (const __half2 *address, const __half2 val)
Performs a vector add of val to the value stored at address in global or shared memory, and
writes the result back to address. The atomicity of the add operation is guaranteed separately
for each of the two __half elements; the entire __half2 is not guaranteed to be atomic as a
single 32-bit access.

Parameters
address
- half2*. An address in global or shared memory.
val
- half2. The value to be added.

Returns
half2

‣ The old value read from address.

Note:

For more details on this function, see the Atomic Functions section in the CUDA C++
Programming Guide.

1.1.3. Half Comparison Functions

Half Precision Intrinsics
To use these functions, include the header file cuda_fp16.h in your program.

CUDA Math API vRelease Version | 14

Modules

__device__ bool __heq (const __half a, const __half b)

Performs half if-equal comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of if-equal comparison of a and b.

Description
Performs half if-equal comparison of inputs a and b. NaN inputs generate false results.

__device__ bool __hequ (const __half a, const __half b)

Performs half unordered if-equal comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of unordered if-equal comparison of a and b.

Description
Performs half if-equal comparison of inputs a and b. NaN inputs generate true results.

__device__ bool __hge (const __half a, const __half b)

Performs half greater-equal comparison.

Parameters
a
- half. Is only being read.

CUDA Math API vRelease Version | 15

Modules

b
- half. Is only being read.

Returns
bool

‣ The boolean result of greater-equal comparison of a and b.

Description
Performs half greater-equal comparison of inputs a and b. NaN inputs generate false
results.

__device__ bool __hgeu (const __half a, const __half b)

Performs half unordered greater-equal comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of unordered greater-equal comparison of a and b.

Description
Performs half greater-equal comparison of inputs a and b. NaN inputs generate true
results.

__device__ bool __hgt (const __half a, const __half b)

Performs half greater-than comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

CUDA Math API vRelease Version | 16

Modules

‣ The boolean result of greater-than comparison of a and b.

Description
Performs half greater-than comparison of inputs a and b. NaN inputs generate false results.

__device__ bool __hgtu (const __half a, const __half b)

Performs half unordered greater-than comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of unordered greater-than comparison of a and b.

Description
Performs half greater-than comparison of inputs a and b. NaN inputs generate true results.

__device__ int __hisinf (const __half a)

Checks if the input half number is infinite.

Parameters
a
- half. Is only being read.

Returns
int

‣ -1 iff a is equal to negative infinity,

‣ 1 iff a is equal to positive infinity,
‣ 0 otherwise.

Description
Checks if the input half number a is infinite.

CUDA Math API vRelease Version | 17

Modules

__device__ bool __hisnan (const __half a)

Determine whether half argument is a NaN.

Parameters
a
- half. Is only being read.

Returns
bool

‣ true iff argument is NaN.

Description
Determine whether half value a is a NaN.

__device__ bool __hle (const __half a, const __half b)

Performs half less-equal comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of less-equal comparison of a and b.

Description
Performs half less-equal comparison of inputs a and b. NaN inputs generate false results.

__device__ bool __hleu (const __half a, const __half b)

Performs half unordered less-equal comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

CUDA Math API vRelease Version | 18

Modules

Returns
bool

‣ The boolean result of unordered less-equal comparison of a and b.

Description
Performs half less-equal comparison of inputs a and b. NaN inputs generate true results.

__device__ bool __hlt (const __half a, const __half b)

Performs half less-than comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of less-than comparison of a and b.

Description
Performs half less-than comparison of inputs a and b. NaN inputs generate false results.

__device__ bool __hltu (const __half a, const __half b)

Performs half unordered less-than comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of unordered less-than comparison of a and b.

Description
Performs half less-than comparison of inputs a and b. NaN inputs generate true results.

CUDA Math API vRelease Version | 19

Modules

__device__ __half __hmax (const __half a, const __half b)

Calculates half maximum of two input values.

Description
Calculates half max(a, b) defined as (a > b) ? a : b.

‣ If either of inputs is NaN, the other input is returned.

‣ If both inputs are NaNs, then canonical NaN is returned.
‣ If values of both inputs are 0.0, then +0.0 > -0.0

__device__ __half __hmax_nan (const __half a, const __half b)
Calculates half maximum of two input values, NaNs pass through.

Description
Calculates half max(a, b) defined as (a > b) ? a : b.

‣ If either of inputs is NaN, then canonical NaN is returned.

‣ If values of both inputs are 0.0, then +0.0 > -0.0

__device__ __half __hmin (const __half a, const __half b)

Calculates half minimum of two input values.

Description
Calculates half min(a, b) defined as (a < b) ? a : b.

‣ If either of inputs is NaN, the other input is returned.

‣ If both inputs are NaNs, then canonical NaN is returned.
‣ If values of both inputs are 0.0, then +0.0 > -0.0

__device__ __half __hmin_nan (const __half a, const __half b)
Calculates half minimum of two input values, NaNs pass through.

Description
Calculates half min(a, b) defined as (a < b) ? a : b.

‣ If either of inputs is NaN, then canonical NaN is returned.

CUDA Math API vRelease Version | 20

Modules

‣ If values of both inputs are 0.0, then +0.0 > -0.0

__device__ bool __hne (const __half a, const __half b)

Performs half not-equal comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of not-equal comparison of a and b.

Description
Performs half not-equal comparison of inputs a and b. NaN inputs generate false results.

__device__ bool __hneu (const __half a, const __half b)

Performs half unordered not-equal comparison.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
bool

‣ The boolean result of unordered not-equal comparison of a and b.

Description
Performs half not-equal comparison of inputs a and b. NaN inputs generate true results.

1.1.4. Half2 Comparison Functions

Half Precision Intrinsics
To use these functions, include the header file cuda_fp16.h in your program.

CUDA Math API vRelease Version | 21

Modules

__device__ bool __hbeq2 (const __half2 a, const __half2 b)

Performs half2 vector if-equal comparison and returns boolean true iff both half results
are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of if-equal comparison of vectors a and b are true;
‣ false otherwise.

Description
Performs half2 vector if-equal comparison of inputs a and b. The bool result is set to true
only if both half if-equal comparisons evaluate to true, or false otherwise. NaN inputs
generate false results.

__device__ bool __hbequ2 (const __half2 a, const __half2 b)

Performs half2 vector unordered if-equal comparison and returns boolean true iff both half
results are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of unordered if-equal comparison of vectors a and b are true;
‣ false otherwise.

CUDA Math API vRelease Version | 22

Modules

__device__ bool __hbge2 (const __half2 a, const __half2 b)

Performs half2 vector greater-equal comparison and returns boolean true iff both half
results are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of greater-equal comparison of vectors a and b are true;
‣ false otherwise.

Description
Performs half2 vector greater-equal comparison of inputs a and b. The bool result is set to
true only if both half greater-equal comparisons evaluate to true, or false otherwise. NaN
inputs generate false results.

__device__ bool __hbgeu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered greater-equal comparison and returns boolean true iff
both half results are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

CUDA Math API vRelease Version | 23

Modules

‣ true if both half results of unordered greater-equal comparison of vectors a and b are
true;

‣ false otherwise.

__device__ bool __hbgt2 (const __half2 a, const __half2 b)

Performs half2 vector greater-than comparison and returns boolean true iff both half
results are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of greater-than comparison of vectors a and b are true;
‣ false otherwise.

Description
Performs half2 vector greater-than comparison of inputs a and b. The bool result is set to
true only if both half greater-than comparisons evaluate to true, or false otherwise. NaN
inputs generate false results.

__device__ bool __hbgtu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered greater-than comparison and returns boolean true iff both
half results are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

CUDA Math API vRelease Version | 24

Modules

Returns
bool

‣ true if both half results of unordered greater-than comparison of vectors a and b are
true;

‣ false otherwise.

__device__ bool __hble2 (const __half2 a, const __half2 b)

Performs half2 vector less-equal comparison and returns boolean true iff both half results
are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of less-equal comparison of vectors a and b are true;
‣ false otherwise.

Description
Performs half2 vector less-equal comparison of inputs a and b. The bool result is set to true
only if both half less-equal comparisons evaluate to true, or false otherwise. NaN inputs
generate false results.

__device__ bool __hbleu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered less-equal comparison and returns boolean true iff both
half results are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.

CUDA Math API vRelease Version | 25

Modules

b
- half2. Is only being read.

Returns
bool

‣ true if both half results of unordered less-equal comparison of vectors a and b are true;
‣ false otherwise.

__device__ bool __hblt2 (const __half2 a, const __half2 b)

Performs half2 vector less-than comparison and returns boolean true iff both half results
are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of less-than comparison of vectors a and b are true;
‣ false otherwise.

Description
Performs half2 vector less-than comparison of inputs a and b. The bool result is set to true
only if both half less-than comparisons evaluate to true, or false otherwise. NaN inputs
generate false results.

CUDA Math API vRelease Version | 26

Modules

__device__ bool __hbltu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered less-than comparison and returns boolean true iff both
half results are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of unordered less-than comparison of vectors a and b are true;
‣ false otherwise.

__device__ bool __hbne2 (const __half2 a, const __half2 b)

Performs half2 vector not-equal comparison and returns boolean true iff both half results
are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of not-equal comparison of vectors a and b are true;
‣ false otherwise.

CUDA Math API vRelease Version | 27

Modules

Description
Performs half2 vector not-equal comparison of inputs a and b. The bool result is set to true
only if both half not-equal comparisons evaluate to true, or false otherwise. NaN inputs
generate false results.

device bool hbneu2 (const half2 a, const __half2 b)

Performs half2 vector unordered not-equal comparison and returns boolean true iff both
half results are true, boolean false otherwise.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
bool

‣ true if both half results of unordered not-equal comparison of vectors a and b are true;
‣ false otherwise.

device half2 heq2 (const half2 a, const half2 b)

Performs half2 vector if-equal comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The vector result of if-equal comparison of vectors a and b.

CUDA Math API vRelease Version | 28

Modules

Description
Performs half2 vector if-equal comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false results.

__device__ __half2 __hequ2 (const __half2 a, const __half2 b)

Performs half2 vector unordered if-equal comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The vector result of unordered if-equal comparison of vectors a and b.

Description
Performs half2 vector unordered if-equal comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true results.

device half2 hge2 (const half2 a, const half2 b)

Performs half2 vector greater-equal comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The vector result of greater-equal comparison of vectors a and b.

Description
Performs half2 vector greater-equal comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false results.

CUDA Math API vRelease Version | 29

Modules

__device__ __half2 __hgeu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered greater-equal comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The half2 vector result of unordered greater-equal comparison of vectors a and b.

Description
Performs half2 vector unordered greater-equal comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true results.

device half2 hgt2 (const half2 a, const half2 b)

Performs half2 vector greater-than comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The vector result of greater-than comparison of vectors a and b.

Description
Performs half2 vector greater-than comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false results.

CUDA Math API vRelease Version | 30

Modules

__device__ __half2 __hgtu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered greater-than comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The half2 vector result of unordered greater-than comparison of vectors a and b.

Description
Performs half2 vector unordered greater-than comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true results.

device half2 hisnan2 (const __half2 a)

Determine whether half2 argument is a NaN.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The half2 with the corresponding half results set to 1.0 for NaN, 0.0 otherwise.

Description
Determine whether each half of input half2 number a is a NaN.

device half2 hle2 (const half2 a, const half2 b)

Performs half2 vector less-equal comparison.

Parameters
a
- half2. Is only being read.

CUDA Math API vRelease Version | 31

Modules

b
- half2. Is only being read.

Returns
half2

‣ The half2 result of less-equal comparison of vectors a and b.

Description
Performs half2 vector less-equal comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false results.

__device__ __half2 __hleu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered less-equal comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The vector result of unordered less-equal comparison of vectors a and b.

Description
Performs half2 vector unordered less-equal comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true results.

device half2 hlt2 (const half2 a, const half2 b)

Performs half2 vector less-than comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

CUDA Math API vRelease Version | 32

Modules

Returns
half2

‣ The half2 vector result of less-than comparison of vectors a and b.

Description
Performs half2 vector less-than comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false results.

__device__ __half2 __hltu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered less-than comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The vector result of unordered less-than comparison of vectors a and b.

Description
Performs half2 vector unordered less-than comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true results.

__device__ __half2 __hmax2 (const __half2 a, const __half2 b)

Calculates half2 vector maximum of two inputs.

Description
Calculates half2 vector max(a, b). Elementwise half operation is defined as (a > b) ? a : b.

‣ If either of inputs is NaN, the other input is returned.

‣ If both inputs are NaNs, then canonical NaN is returned.
‣ If values of both inputs are 0.0, then +0.0 > -0.0
‣ Returns the result of elementwise maximum of vectors a and b.

CUDA Math API vRelease Version | 33

Modules

__device__ __half2 __hmax2_nan (const __half2 a, const __half2 b)

Calculates half2 vector maximum of two inputs, NaNs pass through.

Description
Calculates half2 vector max(a, b). Elementwise half operation is defined as (a > b) ? a : b.

‣ If either of inputs is NaN, then canonical NaN is returned.

‣ If values of both inputs are 0.0, then +0.0 > -0.0
‣ Returns the result of elementwise maximum of vectors a and b, with NaNs passing through.

__device__ __half2 __hmin2 (const __half2 a, const __half2 b)

Calculates half2 vector minimum of two inputs.

Description
Calculates half2 vector min(a, b). Elementwise half operation is defined as (a < b) ? a : b.

‣ If either of inputs is NaN, the other input is returned.

‣ If both inputs are NaNs, then canonical NaN is returned.
‣ If values of both inputs are 0.0, then +0.0 > -0.0
‣ Returns the result of elementwise minimum of vectors a and b.

__device__ __half2 __hmin2_nan (const __half2 a, const __half2 b)

Calculates half2 vector minimum of two inputs, NaNs pass through.

Description
Calculates half2 vector min(a, b). Elementwise half operation is defined as (a < b) ? a : b.

‣ If either of inputs is NaN, then canonical NaN is returned.

‣ If values of both inputs are 0.0, then +0.0 > -0.0
‣ Returns the result of elementwise minimum of vectors a and b, with NaNs passing through.

CUDA Math API vRelease Version | 34

Modules

device half2 hne2 (const half2 a, const half2 b)

Performs half2 vector not-equal comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The vector result of not-equal comparison of vectors a and b.

Description
Performs half2 vector not-equal comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false results.

__device__ __half2 __hneu2 (const __half2 a, const __half2 b)

Performs half2 vector unordered not-equal comparison.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The vector result of unordered not-equal comparison of vectors a and b.

Description
Performs half2 vector unordered not-equal comparison of inputs a and b. The corresponding half
results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true results.

CUDA Math API vRelease Version | 35

Modules

1.1.5. Half Precision Conversion and Data

Movement
Half Precision Intrinsics
To use these functions, include the header file cuda_fp16.h in your program.

hostdevice half double2half (const double a)

Converts double number to half precision in round-to-nearest-even mode and returns half
with converted value.

Parameters
a
- double. Is only being read.

Returns
half

‣ a converted to half.

Description
Converts double number a to half precision in round-to-nearest-even mode.

__host__ __device__ __half2 __float22half2_rn (const float2 a)

Converts both components of float2 number to half precision in round-to-nearest-even mode
and returns half2 with converted values.

Parameters
a
- float2. Is only being read.

Returns
half2

‣ The half2 which has corresponding halves equal to the converted float2 components.

Description
Converts both components of float2 to half precision in round-to-nearest-even mode and combines
the results into one half2 number. Low 16 bits of the return value correspond to a.x and
high 16 bits of the return value correspond to a.y.

CUDA Math API vRelease Version | 36

Modules

hostdevice half float2half (const float a)

Converts float number to half precision in round-to-nearest-even mode and returns half with
converted value.

Parameters
a
- float. Is only being read.

Returns
half

‣ a converted to half.

Description
Converts float number a to half precision in round-to-nearest-even mode.

hostdevice half2 float2half2_rn (const float a)

Converts input to half precision in round-to-nearest-even mode and populates both halves of
half2 with converted value.

Parameters
a
- float. Is only being read.

Returns
half2

‣ The half2 value with both halves equal to the converted half precision number.

Description
Converts input a to half precision in round-to-nearest-even mode and populates both halves of
half2 with converted value.

hostdevice half float2half_rd (const float a)

Converts float number to half precision in round-down mode and returns half with converted
value.

Parameters
a
- float. Is only being read.

CUDA Math API vRelease Version | 37

Modules

Returns
half

‣ a converted to half.

Description
Converts float number a to half precision in round-down mode.

hostdevice half float2half_rn (const float a)

Converts float number to half precision in round-to-nearest-even mode and returns half with
converted value.

Parameters
a
- float. Is only being read.

Returns
half

‣ a converted to half.

Description
Converts float number a to half precision in round-to-nearest-even mode.

hostdevice half float2half_ru (const float a)

Converts float number to half precision in round-up mode and returns half with converted
value.

Parameters
a
- float. Is only being read.

Returns
half

‣ a converted to half.

Description
Converts float number a to half precision in round-up mode.

CUDA Math API vRelease Version | 38

Modules

hostdevice half float2half_rz (const float a)

Converts float number to half precision in round-towards-zero mode and returns half with
converted value.

Parameters
a
- float. Is only being read.

Returns
half

‣ a converted to half.

Description
Converts float number a to half precision in round-towards-zero mode.

__host__ __device__ __half2 __floats2half2_rn (const float a, const float b)

Converts both input floats to half precision in round-to-nearest-even mode and returns half2
with converted values.

Parameters
a
- float. Is only being read.
b
- float. Is only being read.

Returns
half2

‣ The half2 value with corresponding halves equal to the converted input floats.

Description
Converts both input floats to half precision in round-to-nearest-even mode and combines the
results into one half2 number. Low 16 bits of the return value correspond to the input a, high
16 bits correspond to the input b.

CUDA Math API vRelease Version | 39

Modules

hostdevice float2 half22float2 (const half2 a)

Converts both halves of half2 to float2 and returns the result.

Parameters
a
- half2. Is only being read.

Returns
float2

‣ a converted to float2.

Description
Converts both halves of half2 input a to float2 and returns the result.

hostdevice float half2float (const half a)

Converts half number to float.

Parameters
a
- half. Is only being read.

Returns
float

‣ a converted to float.

Description
Converts half number a to float.

device half2 half2half2 (const __half a)

Returns half2 with both halves equal to the input value.

Parameters
a
- half. Is only being read.

Returns
half2

‣ The vector which has both its halves equal to the input a.

CUDA Math API vRelease Version | 40

Modules

Description
Returns half2 number with both halves equal to the input a half number.

device int __half2int_rd (const __half h)

Convert a half to a signed integer in round-down mode.

Parameters
h
- half. Is only being read.

Returns
int

‣ h converted to a signed integer.

Description
Convert the half-precision floating-point value h to a signed integer in round-down mode. NaN
inputs are converted to 0.

device int __half2int_rn (const __half h)

Convert a half to a signed integer in round-to-nearest-even mode.

Parameters
h
- half. Is only being read.

Returns
int

‣ h converted to a signed integer.

Description
Convert the half-precision floating-point value h to a signed integer in round-to-nearest-even
mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 41

Modules

device int __half2int_ru (const __half h)

Convert a half to a signed integer in round-up mode.

Parameters
h
- half. Is only being read.

Returns
int

‣ h converted to a signed integer.

Description
Convert the half-precision floating-point value h to a signed integer in round-up mode. NaN
inputs are converted to 0.

hostdevice int __half2int_rz (const __half h)

Convert a half to a signed integer in round-towards-zero mode.

Parameters
h
- half. Is only being read.

Returns
int

‣ h converted to a signed integer.

Description
Convert the half-precision floating-point value h to a signed integer in round-towards-zero
mode. NaN inputs are converted to 0.

device long long int __half2ll_rd (const __half h)

Convert a half to a signed 64-bit integer in round-down mode.

Parameters
h
- half. Is only being read.

CUDA Math API vRelease Version | 42

Modules

Returns
long long int

‣ h converted to a signed 64-bit integer.

Description
Convert the half-precision floating-point value h to a signed 64-bit integer in round-down
mode. NaN inputs return a long long int with hex value of 0x8000000000000000.

device long long int __half2ll_rn (const __half h)

Convert a half to a signed 64-bit integer in round-to-nearest-even mode.

Parameters
h
- half. Is only being read.

Returns
long long int

‣ h converted to a signed 64-bit integer.

Description
Convert the half-precision floating-point value h to a signed 64-bit integer in round-to-
nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000.

device long long int __half2ll_ru (const __half h)

Convert a half to a signed 64-bit integer in round-up mode.

Parameters
h
- half. Is only being read.

Returns
long long int

‣ h converted to a signed 64-bit integer.

Description
Convert the half-precision floating-point value h to a signed 64-bit integer in round-up mode.
NaN inputs return a long long int with hex value of 0x8000000000000000.

CUDA Math API vRelease Version | 43

Modules

__host__ __device__ long long int __half2ll_rz (const __half h)

Convert a half to a signed 64-bit integer in round-towards-zero mode.

Parameters
h
- half. Is only being read.

Returns
long long int

‣ h converted to a signed 64-bit integer.

Description
Convert the half-precision floating-point value h to a signed 64-bit integer in round-towards-
zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000.

device short int __half2short_rd (const __half h)

Convert a half to a signed short integer in round-down mode.

Parameters
h
- half. Is only being read.

Returns
short int

‣ h converted to a signed short integer.

Description
Convert the half-precision floating-point value h to a signed short integer in round-down
mode. NaN inputs are converted to 0.

device short int __half2short_rn (const __half h)

Convert a half to a signed short integer in round-to-nearest-even mode.

Parameters
h
- half. Is only being read.

CUDA Math API vRelease Version | 44

Modules

Returns
short int

‣ h converted to a signed short integer.

Description
Convert the half-precision floating-point value h to a signed short integer in round-to-nearest-
even mode. NaN inputs are converted to 0.

device short int __half2short_ru (const __half h)

Convert a half to a signed short integer in round-up mode.

Parameters
h
- half. Is only being read.

Returns
short int

‣ h converted to a signed short integer.

Description
Convert the half-precision floating-point value h to a signed short integer in round-up mode.
NaN inputs are converted to 0.

__host__ __device__ short int __half2short_rz (const __half h)

Convert a half to a signed short integer in round-towards-zero mode.

Parameters
h
- half. Is only being read.

Returns
short int

‣ h converted to a signed short integer.

CUDA Math API vRelease Version | 45

Modules

Description
Convert the half-precision floating-point value h to a signed short integer in round-towards-
zero mode. NaN inputs are converted to 0.

device unsigned int __half2uint_rd (const __half h)

Convert a half to an unsigned integer in round-down mode.

Parameters
h
- half. Is only being read.

Returns
unsigned int

‣ h converted to an unsigned integer.

Description
Convert the half-precision floating-point value h to an unsigned integer in round-down mode.
NaN inputs are converted to 0.

device unsigned int __half2uint_rn (const __half h)

Convert a half to an unsigned integer in round-to-nearest-even mode.

Parameters
h
- half. Is only being read.

Returns
unsigned int

‣ h converted to an unsigned integer.

Description
Convert the half-precision floating-point value h to an unsigned integer in round-to-nearest-
even mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 46

Modules

device unsigned int __half2uint_ru (const __half h)

Convert a half to an unsigned integer in round-up mode.

Parameters
h
- half. Is only being read.

Returns
unsigned int

‣ h converted to an unsigned integer.

Description
Convert the half-precision floating-point value h to an unsigned integer in round-up mode.
NaN inputs are converted to 0.

__host__ __device__ unsigned int __half2uint_rz (const __half h)

Convert a half to an unsigned integer in round-towards-zero mode.

Parameters
h
- half. Is only being read.

Returns
unsigned int

‣ h converted to an unsigned integer.

Description
Convert the half-precision floating-point value h to an unsigned integer in round-towards-zero
mode. NaN inputs are converted to 0.

__device__ unsigned long long int __half2ull_rd (const __half h)

Convert a half to an unsigned 64-bit integer in round-down mode.

Parameters
h
- half. Is only being read.

CUDA Math API vRelease Version | 47

Modules

Returns
unsigned long long int

‣ h converted to an unsigned 64-bit integer.

Description
Convert the half-precision floating-point value h to an unsigned 64-bit integer in round-down
mode. NaN inputs return 0x8000000000000000.

__device__ unsigned long long int __half2ull_rn (const __half h)

Convert a half to an unsigned 64-bit integer in round-to-nearest-even mode.

Parameters
h
- half. Is only being read.

Returns
unsigned long long int

‣ h converted to an unsigned 64-bit integer.

Description
Convert the half-precision floating-point value h to an unsigned 64-bit integer in round-to-
nearest-even mode. NaN inputs return 0x8000000000000000.

__device__ unsigned long long int __half2ull_ru (const __half h)

Convert a half to an unsigned 64-bit integer in round-up mode.

Parameters
h
- half. Is only being read.

Returns
unsigned long long int

‣ h converted to an unsigned 64-bit integer.

CUDA Math API vRelease Version | 48

Modules

Description
Convert the half-precision floating-point value h to an unsigned 64-bit integer in round-up
mode. NaN inputs return 0x8000000000000000.

__host__ __device__ unsigned long long int __half2ull_rz (const __half h)

Convert a half to an unsigned 64-bit integer in round-towards-zero mode.

Parameters
h
- half. Is only being read.

Returns
unsigned long long int

‣ h converted to an unsigned 64-bit integer.

Description
Convert the half-precision floating-point value h to an unsigned 64-bit integer in round-
towards-zero mode. NaN inputs return 0x8000000000000000.

__device__ unsigned short int __half2ushort_rd (const __half h)

Convert a half to an unsigned short integer in round-down mode.

Parameters
h
- half. Is only being read.

Returns
unsigned short int

‣ h converted to an unsigned short integer.

Description
Convert the half-precision floating-point value h to an unsigned short integer in round-down
mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 49

Modules

__device__ unsigned short int __half2ushort_rn (const __half h)

Convert a half to an unsigned short integer in round-to-nearest-even mode.

Parameters
h
- half. Is only being read.

Returns
unsigned short int

‣ h converted to an unsigned short integer.

Description
Convert the half-precision floating-point value h to an unsigned short integer in round-to-
nearest-even mode. NaN inputs are converted to 0.

__device__ unsigned short int __half2ushort_ru (const __half h)

Convert a half to an unsigned short integer in round-up mode.

Parameters
h
- half. Is only being read.

Returns
unsigned short int

‣ h converted to an unsigned short integer.

Description
Convert the half-precision floating-point value h to an unsigned short integer in round-up
mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 50

Modules

__host__ __device__ unsigned short int __half2ushort_rz (const __half h)

Convert a half to an unsigned short integer in round-towards-zero mode.

Parameters
h
- half. Is only being read.

Returns
unsigned short int

‣ h converted to an unsigned short integer.

Description
Convert the half-precision floating-point value h to an unsigned short integer in round-
towards-zero mode. NaN inputs are converted to 0.

device short int __half_as_short (const __half h)

Reinterprets bits in a half as a signed short integer.

Parameters
h
- half. Is only being read.

Returns
short int

‣ The reinterpreted value.

Description
Reinterprets the bits in the half-precision floating-point number h as a signed short integer.

__device__ unsigned short int __half_as_ushort (const __half h)

Reinterprets bits in a half as an unsigned short integer.

Parameters
h
- half. Is only being read.

CUDA Math API vRelease Version | 51

Modules

Returns
unsigned short int

‣ The reinterpreted value.

Description
Reinterprets the bits in the half-precision floating-point number h as an unsigned short integer.

__device__ __half2 __halves2half2 (const __half a, const __half b)

Combines two half numbers into one half2 number.

Parameters
a
- half. Is only being read.
b
- half. Is only being read.

Returns
half2

‣ The half2 with one half equal to a and the other to b.

Description
Combines two input half numbers a and b into one half2 number. Input a is stored in low 16
bits of the return value, input b is stored in high 16 bits of the return value.

hostdevice float high2float (const half2 a)

Converts high 16 bits of half2 to float and returns the result.

Parameters
a
- half2. Is only being read.

Returns
float

‣ The high 16 bits of a converted to float.

CUDA Math API vRelease Version | 52

Modules

Description
Converts high 16 bits of half2 input a to 32-bit floating-point number and returns the result.

device half high2half (const __half2 a)

Returns high 16 bits of half2 input.

Parameters
a
- half2. Is only being read.

Returns
half

‣ The high 16 bits of the input.

Description
Returns high 16 bits of half2 input a.

device half2 high2half2 (const __half2 a)

Extracts high 16 bits from half2 input.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The half2 with both halves equal to the high 16 bits of the input.

Description
Extracts high 16 bits from half2 input a and returns a new half2 number which has both
halves equal to the extracted bits.

CUDA Math API vRelease Version | 53

Modules

__device__ __half2 __highs2half2 (const __half2 a, const __half2 b)

Extracts high 16 bits from each of the two half2 inputs and combines into one half2
number.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The high 16 bits of a and of b.

Description
Extracts high 16 bits from each of the two half2 inputs and combines into one half2
number. The high 16 bits of input a are stored in the low 16 bits of the return value, and the high 16 bits of
input b are stored in the high 16 bits of the return value.

device half int2half_rd (const int i)

Convert a signed integer to a half in round-down mode.

Parameters
i
- int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the signed integer value i to a half-precision floating-point value in round-down
mode.

CUDA Math API vRelease Version | 54

Modules

hostdevice half int2half_rn (const int i)

Convert a signed integer to a half in round-to-nearest-even mode.

Parameters
i
- int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the signed integer value i to a half-precision floating-point value in round-to-nearest-
even mode.

device half int2half_ru (const int i)

Convert a signed integer to a half in round-up mode.

Parameters
i
- int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the signed integer value i to a half-precision floating-point value in round-up mode.

device half int2half_rz (const int i)

Convert a signed integer to a half in round-towards-zero mode.

Parameters
i
- int. Is only being read.

Returns
half

CUDA Math API vRelease Version | 55

Modules

‣ i converted to half.

Description
Convert the signed integer value i to a half-precision floating-point value in round-towards-
zero mode.

device half ldca (const __half *ptr)

Generates a `ld.global.ca` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

device half2 ldca (const __half2 *ptr)

Generates a `ld.global.ca` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

device half ldcg (const __half *ptr)

Generates a `ld.global.cg` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

CUDA Math API vRelease Version | 56

Modules

device half2 ldcg (const __half2 *ptr)

Generates a `ld.global.cg` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

device half ldcs (const __half *ptr)

Generates a `ld.global.cs` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

device half2 ldcs (const __half2 *ptr)

Generates a `ld.global.cs` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

device half ldcv (const __half *ptr)

Generates a `ld.global.cv` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

CUDA Math API vRelease Version | 57

Modules

device half2 ldcv (const __half2 *ptr)

Generates a `ld.global.cv` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

device half ldg (const __half *ptr)

Generates a `ld.global.nc` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

device half2 ldg (const __half2 *ptr)

Generates a `ld.global.nc` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

Description
defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)

device half ldlu (const __half *ptr)

Generates a `ld.global.lu` load instruction.

Parameters
ptr
- memory location

CUDA Math API vRelease Version | 58

Modules

Returns
The value pointed by `ptr`

device half2 ldlu (const __half2 *ptr)

Generates a `ld.global.lu` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed by `ptr`

device half ll2half_rd (const long long int i)

Convert a signed 64-bit integer to a half in round-down mode.

Parameters
i
- long long int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the signed 64-bit integer value i to a half-precision floating-point value in round-
down mode.

__host__ __device__ __half __ll2half_rn (const long long int i)

Convert a signed 64-bit integer to a half in round-to-nearest-even mode.

Parameters
i
- long long int. Is only being read.

Returns
half

CUDA Math API vRelease Version | 59

Modules

‣ i converted to half.

Description
Convert the signed 64-bit integer value i to a half-precision floating-point value in round-to-
nearest-even mode.

device half ll2half_ru (const long long int i)

Convert a signed 64-bit integer to a half in round-up mode.

Parameters
i
- long long int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the signed 64-bit integer value i to a half-precision floating-point value in round-up
mode.

device half ll2half_rz (const long long int i)

Convert a signed 64-bit integer to a half in round-towards-zero mode.

Parameters
i
- long long int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the signed 64-bit integer value i to a half-precision floating-point value in round-
towards-zero mode.

CUDA Math API vRelease Version | 60

Modules

hostdevice float low2float (const half2 a)

Converts low 16 bits of half2 to float and returns the result.

Parameters
a
- half2. Is only being read.

Returns
float

‣ The low 16 bits of a converted to float.

Description
Converts low 16 bits of half2 input a to 32-bit floating-point number and returns the result.

device half low2half (const __half2 a)

Returns low 16 bits of half2 input.

Parameters
a
- half2. Is only being read.

Returns
half

‣ Returns half which contains low 16 bits of the input a.

Description
Returns low 16 bits of half2 input a.

device half2 low2half2 (const __half2 a)

Extracts low 16 bits from half2 input.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The half2 with both halves equal to the low 16 bits of the input.

CUDA Math API vRelease Version | 61

Modules

Description
Extracts low 16 bits from half2 input a and returns a new half2 number which has both
halves equal to the extracted bits.

device half2 lowhigh2highlow (const __half2 a)

Swaps both halves of the half2 input.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ a with its halves being swapped.

Description
Swaps both halves of the half2 input and returns a new half2 number with swapped halves.

__device__ __half2 __lows2half2 (const __half2 a, const __half2 b)

Extracts low 16 bits from each of the two half2 inputs and combines into one half2 number.

Parameters
a
- half2. Is only being read.
b
- half2. Is only being read.

Returns
half2

‣ The low 16 bits of a and of b.

Description
Extracts low 16 bits from each of the two half2 inputs and combines into one half2 number.
The low 16 bits of input a are stored in the low 16 bits of the return value, and the low 16 bits of input b are
stored in the high 16 bits of the return value.

CUDA Math API vRelease Version | 62

Modules

__device__ __half __shfl_down_sync (const unsigned mask, const __half var, const unsigned int delta, const int width)

Exchange a variable between threads within a warp. Copy from a thread with higher ID relative
to the caller.

Parameters
mask
- unsigned int. Is only being read.
var
- half. Is only being read.
delta
- unsigned int. Is only being read.
width
- int. Is only being read.

Returns
Returns the 2-byte word referenced by var from the source thread ID as half. If the source
thread ID is out of range or the source thread has exited, the calling thread's own var is
returned.

Description
Calculates a source thread ID by adding delta to the caller's thread ID. The value of var held
by the resulting thread ID is returned: this has the effect of shifting var down the warp by
delta threads. If width is less than warpSize then each subsection of the warp behaves as a
separate entity with a starting logical thread ID of 0. As for __shfl_up_sync(), the ID number of
the source thread will not wrap around the value of width and so the upper delta threads will
remain unchanged.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

CUDA Math API vRelease Version | 63

Modules

__device__ __half2 __shfl_down_sync (const unsigned mask, const __half2 var, const unsigned int delta, const int width)
Exchange a variable between threads within a warp. Copy from a thread with higher ID relative
to the caller.

Parameters
mask
- unsigned int. Is only being read.
var
- half2. Is only being read.
delta
- unsigned int. Is only being read.
width
- int. Is only being read.

Returns
Returns the 4-byte word referenced by var from the source thread ID as half2. If the source
thread ID is out of range or the source thread has exited, the calling thread's own var is
returned.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

CUDA Math API vRelease Version | 64

Modules

__device__ __half __shfl_sync (const unsigned mask, const __half var, const int delta, const int width)
Exchange a variable between threads within a warp. Direct copy from indexed thread.

Parameters
mask
- unsigned int. Is only being read.
var
- half. Is only being read.
delta
- int. Is only being read.
width
- int. Is only being read.

Returns
Returns the 2-byte word referenced by var from the source thread ID as half. If the source
thread ID is out of range or the source thread has exited, the calling thread's own var is
returned.

Description
Returns the value of var held by the thread whose ID is given by delta. If width is less than
warpSize then each subsection of the warp behaves as a separate entity with a starting logical
thread ID of 0. If delta is outside the range [0:width-1], the value returned corresponds to the
value of var held by the thread whose ID is delta modulo width (i.e. within the same subsection). width must have
a value which is a power of 2; results are undefined if width is not a power of 2, or is a number
greater than warpSize.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

__device__ __half2 __shfl_sync (const unsigned mask, const __half2 var, const int delta, const int width)
Exchange a variable between threads within a warp. Direct copy from indexed thread.

Parameters
mask
- unsigned int. Is only being read.

CUDA Math API vRelease Version | 65

Modules

var
- half2. Is only being read.
delta
- int. Is only being read.
width
- int. Is only being read.

__device__ __half __shfl_up_sync (const unsigned mask, const __half var, const unsigned int delta, const int width)
Exchange a variable between threads within a warp. Copy from a thread with lower ID relative
to the caller.

Parameters
mask
- unsigned int. Is only being read.
var
- half. Is only being read.
delta
- unsigned int. Is only being read.
width
- int. Is only being read.

Returns
Returns the 2-byte word referenced by var from the source thread ID as half. If the source
thread ID is out of range or the source thread has exited, the calling thread's own var is
returned.

CUDA Math API vRelease Version | 66

Modules

Description
Calculates a source thread ID by subtracting delta from the caller's lane ID. The value of var
held by the resulting lane ID is returned: in effect, var is shifted up the warp by delta threads.
If width is less than warpSize then each subsection of the warp behaves as a separate entity
with a starting logical thread ID of 0. The source thread index will not wrap around the value of
width, so effectively the lower delta threads will be unchanged. width must have a value which
is a power of 2; results are undefined if width is not a power of 2, or is a number greater than
warpSize.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

__device__ __half2 __shfl_up_sync (const unsigned mask, const __half2 var, const unsigned int delta, const int width)
Exchange a variable between threads within a warp. Copy from a thread with lower ID relative
to the caller.

Parameters
mask
- unsigned int. Is only being read.
var
- half2. Is only being read.
delta
- unsigned int. Is only being read.
width
- int. Is only being read.

CUDA Math API vRelease Version | 67

Modules

width must have a value which is a power of 2; results are undefined if width is not a power of 2,
or is a number greater than warpSize.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

__device__ __half __shfl_xor_sync (const unsigned mask, const __half var, const int delta, const int width)
Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR
of own thread ID.

Parameters
mask
- unsigned int. Is only being read.
var
- half. Is only being read.
delta
- int. Is only being read.
width
- int. Is only being read.

Returns
Returns the 2-byte word referenced by var from the source thread ID as half. If the source
thread ID is out of range or the source thread has exited, the calling thread's own var is
returned.

Description
Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta:
the value of var held by the resulting thread ID is returned. If width is less than warpSize then
each group of width consecutive threads are able to access elements from earlier groups of
threads, however if they attempt to access elements from later groups of threads their own
value of var will be returned. This mode implements a butterfly addressing pattern such as is
used in tree reduction and broadcast.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

CUDA Math API vRelease Version | 68

Modules

__device__ __half2 __shfl_xor_sync (const unsigned mask, const __half2 var, const int delta, const int width)
Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR
of own thread ID.

Parameters
mask
- unsigned int. Is only being read.
var
- half2. Is only being read.
delta
- int. Is only being read.
width
- int. Is only being read.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

__device__ __half __short2half_rd (const short int i)

Convert a signed short integer to a half in round-down mode.

Parameters
i
- short int. Is only being read.

CUDA Math API vRelease Version | 69

Modules

Returns
half

‣ i converted to half.

Description
Convert the signed short integer value i to a half-precision floating-point value in round-down
mode.

__host__ __device__ __half __short2half_rn (const short int i)
Convert a signed short integer to a half in round-to-nearest-even mode.

Parameters
i
- short int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the signed short integer value i to a half-precision floating-point value in round-to-
nearest-even mode.

__device__ __half __short2half_ru (const short int i)

Convert a signed short integer to a half in round-up mode.

Parameters
i
- short int. Is only being read.

Returns
half

‣ i converted to half.

CUDA Math API vRelease Version | 70

Modules

Description
Convert the signed short integer value i to a half-precision floating-point value in round-up
mode.

__device__ __half __short2half_rz (const short int i)

Convert a signed short integer to a half in round-towards-zero mode.

Parameters
i
- short int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the signed short integer value i to a half-precision floating-point value in round-
towards-zero mode.

__device__ __half __short_as_half (const short int i)

Reinterprets bits in a signed short integer as a half.

Parameters
i
- short int. Is only being read.

Returns
half

‣ The reinterpreted value.

Description
Reinterprets the bits in the signed short integer i as a half-precision floating-point number.

CUDA Math API vRelease Version | 71

Modules

__device__ void __stcg (const __half *ptr, const __half value)
Generates a `st.global.cg` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

__device__ void __stcg (const __half2 *ptr, const __half2 value)
Generates a `st.global.cg` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

__device__ void __stcs (const __half *ptr, const __half value)
Generates a `st.global.cs` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

__device__ void __stcs (const __half2 *ptr, const __half2 value)
Generates a `st.global.cs` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

CUDA Math API vRelease Version | 72

Modules

__device__ void __stwb (const __half *ptr, const __half value)
Generates a `st.global.wb` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

__device__ void __stwb (const __half2 *ptr, const __half2 value)
Generates a `st.global.wb` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

__device__ void __stwt (const __half *ptr, const __half value)
Generates a `st.global.wt` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

__device__ void __stwt (const __half2 *ptr, const __half2 value)
Generates a `st.global.wt` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

CUDA Math API vRelease Version | 73

Modules

__device__ __half __uint2half_rd (const unsigned int i)

Convert an unsigned integer to a half in round-down mode.

Parameters
i
- unsigned int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned integer value i to a half-precision floating-point value in round-down
mode.

__host__ __device__ __half __uint2half_rn (const unsigned int i)
Convert an unsigned integer to a half in round-to-nearest-even mode.

Parameters
i
- unsigned int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned integer value i to a half-precision floating-point value in round-to-
nearest-even mode.

__device__ __half __uint2half_ru (const unsigned int i)

Convert an unsigned integer to a half in round-up mode.

Parameters
i
- unsigned int. Is only being read.

CUDA Math API vRelease Version | 74

Modules

Returns
half

‣ i converted to half.

Description
Convert the unsigned integer value i to a half-precision floating-point value in round-up
mode.

__device__ __half __uint2half_rz (const unsigned int i)

Convert an unsigned integer to a half in round-towards-zero mode.

Parameters
i
- unsigned int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned integer value i to a half-precision floating-point value in round-towards-
zero mode.

__device__ __half __ull2half_rd (const unsigned long long int i)
Convert an unsigned 64-bit integer to a half in round-down mode.

Parameters
i
- unsigned long long int. Is only being read.

Returns
half

‣ i converted to half.

CUDA Math API vRelease Version | 75

Modules

Description
Convert the unsigned 64-bit integer value i to a half-precision floating-point value in round-
down mode.

__host__ __device__ __half __ull2half_rn (const unsigned long long int i)
Convert an unsigned 64-bit integer to a half in round-to-nearest-even mode.

Parameters
i
- unsigned long long int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned 64-bit integer value i to a half-precision floating-point value in round-
to-nearest-even mode.

__device__ __half __ull2half_ru (const unsigned long long int i)
Convert an unsigned 64-bit integer to a half in round-up mode.

Parameters
i
- unsigned long long int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned 64-bit integer value i to a half-precision floating-point value in round-up
mode.

CUDA Math API vRelease Version | 76

Modules

__device__ __half __ull2half_rz (const unsigned long long int i)
Convert an unsigned 64-bit integer to a half in round-towards-zero mode.

Parameters
i
- unsigned long long int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned 64-bit integer value i to a half-precision floating-point value in round-
towards-zero mode.

__device__ __half __ushort2half_rd (const unsigned short int i)
Convert an unsigned short integer to a half in round-down mode.

Parameters
i
- unsigned short int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned short integer value i to a half-precision floating-point value in round-
down mode.

CUDA Math API vRelease Version | 77

Modules

__host__ __device__ __half __ushort2half_rn (const unsigned short int i)
Convert an unsigned short integer to a half in round-to-nearest-even mode.

Parameters
i
- unsigned short int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned short integer value i to a half-precision floating-point value in round-to-
nearest-even mode.

__device__ __half __ushort2half_ru (const unsigned short int i)
Convert an unsigned short integer to a half in round-up mode.

Parameters
i
- unsigned short int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned short integer value i to a half-precision floating-point value in round-up
mode.

CUDA Math API vRelease Version | 78

Modules

__device__ __half __ushort2half_rz (const unsigned short int i)
Convert an unsigned short integer to a half in round-towards-zero mode.

Parameters
i
- unsigned short int. Is only being read.

Returns
half

‣ i converted to half.

Description
Convert the unsigned short integer value i to a half-precision floating-point value in round-
towards-zero mode.

__device__ __half __ushort_as_half (const unsigned short int i)
Reinterprets bits in an unsigned short integer as a half.

Parameters
i
- unsigned short int. Is only being read.

Returns
half

‣ The reinterpreted value.

Description
Reinterprets the bits in the unsigned short integer i as a half-precision floating-point number.

1.1.6. Half Math Functions

Half Precision Intrinsics
To use these functions, include the header file cuda_fp16.h in your program.

CUDA Math API vRelease Version | 79

Modules

__device__ __half hceil (const __half h)

Calculate ceiling of the input argument.

Parameters
h
- half. Is only being read.

Returns
half

‣ The smallest integer value not less than h.

Description
Compute the smallest integer value not less than h.

__device__ __half hcos (const __half a)

Calculates half cosine in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The cosine of a.

Description
Calculates half cosine of input a in round-to-nearest-even mode.

__device__ __half hexp (const __half a)

Calculates half natural exponential function in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The natural exponential function on a.

CUDA Math API vRelease Version | 80

Modules

Description
Calculates half natural exponential function of input a in round-to-nearest-even mode.

__device__ __half hexp10 (const __half a)

Calculates half decimal exponential function in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The decimal exponential function on a.

Description
Calculates half decimal exponential function of input a in round-to-nearest-even mode.

__device__ __half hexp2 (const __half a)

Calculates half binary exponential function in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The binary exponential function on a.

Description
Calculates half binary exponential function of input a in round-to-nearest-even mode.

__device__ __half hfloor (const __half h)

Calculate the largest integer less than or equal to h.

Parameters
h
- half. Is only being read.

CUDA Math API vRelease Version | 81

Modules

Returns
half

‣ The largest integer value which is less than or equal to h.

Description
Calculate the largest integer value which is less than or equal to h.

__device__ __half hlog (const __half a)

Calculates half natural logarithm in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The natural logarithm of a.

Description
Calculates half natural logarithm of input a in round-to-nearest-even mode.

__device__ __half hlog10 (const __half a)

Calculates half decimal logarithm in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The decimal logarithm of a.

Description
Calculates half decimal logarithm of input a in round-to-nearest-even mode.

CUDA Math API vRelease Version | 82

Modules

__device__ __half hlog2 (const __half a)

Calculates half binary logarithm in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The binary logarithm of a.

Description
Calculates half binary logarithm of input a in round-to-nearest-even mode.

__device__ __half hrcp (const __half a)

Calculates half reciprocal in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The reciprocal of a.

Description
Calculates half reciprocal of input a in round-to-nearest-even mode.

__device__ __half hrint (const __half h)

Round input to nearest integer value in half-precision floating-point number.

Parameters
h
- half. Is only being read.

Returns
half

‣ The nearest integer to h.

CUDA Math API vRelease Version | 83

Modules

Description
Round h to the nearest integer value in half-precision floating-point format, with halfway
cases rounded to the nearest even integer value.

__device__ __half hrsqrt (const __half a)

Calculates half reciprocal square root in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The reciprocal square root of a.

Description
Calculates half reciprocal square root of input a in round-to-nearest-even mode.

__device__ __half hsin (const __half a)

Calculates half sine in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The sine of a.

Description
Calculates half sine of input a in round-to-nearest-even mode.

CUDA Math API vRelease Version | 84

Modules

__device__ __half hsqrt (const __half a)

Calculates half square root in round-to-nearest-even mode.

Parameters
a
- half. Is only being read.

Returns
half

‣ The square root of a.

Description
Calculates half square root of input a in round-to-nearest-even mode.

__device__ __half htrunc (const __half h)

Truncate input argument to the integral part.

Parameters
h
- half. Is only being read.

Returns
half

‣ The truncated integer value.

Description
Round h to the nearest integer value that does not exceed h in magnitude.

1.1.7. Half2 Math Functions

Half Precision Intrinsics
To use these functions, include the header file cuda_fp16.h in your program.

CUDA Math API vRelease Version | 85

Modules

__device__ __half2 h2ceil (const __half2 h)

Calculate half2 vector ceiling of the input argument.

Parameters
h
- half2. Is only being read.

Returns
half2

‣ The vector of smallest integers not less than h.

Description
For each component of vector h compute the smallest integer value not less than h.

__device__ __half2 h2cos (const __half2 a)

Calculates half2 vector cosine in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise cosine on vector a.

Description
Calculates half2 cosine of input vector a in round-to-nearest-even mode.

__device__ __half2 h2exp (const __half2 a)

Calculates half2 vector exponential function in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise exponential function on vector a.

CUDA Math API vRelease Version | 86

Modules

Description
Calculates half2 exponential function of input vector a in round-to-nearest-even mode.

__device__ __half2 h2exp10 (const __half2 a)

Calculates half2 vector decimal exponential function in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise decimal exponential function on vector a.

Description
Calculates half2 decimal exponential function of input vector a in round-to-nearest-even
mode.

__device__ __half2 h2exp2 (const __half2 a)

Calculates half2 vector binary exponential function in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise binary exponential function on vector a.

Description
Calculates half2 binary exponential function of input vector a in round-to-nearest-even
mode.

CUDA Math API vRelease Version | 87

Modules

__device__ __half2 h2floor (const __half2 h)

Calculate, for each component of half2 vector h, the largest integer less than or equal to that component.

Parameters
h
- half2. Is only being read.

Returns
half2

‣ The vector of largest integers which is less than or equal to h.

Description
For each component of vector h calculate the largest integer value which is less than or equal
to h.

__device__ __half2 h2log (const __half2 a)

Calculates half2 vector natural logarithm in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise natural logarithm on vector a.

Description
Calculates half2 natural logarithm of input vector a in round-to-nearest-even mode.

__device__ __half2 h2log10 (const __half2 a)

Calculates half2 vector decimal logarithm in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

CUDA Math API vRelease Version | 88

Modules

‣ The elementwise decimal logarithm on vector a.

Description
Calculates half2 decimal logarithm of input vector a in round-to-nearest-even mode.

__device__ __half2 h2log2 (const __half2 a)

Calculates half2 vector binary logarithm in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise binary logarithm on vector a.

Description
Calculates half2 binary logarithm of input vector a in round-to-nearest-even mode.

__device__ __half2 h2rcp (const __half2 a)

Calculates half2 vector reciprocal in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise reciprocal on vector a.

Description
Calculates half2 reciprocal of input vector a in round-to-nearest-even mode.

CUDA Math API vRelease Version | 89

Modules

__device__ __half2 h2rint (const __half2 h)

Round input to nearest integer value in half-precision floating-point number.

Parameters
h
- half2. Is only being read.

Returns
half2

‣ The vector of rounded integer values.

Description
Round each component of half2 vector h to the nearest integer value in half-precision
floating-point format, with halfway cases rounded to the nearest even integer value.

__device__ __half2 h2rsqrt (const __half2 a)

Calculates half2 vector reciprocal square root in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise reciprocal square root on vector a.

Description
Calculates half2 reciprocal square root of input vector a in round-to-nearest-even mode.

__device__ __half2 h2sin (const __half2 a)

Calculates half2 vector sine in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

CUDA Math API vRelease Version | 90

Modules

‣ The elementwise sine on vector a.

Description
Calculates half2 sine of input vector a in round-to-nearest-even mode.

__device__ __half2 h2sqrt (const __half2 a)

Calculates half2 vector square root in round-to-nearest-even mode.

Parameters
a
- half2. Is only being read.

Returns
half2

‣ The elementwise square root on vector a.

Description
Calculates half2 square root of input vector a in round-to-nearest-even mode.

__device__ __half2 h2trunc (const __half2 h)

Truncate half2 vector input argument to the integral part.

Parameters
h
- half2. Is only being read.

Returns
half2

‣ The truncated h.

Description
Round each component of vector h to the nearest integer value that does not exceed h in
magnitude.

1.2. Bfloat16 Precision Intrinsics

This section describes nv_bfloat16 precision intrinsic functions that are only supported in
device code. To use these functions, include the header file cuda_bf16.h in your program.

CUDA Math API vRelease Version | 91

Modules

Bfloat16 Arithmetic Functions

Bfloat162 Arithmetic Functions
Bfloat16 Comparison Functions
Bfloat162 Comparison Functions
Bfloat16 Precision Conversion and Data
Movement
Bfloat16 Math Functions
Bfloat162 Math Functions
1.2.1. Bfloat16 Arithmetic Functions
Bfloat16 Precision Intrinsics
To use these functions, include the header file cuda_bf16.h in your program.

__device__ __nv_bfloat162 __h2div (const __nv_bfloat162 a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector division in round-to-nearest-even mode.

Description
Divides nv_bfloat162 input vector a by input vector b in round-to-nearest-even mode.

__device__ __nv_bfloat16 __habs (const __nv_bfloat16 a)

Calculates the absolute value of input nv_bfloat16 number and returns the result.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

CUDA Math API vRelease Version | 92

Modules

‣ The absolute value of a.

Description
Calculates the absolute value of input nv_bfloat16 number and returns the result.

__device__ __nv_bfloat16 __hadd (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 addition in round-to-nearest-even mode.

Description
Performs nv_bfloat16 addition of inputs a and b, in round-to-nearest-even mode.

__device__ __nv_bfloat16 __hadd_rn (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 addition in round-to-nearest-even mode.

Description
Performs nv_bfloat16 addition of inputs a and b, in round-to-nearest-even mode. Prevents
floating-point contractions of mul+add into fma.

__device__ __nv_bfloat16 __hadd_sat (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 addition in round-to-nearest-even mode, with saturation to [0.0, 1.0].

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The sum of a and b, with respect to saturation.

Description
Performs nv_bfloat16 add of inputs a and b, in round-to-nearest-even mode, and clamps
the result to range [0.0, 1.0]. NaN results are flushed to +0.0.

CUDA Math API vRelease Version | 93

Modules

__device__ __nv_bfloat16 __hdiv (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 division in round-to-nearest-even mode.

Description
Divides nv_bfloat16 input a by input b in round-to-nearest-even mode.

__device__ __nv_bfloat16 __hfma (const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
Performs nv_bfloat16 fused multiply-add in round-to-nearest-even mode.

Description
Performs nv_bfloat16 multiply on inputs a and b, then performs a nv_bfloat16 add of
the result with c, rounding the result once in round-to-nearest-even mode.

__device__ __nv_bfloat16 __hfma_relu (const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
Performs nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu
saturation.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.
c
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The result of fused multiply-add operation on a, b, and c with relu saturation.

Description
Performs nv_bfloat16 multiply on inputs a and b, then performs a nv_bfloat16 add of
the result with c, rounding the result once in round-to-nearest-even mode. Then negative
result is clamped to 0. NaN result is converted to canonical NaN.

CUDA Math API vRelease Version | 94

Modules

__device__ __nv_bfloat16 __hfma_sat (const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
Performs nv_bfloat16 fused multiply-add in round-to-nearest-even mode, with saturation
to [0.0, 1.0].

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.
c
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The result of fused multiply-add operation on a, b, and c, with respect to saturation.

Description
Performs nv_bfloat16 multiply on inputs a and b, then performs a nv_bfloat16 add of
the result with c, rounding the result once in round-to-nearest-even mode, and clamps the
result to range [0.0, 1.0]. NaN results are flushed to +0.0.

__device__ __nv_bfloat16 __hmul (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 multiplication in round-to-nearest-even mode.

Description
Performs nv_bfloat16 multiplication of inputs a and b, in round-to-nearest-even mode.

__device__ __nv_bfloat16 __hmul_rn (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 multiplication in round-to-nearest-even mode.

Description
Performs nv_bfloat16 multiplication of inputs a and b, in round-to-nearest-even mode. Prevents
floating-point contractions of mul+add or sub into fma.

CUDA Math API vRelease Version | 95

Modules

__device__ __nv_bfloat16 __hmul_sat (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 multiplication in round-to-nearest-even mode, with saturation to
[0.0, 1.0].

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The result of multiplying a and b, with respect to saturation.

Description
Performs nv_bfloat16 multiplication of inputs a and b, in round-to-nearest-even mode, and
clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.

__device__ __nv_bfloat16 __hneg (const __nv_bfloat16 a)

Negates input nv_bfloat16 number and returns the result.

Description
Negates input nv_bfloat16 number and returns the result.

__device__ __nv_bfloat16 __hsub (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 subtraction in round-to-nearest-even mode.

Description
Subtracts nv_bfloat16 input b from input a in round-to-nearest-even mode.

CUDA Math API vRelease Version | 96

Modules

__device__ __nv_bfloat16 __hsub_rn (const __nv_bfloat16 a, const __nv_bfloat16 b)
Performs nv_bfloat16 subtraction in round-to-nearest-even mode.

Description
Subtracts nv_bfloat16 input b from input a in round-to-nearest-even mode. Prevents floating-
point contractions of mul+sub into fma.

__device__ __nv_bfloat16 __hsub_sat (const __nv_bfloat16

a, const __nv_bfloat16 b)
Performs nv_bfloat16 subtraction in round-to-nearest-even mode, with saturation to [0.0,
1.0].

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The result of subtraction of b from a, with respect to saturation.

Description
Subtracts nv_bfloat16 input b from input a in round-to-nearest-even mode, and clamps the
result to range [0.0, 1.0]. NaN results are flushed to +0.0.

__device__ __nv_bfloat16 atomicAdd (const __nv_bfloat16

*address, const __nv_bfloat16 val)
Adds val to the value stored at address in global or shared memory, and writes this value
back to address. This operation is performed in one atomic operation.

Parameters
address
- __nv_bfloat16*. An address in global or shared memory.
val
- __nv_bfloat16. The value to be added.

CUDA Math API vRelease Version | 97

Modules

Returns
__nv_bfloat16

‣ The old value read from address.

Note:

For more details for this function see the Atomic Functions section in the CUDA C++
Programming Guide.

1.2.2. Bfloat162 Arithmetic Functions

Bfloat16 Precision Intrinsics
To use these functions, include the header file cuda_bf16.h in your program.

__device__ __nv_bfloat162 __habs2 (const __nv_bfloat162

a)
Calculates the absolute value of both halves of the input nv_bfloat162 number and returns
the result.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ Returns a with the absolute value of both halves.

Description
Calculates the absolute value of both halves of the input nv_bfloat162 number and returns
the result.

CUDA Math API vRelease Version | 98

Modules

__device__ __nv_bfloat162 __hadd2 (const __nv_bfloat162

a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector addition in round-to-nearest-even mode.

Description
Performs nv_bfloat162 vector add of inputs a and b, in round-to-nearest-even mode.

__device__ __nv_bfloat162 __hadd2_rn (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector addition in round-to-nearest-even mode.

Description
Performs nv_bfloat162 vector add of inputs a and b, in round-to-nearest-even mode. Prevents
floating-point contractions of mul+add into fma.

__device__ __nv_bfloat162 __hadd2_sat (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector addition in round-to-nearest-even mode, with saturation to
[0.0, 1.0].

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The sum of a and b, with respect to saturation.

Description
Performs nv_bfloat162 vector add of inputs a and b, in round-to-nearest-even mode, and
clamps the results to range [0.0, 1.0]. NaN results are flushed to +0.0.

CUDA Math API vRelease Version | 99

Modules

__device__ __nv_bfloat162 __hcmadd (const __nv_bfloat162

a, const __nv_bfloat162 b, const __nv_bfloat162 c)
Performs fast complex multiply-accumulate.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.
c
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The result of complex multiply-accumulate operation on complex numbers a, b, and c

Description
Interprets vector nv_bfloat162 input pairs a, b, and c as complex numbers in
nv_bfloat16 precision and performs complex multiply-accumulate operation: a*b + c

__device__ __nv_bfloat162 __hfma2 (const __nv_bfloat162

a, const __nv_bfloat162 b, const __nv_bfloat162 c)
Performs nv_bfloat162 vector fused multiply-add in round-to-nearest-even mode.

Description
Performs nv_bfloat162 vector multiply on inputs a and b, then performs a nv_bfloat162
vector add of the result with c, rounding the result once in round-to-nearest-even mode.

__device__ __nv_bfloat162 __hfma2_relu (const

__nv_bfloat162 a, const __nv_bfloat162 b, const
__nv_bfloat162 c)
Performs nv_bfloat162 vector fused multiply-add in round-to-nearest-even mode with relu
saturation.

Parameters
a
- nv_bfloat162. Is only being read.

CUDA Math API vRelease Version | 100

Modules

b
- nv_bfloat162. Is only being read.
c
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The result of elementwise fused multiply-add operation on vectors a, b, and c with relu
saturation.

Description
Performs nv_bfloat162 vector multiply on inputs a and b, then performs a nv_bfloat162
vector add of the result with c, rounding the result once in round-to-nearest-even mode. A
negative result is then clamped to 0, and a NaN result is converted to canonical NaN.

__device__ __nv_bfloat162 __hfma2_sat (const

__nv_bfloat162 a, const __nv_bfloat162 b, const
__nv_bfloat162 c)
Performs nv_bfloat162 vector fused multiply-add in round-to-nearest-even mode, with
saturation to [0.0, 1.0].

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.
c
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The result of elementwise fused multiply-add operation on vectors a, b, and c, with respect to saturation.

Description
Performs nv_bfloat162 vector multiply on inputs a and b, then performs a nv_bfloat162
vector add of the result with c, rounding the result once in round-to-nearest-even mode, and
clamps the results to range [0.0, 1.0]. NaN results are flushed to +0.0.

CUDA Math API vRelease Version | 101

Modules

__device__ __nv_bfloat162 __hmul2 (const __nv_bfloat162

a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector multiplication in round-to-nearest-even mode.

Description
Performs nv_bfloat162 vector multiplication of inputs a and b, in round-to-nearest-even
mode.

__device__ __nv_bfloat162 __hmul2_rn (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector multiplication in round-to-nearest-even mode.

Description
Performs nv_bfloat162 vector multiplication of inputs a and b, in round-to-nearest-even
mode. Prevents floating-point contractions of mul+add or sub into fma.

__device__ __nv_bfloat162 __hmul2_sat (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector multiplication in round-to-nearest-even mode, with
saturation to [0.0, 1.0].

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The result of elementwise multiplication of vectors a and b, with respect to saturation.

Description
Performs nv_bfloat162 vector multiplication of inputs a and b, in round-to-nearest-even
mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +0.0.

CUDA Math API vRelease Version | 102

Modules

__device__ __nv_bfloat162 __hneg2 (const __nv_bfloat162

a)
Negates both halves of the input nv_bfloat162 number and returns the result.

Description
Negates both halves of the input nv_bfloat162 number a and returns the result.

__device__ __nv_bfloat162 __hsub2 (const __nv_bfloat162

a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector subtraction in round-to-nearest-even mode.

Description
Subtracts nv_bfloat162 input vector b from input vector a in round-to-nearest-even mode.

__device__ __nv_bfloat162 __hsub2_rn (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector subtraction in round-to-nearest-even mode.

Description
Subtracts nv_bfloat162 input vector b from input vector a in round-to-nearest-even mode.
Prevents floating-point contractions of mul+sub into fma.

__device__ __nv_bfloat162 __hsub2_sat (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector subtraction in round-to-nearest-even mode, with saturation
to [0.0, 1.0].

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The subtraction of vector b from a, with respect to saturation.

CUDA Math API vRelease Version | 103

Modules

Description
Subtracts nv_bfloat162 input vector b from input vector a in round-to-nearest-even mode,
and clamps the results to range [0.0, 1.0]. NaN results are flushed to +0.0.

__device__ __nv_bfloat162 atomicAdd (const __nv_bfloat162

*address, const __nv_bfloat162 val)
Vector add val to the value stored at address in global or shared memory, and writes this
value back to address. The atomicity of the add operation is guaranteed separately for each
of the two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a
single 32-bit access.

Parameters
address
- __nv_bfloat162*. An address in global or shared memory.
val
- __nv_bfloat162. The value to be added.

Returns
__nv_bfloat162

‣ The old value read from address.

Note:

For more details for this function see the Atomic Functions section in the CUDA C++
Programming Guide.

1.2.3. Bfloat16 Comparison Functions

Bfloat16 Precision Intrinsics
To use these functions, include the header file cuda_bf16.h in your program.

CUDA Math API vRelease Version | 104

Modules

__device__ bool __heq (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 if-equal comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of if-equal comparison of a and b.

Description
Performs nv_bfloat16 if-equal comparison of inputs a and b. NaN inputs generate false
results.

__device__ bool __hequ (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 unordered if-equal comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of unordered if-equal comparison of a and b.

Description
Performs nv_bfloat16 if-equal comparison of inputs a and b. NaN inputs generate true
results.

CUDA Math API vRelease Version | 105

Modules

__device__ bool __hge (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 greater-equal comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of greater-equal comparison of a and b.

Description
Performs nv_bfloat16 greater-equal comparison of inputs a and b. NaN inputs generate
false results.

__device__ bool __hgeu (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 unordered greater-equal comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of unordered greater-equal comparison of a and b.

Description
Performs nv_bfloat16 greater-equal comparison of inputs a and b. NaN inputs generate
true results.

CUDA Math API vRelease Version | 106

Modules

__device__ bool __hgt (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 greater-than comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of greater-than comparison of a and b.

Description
Performs nv_bfloat16 greater-than comparison of inputs a and b. NaN inputs generate
false results.

__device__ bool __hgtu (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 unordered greater-than comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of unordered greater-than comparison of a and b.

Description
Performs nv_bfloat16 greater-than comparison of inputs a and b. NaN inputs generate
true results.

CUDA Math API vRelease Version | 107

Modules

__device__ int __hisinf (const __nv_bfloat16 a)

Checks if the input nv_bfloat16 number is infinite.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
int

‣ -1 iff a is equal to negative infinity,

‣ 1 iff a is equal to positive infinity,
‣ 0 otherwise.

Description
Checks if the input nv_bfloat16 number a is infinite.

__device__ bool __hisnan (const __nv_bfloat16 a)

Determine whether nv_bfloat16 argument is a NaN.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
bool

‣ true iff argument is NaN.

Description
Determine whether nv_bfloat16 value a is a NaN.

__device__ bool __hle (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 less-equal comparison.

Parameters
a
- nv_bfloat16. Is only being read.

CUDA Math API vRelease Version | 108

Modules

b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of less-equal comparison of a and b.

Description
Performs nv_bfloat16 less-equal comparison of inputs a and b. NaN inputs generate false
results.

__device__ bool __hleu (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 unordered less-equal comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of unordered less-equal comparison of a and b.

Description
Performs nv_bfloat16 less-equal comparison of inputs a and b. NaN inputs generate true
results.

__device__ bool __hlt (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 less-than comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

CUDA Math API vRelease Version | 109

Modules

Returns
bool

‣ The boolean result of less-than comparison of a and b.

Description
Performs nv_bfloat16 less-than comparison of inputs a and b. NaN inputs generate false
results.

__device__ bool __hltu (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 unordered less-than comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of unordered less-than comparison of a and b.

Description
Performs nv_bfloat16 less-than comparison of inputs a and b. NaN inputs generate true
results.

__device__ __nv_bfloat16 __hmax (const __nv_bfloat16 a,

const __nv_bfloat16 b)
Calculates nv_bfloat16 maximum of two input values.

Description
Calculates nv_bfloat16 max(a, b) defined as (a > b) ? a : b.

‣ If either of inputs is NaN, the other input is returned.

‣ If both inputs are NaNs, then canonical NaN is returned.
‣ If values of both inputs are 0.0, then +0.0 > -0.0

CUDA Math API vRelease Version | 110

Modules

__device__ __nv_bfloat16 __hmax_nan (const __nv_bfloat16

a, const __nv_bfloat16 b)
Calculates nv_bfloat16 maximum of two input values, NaNs pass through.

Description
Calculates nv_bfloat16 max(a, b) defined as (a > b) ? a : b.

‣ If either of inputs is NaN, then canonical NaN is returned.

‣ If values of both inputs are 0.0, then +0.0 > -0.0

__device__ __nv_bfloat16 __hmin (const __nv_bfloat16 a,

const __nv_bfloat16 b)
Calculates nv_bfloat16 minimum of two input values.

Description
Calculates nv_bfloat16 min(a, b) defined as (a < b) ? a : b.

‣ If either of inputs is NaN, the other input is returned.

‣ If both inputs are NaNs, then canonical NaN is returned.
‣ If values of both inputs are 0.0, then +0.0 > -0.0

__device__ __nv_bfloat16 __hmin_nan (const __nv_bfloat16

a, const __nv_bfloat16 b)
Calculates nv_bfloat16 minimum of two input values, NaNs pass through.

Description
Calculates nv_bfloat16 min(a, b) defined as (a < b) ? a : b.

‣ If either of inputs is NaN, then canonical NaN is returned.

‣ If values of both inputs are 0.0, then +0.0 > -0.0

__device__ bool __hne (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 not-equal comparison.

Parameters
a
- nv_bfloat16. Is only being read.

CUDA Math API vRelease Version | 111

Modules

b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of not-equal comparison of a and b.

Description
Performs nv_bfloat16 not-equal comparison of inputs a and b. NaN inputs generate false
results.

__device__ bool __hneu (const __nv_bfloat16 a, const

__nv_bfloat16 b)
Performs nv_bfloat16 unordered not-equal comparison.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
bool

‣ The boolean result of unordered not-equal comparison of a and b.

Description
Performs nv_bfloat16 not-equal comparison of inputs a and b. NaN inputs generate true
results.

1.2.4. Bfloat162 Comparison Functions

Bfloat16 Precision Intrinsics
To use these functions, include the header file cuda_bf16.h in your program.

CUDA Math API vRelease Version | 112

Modules

__device__ bool __hbeq2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector if-equal comparison and returns boolean true iff both
nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of if-equal comparison of vectors a and b are true;
‣ false otherwise.

Description
Performs nv_bfloat162 vector if-equal comparison of inputs a and b. The bool result is set
to true only if both nv_bfloat16 if-equal comparisons evaluate to true, or false otherwise.
NaN inputs generate false results.

__device__ bool __hbequ2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector unordered if-equal comparison and returns boolean true iff
both nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of unordered if-equal comparison of vectors a and b are
true;

‣ false otherwise.

CUDA Math API vRelease Version | 113

Modules

__device__ bool __hbge2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector greater-equal comparison and returns boolean true iff both
nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of greater-equal comparison of vectors a and b are true;

‣ false otherwise.

Description
Performs nv_bfloat162 vector greater-equal comparison of inputs a and b. The bool result
is set to true only if both nv_bfloat16 greater-equal comparisons evaluate to true, or false
otherwise. NaN inputs generate false results.

__device__ bool __hbgeu2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector unordered greater-equal comparison and returns boolean
true iff both nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

CUDA Math API vRelease Version | 114

Modules

Returns
bool

‣ true if both nv_bfloat16 results of unordered greater-equal comparison of vectors a and b are true;

‣ false otherwise.

__device__ bool __hbgt2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector greater-than comparison and returns boolean true iff both
nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of greater-than comparison of vectors a and b are true;
‣ false otherwise.

Description
Performs nv_bfloat162 vector greater-than comparison of inputs a and b. The bool result
is set to true only if both nv_bfloat16 greater-than comparisons evaluate to true, or false
otherwise. NaN inputs generate false results.

CUDA Math API vRelease Version | 115

Modules

__device__ bool __hbgtu2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector unordered greater-than comparison and returns boolean
true iff both nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of unordered greater-than comparison of vectors a and b are true;

‣ false otherwise.

__device__ bool __hble2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector less-equal comparison and returns boolean true iff both
nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of less-equal comparison of vectors a and b are true;
‣ false otherwise.

CUDA Math API vRelease Version | 116

Modules

Description
Performs nv_bfloat162 vector less-equal comparison of inputs a and b. The bool result
is set to true only if both nv_bfloat16 less-equal comparisons evaluate to true, or false
otherwise. NaN inputs generate false results.

__device__ bool __hbleu2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector unordered less-equal comparison and returns boolean true
iff both nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of unordered less-equal comparison of vectors a and b are true;

‣ false otherwise.

__device__ bool __hblt2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector less-than comparison and returns boolean true iff both
nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

CUDA Math API vRelease Version | 117

Modules

Returns
bool

‣ true if both nv_bfloat16 results of less-than comparison of vectors a and b are true;
‣ false otherwise.

Description
Performs nv_bfloat162 vector less-than comparison of inputs a and b. The bool result
is set to true only if both nv_bfloat16 less-than comparisons evaluate to true, or false
otherwise. NaN inputs generate false results.

__device__ bool __hbltu2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector unordered less-than comparison and returns boolean true
iff both nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of unordered less-than comparison of vectors a and b are true;

‣ false otherwise.

CUDA Math API vRelease Version | 118

Modules

__device__ bool __hbne2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector not-equal comparison and returns boolean true iff both
nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of not-equal comparison of vectors a and b are true,
‣ false otherwise.

Description
Performs nv_bfloat162 vector not-equal comparison of inputs a and b. The bool result
is set to true only if both nv_bfloat16 not-equal comparisons evaluate to true, or false
otherwise. NaN inputs generate false results.

__device__ bool __hbneu2 (const __nv_bfloat162 a, const

__nv_bfloat162 b)
Performs nv_bfloat162 vector unordered not-equal comparison and returns boolean true
iff both nv_bfloat16 results are true, boolean false otherwise.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
bool

‣ true if both nv_bfloat16 results of unordered not-equal comparison of vectors a and b are true;

‣ false otherwise.

CUDA Math API vRelease Version | 119

Modules

__device__ __nv_bfloat162 __heq2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector if-equal comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector result of if-equal comparison of vectors a and b.

Description
Performs nv_bfloat162 vector if-equal comparison of inputs a and b. The corresponding
nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false
results.

__device__ __nv_bfloat162 __hequ2 (const __nv_bfloat162

a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector unordered if-equal comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector result of unordered if-equal comparison of vectors a and b.

CUDA Math API vRelease Version | 120

Modules

Description
Performs nv_bfloat162 vector if-equal comparison of inputs a and b. The corresponding
nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true
results.

__device__ __nv_bfloat162 __hge2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector greater-equal comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector result of greater-equal comparison of vectors a and b.

Description
Performs nv_bfloat162 vector greater-equal comparison of inputs a and b. The
corresponding nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs
generate false results.

__device__ __nv_bfloat162 __hgeu2 (const __nv_bfloat162

a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector unordered greater-equal comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 vector result of unordered greater-equal comparison of vectors a and b.

CUDA Math API vRelease Version | 121

Modules

Description
Performs nv_bfloat162 vector greater-equal comparison of inputs a and b. The
corresponding nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs
generate true results.

__device__ __nv_bfloat162 __hgt2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector greater-than comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector result of greater-than comparison of vectors a and b.

Description
Performs nv_bfloat162 vector greater-than comparison of inputs a and b. The
corresponding nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs
generate false results.

__device__ __nv_bfloat162 __hgtu2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector unordered greater-than comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 vector result of unordered greater-than comparison of vectors a and b.

CUDA Math API vRelease Version | 122

Modules

Description
Performs nv_bfloat162 vector greater-than comparison of inputs a and b. The
corresponding nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs
generate true results.

__device__ __nv_bfloat162 __hisnan2 (const __nv_bfloat162

a)
Determine whether nv_bfloat162 argument is a NaN.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 with the corresponding nv_bfloat16 results set to 1.0 for NaN, 0.0
otherwise.

Description
Determine whether each nv_bfloat16 of input nv_bfloat162 number a is a NaN.

__device__ __nv_bfloat162 __hle2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector less-equal comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 result of less-equal comparison of vectors a and b.

CUDA Math API vRelease Version | 123

Modules

Description
Performs nv_bfloat162 vector less-equal comparison of inputs a and b. The corresponding
nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false
results.

__device__ __nv_bfloat162 __hleu2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector unordered less-equal comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector result of unordered less-equal comparison of vectors a and b.

Description
Performs nv_bfloat162 vector less-equal comparison of inputs a and b. The corresponding
nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true
results.

__device__ __nv_bfloat162 __hlt2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector less-than comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 vector result of less-than comparison of vectors a and b.

CUDA Math API vRelease Version | 124

Modules

Description
Performs nv_bfloat162 vector less-than comparison of inputs a and b. The corresponding
nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false
results.

__device__ __nv_bfloat162 __hltu2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector unordered less-than comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector result of unordered less-than comparison of vectors a and b.

Description
Performs nv_bfloat162 vector less-than comparison of inputs a and b. The corresponding
nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true
results.

__device__ __nv_bfloat162 __hmax2 (const __nv_bfloat162

a, const __nv_bfloat162 b)
Calculates nv_bfloat162 vector maximum of two inputs.

Description
Calculates nv_bfloat162 vector max(a, b). Elementwise nv_bfloat16 operation is defined
as (a > b) ? a : b.

‣ If either of inputs is NaN, the other input is returned.

‣ If both inputs are NaNs, then canonical NaN is returned.
‣ If values of both inputs are 0.0, then +0.0 > -0.0
‣ The result of elementwise maximum of vectors a and b

CUDA Math API vRelease Version | 125

Modules

__device__ __nv_bfloat162 __hmax2_nan (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Calculates nv_bfloat162 vector maximum of two inputs, NaNs pass through.

Description
Calculates nv_bfloat162 vector max(a, b). Elementwise nv_bfloat16 operation is defined
as (a > b) ? a : b.

‣ If either of inputs is NaN, then canonical NaN is returned.

‣ If values of both inputs are 0.0, then +0.0 > -0.0
‣ The result of elementwise maximum of vectors a and b, with NaNs pass through

__device__ __nv_bfloat162 __hmin2 (const __nv_bfloat162

a, const __nv_bfloat162 b)
Calculates nv_bfloat162 vector minimum of two inputs.

Description
Calculates nv_bfloat162 vector min(a, b). Elementwise nv_bfloat16 operation is defined
as (a < b) ? a : b.

‣ If either of inputs is NaN, the other input is returned.

‣ If both inputs are NaNs, then canonical NaN is returned.
‣ If values of both inputs are 0.0, then +0.0 > -0.0
‣ The result of elementwise minimum of vectors a and b

__device__ __nv_bfloat162 __hmin2_nan (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Calculates nv_bfloat162 vector minimum of two inputs, NaNs pass through.

Description
Calculates nv_bfloat162 vector min(a, b). Elementwise nv_bfloat16 operation is defined
as (a < b) ? a : b.

‣ If either of inputs is NaN, then canonical NaN is returned.

‣ If values of both inputs are 0.0, then +0.0 > -0.0
‣ The result of elementwise minimum of vectors a and b, with NaNs pass through

CUDA Math API vRelease Version | 126

Modules

__device__ __nv_bfloat162 __hne2 (const __nv_bfloat162 a,

const __nv_bfloat162 b)
Performs nv_bfloat162 vector not-equal comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector result of not-equal comparison of vectors a and b.

Description
Performs nv_bfloat162 vector not-equal comparison of inputs a and b. The corresponding
nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs generate false
results.

__device__ __nv_bfloat162 __hneu2 (const __nv_bfloat162

a, const __nv_bfloat162 b)
Performs nv_bfloat162 vector unordered not-equal comparison.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector result of unordered not-equal comparison of vectors a and b.

Description
Performs nv_bfloat162 vector unordered not-equal comparison of inputs a and b. The corresponding
nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. NaN inputs generate true
results.

CUDA Math API vRelease Version | 127

Modules

1.2.5. Bfloat16 Precision Conversion and Data

Movement
Bfloat16 Precision Intrinsics
To use these functions, include the header file cuda_bf16.h in your program.

hostdevice float2 __bfloat1622float2 (const

__nv_bfloat162 a)
Converts both halves of nv_bfloat162 to float2 and returns the result.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
float2

‣ a converted to float2.

Description
Converts both halves of nv_bfloat162 input a to float2 and returns the result.

device __nv_bfloat162 __bfloat162bfloat162 (const

__nv_bfloat16 a)
Returns nv_bfloat162 with both halves equal to the input value.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat162

‣ The vector which has both its halves equal to the input a.

Description
Returns nv_bfloat162 number with both halves equal to the input a nv_bfloat16
number.

CUDA Math API vRelease Version | 128

Modules

hostdevice float __bfloat162float (const

__nv_bfloat16 a)
Converts nv_bfloat16 number to float.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
float

‣ a converted to float.

Description
Converts nv_bfloat16 number a to float.

device int __bfloat162int_rd (const __nv_bfloat16 h)

Convert a nv_bfloat16 to a signed integer in round-down mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
int

‣ h converted to a signed integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed integer in round-down mode. NaN
inputs are converted to 0.

device int __bfloat162int_rn (const __nv_bfloat16 h)

Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode.

Parameters
h
- nv_bfloat16. Is only being read.

CUDA Math API vRelease Version | 129

Modules

Returns
int

‣ h converted to a signed integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed integer in round-to-nearest-even
mode. NaN inputs are converted to 0.

device int __bfloat162int_ru (const __nv_bfloat16 h)

Convert a nv_bfloat16 to a signed integer in round-up mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
int

‣ h converted to a signed integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed integer in round-up mode. NaN
inputs are converted to 0.

hostdevice int __bfloat162int_rz (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed integer in round-towards-zero mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
int

‣ h converted to a signed integer.

CUDA Math API vRelease Version | 130

Modules

Description
Convert the nv_bfloat16 floating-point value h to a signed integer in round-towards-zero
mode. NaN inputs are converted to 0.

device long long int __bfloat162ll_rd (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
long long int

‣ h converted to a signed 64-bit integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed 64-bit integer in round-down mode.
NaN inputs return a long long int with hex value of 0x8000000000000000.

device long long int __bfloat162ll_rn (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
long long int

‣ h converted to a signed 64-bit integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed 64-bit integer in round-to-nearest-
even mode. NaN inputs return a long long int with hex value of 0x8000000000000000.

CUDA Math API vRelease Version | 131

Modules

device long long int __bfloat162ll_ru (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
long long int

‣ h converted to a signed 64-bit integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed 64-bit integer in round-up mode.
NaN inputs return a long long int with hex value of 0x8000000000000000.

hostdevice long long int __bfloat162ll_rz (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
long long int

‣ h converted to a signed 64-bit integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed 64-bit integer in round-towards-zero
mode. NaN inputs return a long long int with hex value of 0x8000000000000000.

CUDA Math API vRelease Version | 132

Modules

device short int __bfloat162short_rd (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed short integer in round-down mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
short int

‣ h converted to a signed short integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed short integer in round-down mode.
NaN inputs are converted to 0.

device short int __bfloat162short_rn (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
short int

‣ h converted to a signed short integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed short integer in round-to-nearest-
even mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 133

Modules

device short int __bfloat162short_ru (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed short integer in round-up mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
short int

‣ h converted to a signed short integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed short integer in round-up mode.
NaN inputs are converted to 0.

hostdevice short int __bfloat162short_rz (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
short int

‣ h converted to a signed short integer.

Description
Convert the nv_bfloat16 floating-point value h to a signed short integer in round-towards-zero
mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 134

Modules

device unsigned int __bfloat162uint_rd (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned integer in round-down mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned int

‣ h converted to an unsigned integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned integer in round-down mode.
NaN inputs are converted to 0.

device unsigned int __bfloat162uint_rn (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned int

‣ h converted to an unsigned integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned integer in round-to-nearest-even
mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 135

Modules

device unsigned int __bfloat162uint_ru (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned integer in round-up mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned int

‣ h converted to an unsigned integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned integer in round-up mode. NaN
inputs are converted to 0.

hostdevice unsigned int __bfloat162uint_rz (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned int

‣ h converted to an unsigned integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned integer in round-towards-zero
mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 136

Modules

device unsigned long long int __bfloat162ull_rd (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned long long int

‣ h converted to an unsigned 64-bit integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned 64-bit integer in round-down
mode. NaN inputs return 0x8000000000000000.

device unsigned long long int __bfloat162ull_rn (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned long long int

‣ h converted to an unsigned 64-bit integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned 64-bit integer in round-to-
nearest-even mode. NaN inputs return 0x8000000000000000.

CUDA Math API vRelease Version | 137

Modules

device unsigned long long int __bfloat162ull_ru (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned long long int

‣ h converted to an unsigned 64-bit integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned 64-bit integer in round-up mode.
NaN inputs return 0x8000000000000000.

hostdevice unsigned long long int

__bfloat162ull_rz (const __nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned long long int

‣ h converted to an unsigned 64-bit integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned 64-bit integer in round-towards-
zero mode. NaN inputs return 0x8000000000000000.

CUDA Math API vRelease Version | 138

Modules

device unsigned short int __bfloat162ushort_rd (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned short integer in round-down mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned short int

‣ h converted to an unsigned short integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned short integer in round-down
mode. NaN inputs are converted to 0.

device unsigned short int __bfloat162ushort_rn (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned short int

‣ h converted to an unsigned short integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned short integer in round-to-
nearest-even mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 139

Modules

device unsigned short int __bfloat162ushort_ru (const

__nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned short integer in round-up mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned short int

‣ h converted to an unsigned short integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned short integer in round-up mode.
NaN inputs are converted to 0.

hostdevice unsigned short int

__bfloat162ushort_rz (const __nv_bfloat16 h)
Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero mode.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned short int

‣ h converted to an unsigned short integer.

Description
Convert the nv_bfloat16 floating-point value h to an unsigned short integer in round-towards-
zero mode. NaN inputs are converted to 0.

CUDA Math API vRelease Version | 140

Modules

device short int __bfloat16_as_short (const

__nv_bfloat16 h)
Reinterprets bits in a nv_bfloat16 as a signed short integer.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
short int

‣ The reinterpreted value.

Description
Reinterprets the bits in the nv_bfloat16 floating-point number h as a signed short integer.

device unsigned short int __bfloat16_as_ushort (const

__nv_bfloat16 h)
Reinterprets bits in a nv_bfloat16 as an unsigned short integer.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
unsigned short int

‣ The reinterpreted value.

Description
Reinterprets the bits in the nv_bfloat16 floating-point number h as an unsigned short integer.

CUDA Math API vRelease Version | 141

Modules

hostdevice __nv_bfloat16 __double2bfloat16 (const

double a)
Converts double number to nv_bfloat16 precision in round-to-nearest-even mode and returns
nv_bfloat16 with converted value.

Parameters
a
- double. Is only being read.

Returns
nv_bfloat16

‣ a converted to nv_bfloat16.

Description
Converts double number a to nv_bfloat16 precision in round-to-nearest-even mode.

hostdevice __nv_bfloat162 __float22bfloat162_rn

(const float2 a)
Converts both components of float2 number to nv_bfloat16 precision in round-to-nearest-even
mode and returns nv_bfloat162 with converted values.

Parameters
a
- float2. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 which has corresponding halves equal to the converted float2
components.

Description
Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even mode and
combines the results into one nv_bfloat162 number. Low 16 bits of the return value
correspond to a.x and high 16 bits of the return value correspond to a.y.

CUDA Math API vRelease Version | 142

Modules

hostdevice __nv_bfloat16 __float2bfloat16 (const

float a)
Converts float number to nv_bfloat16 precision in round-to-nearest-even mode and returns
nv_bfloat16 with converted value.

Parameters
a
- float. Is only being read.

Returns
nv_bfloat16

‣ a converted to nv_bfloat16.

Description
Converts float number a to nv_bfloat16 precision in round-to-nearest-even mode.

hostdevice __nv_bfloat162 __float2bfloat162_rn

(const float a)
Converts input to nv_bfloat16 precision in round-to-nearest-even mode and populates both
halves of nv_bfloat162 with converted value.

Parameters
a
- float. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 value with both halves equal to the converted nv_bfloat16 precision
number.

Description
Converts input a to nv_bfloat16 precision in round-to-nearest-even mode and populates both
halves of nv_bfloat162 with converted value.

CUDA Math API vRelease Version | 143

Modules

hostdevice __nv_bfloat16 __float2bfloat16_rd

(const float a)
Converts float number to nv_bfloat16 precision in round-down mode and returns
nv_bfloat16 with converted value.

Parameters
a
- float. Is only being read.

Returns
nv_bfloat16

‣ a converted to nv_bfloat16.

Description
Converts float number a to nv_bfloat16 precision in round-down mode.

hostdevice __nv_bfloat16 __float2bfloat16_rn

(const float a)
Converts float number to nv_bfloat16 precision in round-to-nearest-even mode and returns
nv_bfloat16 with converted value.

Parameters
a
- float. Is only being read.

Returns
nv_bfloat16

‣ a converted to nv_bfloat16.

Description
Converts float number a to nv_bfloat16 precision in round-to-nearest-even mode.

CUDA Math API vRelease Version | 144

Modules

hostdevice __nv_bfloat16 __float2bfloat16_ru

(const float a)
Converts float number to nv_bfloat16 precision in round-up mode and returns nv_bfloat16
with converted value.

Parameters
a
- float. Is only being read.

Returns
nv_bfloat16

‣ a converted to nv_bfloat16.

Description
Converts float number a to nv_bfloat16 precision in round-up mode.

hostdevice __nv_bfloat16 __float2bfloat16_rz

(const float a)
Converts float number to nv_bfloat16 precision in round-towards-zero mode and returns
nv_bfloat16 with converted value.

Parameters
a
- float. Is only being read.

Returns
nv_bfloat16

‣ a converted to nv_bfloat16.

Description
Converts float number a to nv_bfloat16 precision in round-towards-zero mode.

CUDA Math API vRelease Version | 145

Modules

hostdevice __nv_bfloat162 __floats2bfloat162_rn

(const float a, const float b)
Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode and returns
nv_bfloat162 with converted values.

Parameters
a
- float. Is only being read.
b
- float. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 value with corresponding halves equal to the converted input floats.

Description
Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode and
combines the results into one nv_bfloat162 number. Low 16 bits of the return value
correspond to the input a, high 16 bits correspond to the input b.

device __nv_bfloat162 __halves2bfloat162 (const

__nv_bfloat16 a, const __nv_bfloat16 b)
Combines two nv_bfloat16 numbers into one nv_bfloat162 number.

Parameters
a
- nv_bfloat16. Is only being read.
b
- nv_bfloat16. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 with one nv_bfloat16 equal to a and the other to b.

Description
Combines two input nv_bfloat16 numbers a and b into one nv_bfloat162 number. Input a
is stored in low 16 bits of the return value, input b is stored in high 16 bits of the return value.

CUDA Math API vRelease Version | 146

Modules

device __nv_bfloat16 __high2bfloat16 (const

__nv_bfloat162 a)
Returns high 16 bits of nv_bfloat162 input.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat16

‣ The high 16 bits of the input.

Description
Returns high 16 bits of nv_bfloat162 input a.

device __nv_bfloat162 __high2bfloat162 (const

__nv_bfloat162 a)
Extracts high 16 bits from nv_bfloat162 input.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 with both halves equal to the high 16 bits of the input.

Description
Extracts high 16 bits from nv_bfloat162 input a and returns a new nv_bfloat162 number
which has both halves equal to the extracted bits.

CUDA Math API vRelease Version | 147

Modules

hostdevice float __high2float (const

__nv_bfloat162 a)
Converts high 16 bits of nv_bfloat162 to float and returns the result.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
float

‣ The high 16 bits of a converted to float.

Description
Converts high 16 bits of nv_bfloat162 input a to 32-bit floating-point number and returns
the result.

device __nv_bfloat162 __highs2bfloat162 (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Extracts high 16 bits from each of the two nv_bfloat162 inputs and combines into one
nv_bfloat162 number.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The high 16 bits of a and of b.

Description
Extracts high 16 bits from each of the two nv_bfloat162 inputs and combines into one
nv_bfloat162 number. High 16 bits from input a are stored in low 16 bits of the return value,
high 16 bits from input b are stored in high 16 bits of the return value.

CUDA Math API vRelease Version | 148

Modules

device __nv_bfloat16 __int2bfloat16_rd (const int i)

Convert a signed integer to a nv_bfloat16 in round-down mode.

Parameters
i
- int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed integer value i to a nv_bfloat16 floating-point value in round-down mode.

hostdevice __nv_bfloat16 __int2bfloat16_rn (const

int i)
Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode.

Parameters
i
- int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed integer value i to a nv_bfloat16 floating-point value in round-to-nearest-
even mode.

device __nv_bfloat16 __int2bfloat16_ru (const int i)

Convert a signed integer to a nv_bfloat16 in round-up mode.

Parameters
i
- int. Is only being read.

CUDA Math API vRelease Version | 149

Modules

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed integer value i to a nv_bfloat16 floating-point value in round-up mode.

device __nv_bfloat16 __int2bfloat16_rz (const int i)

Convert a signed integer to a nv_bfloat16 in round-towards-zero mode.

Parameters
i
- int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed integer value i to a nv_bfloat16 floating-point value in round-towards-zero
mode.

__device__ __nv_bfloat16 __ldca (const __nv_bfloat16 *ptr)

Generates a `ld.global.ca` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

CUDA Math API vRelease Version | 150

Modules

__device__ __nv_bfloat162 __ldca (const __nv_bfloat162

*ptr)
Generates a `ld.global.ca` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat16 __ldcg (const __nv_bfloat16 *ptr)

Generates a `ld.global.cg` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat162 __ldcg (const __nv_bfloat162

*ptr)
Generates a `ld.global.cg` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat16 __ldcs (const __nv_bfloat16 *ptr)

Generates a `ld.global.cs` load instruction.

Parameters
ptr
- memory location

CUDA Math API vRelease Version | 151

Modules

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat162 __ldcs (const __nv_bfloat162

*ptr)
Generates a `ld.global.cs` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat16 __ldcv (const __nv_bfloat16 *ptr)

Generates a `ld.global.cv` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat162 __ldcv (const __nv_bfloat162

*ptr)
Generates a `ld.global.cv` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

CUDA Math API vRelease Version | 152

Modules

__device__ __nv_bfloat16 __ldg (const __nv_bfloat16 *ptr)

Generates a `ld.global.nc` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat162 __ldg (const __nv_bfloat162 *ptr)

Generates a `ld.global.nc` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat16 __ldlu (const __nv_bfloat16 *ptr)

Generates a `ld.global.lu` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

__device__ __nv_bfloat162 __ldlu (const __nv_bfloat162

*ptr)
Generates a `ld.global.lu` load instruction.

Parameters
ptr
- memory location

Returns
The value pointed to by `ptr`

CUDA Math API vRelease Version | 153

Modules

device __nv_bfloat16 __ll2bfloat16_rd (const long long

int i)
Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode.

Parameters
i
- long long int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed 64-bit integer value i to a nv_bfloat16 floating-point value in round-down
mode.

hostdevice __nv_bfloat16 __ll2bfloat16_rn (const

long long int i)
Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even mode.

Parameters
i
- long long int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed 64-bit integer value i to a nv_bfloat16 floating-point value in round-to-
nearest-even mode.

CUDA Math API vRelease Version | 154

Modules

device __nv_bfloat16 __ll2bfloat16_ru (const long long

int i)
Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode.

Parameters
i
- long long int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed 64-bit integer value i to a nv_bfloat16 floating-point value in round-up
mode.

device __nv_bfloat16 __ll2bfloat16_rz (const long long

int i)
Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode.

Parameters
i
- long long int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed 64-bit integer value i to a nv_bfloat16 floating-point value in round-
towards-zero mode.

CUDA Math API vRelease Version | 155

Modules

device __nv_bfloat16 __low2bfloat16 (const

__nv_bfloat162 a)
Returns low 16 bits of nv_bfloat162 input.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat16

‣ Returns nv_bfloat16 which contains low 16 bits of the input a.

Description
Returns low 16 bits of nv_bfloat162 input a.

device __nv_bfloat162 __low2bfloat162 (const

__nv_bfloat162 a)
Extracts low 16 bits from nv_bfloat162 input.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The nv_bfloat162 with both halves equal to the low 16 bits of the input.

Description
Extracts low 16 bits from nv_bfloat162 input a and returns a new nv_bfloat162 number
which has both halves equal to the extracted bits.

CUDA Math API vRelease Version | 156

Modules

__host__ __device__ float __low2float (const __nv_bfloat162

a)
Converts low 16 bits of nv_bfloat162 to float and returns the result.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
float

‣ The low 16 bits of a converted to float.

Description
Converts low 16 bits of nv_bfloat162 input a to 32-bit floating-point number and returns
the result.

device __nv_bfloat162 __lowhigh2highlow (const

__nv_bfloat162 a)
Swaps both halves of the nv_bfloat162 input.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ a with its halves being swapped.

Description
Swaps both halves of the nv_bfloat162 input and returns a new nv_bfloat162 number
with swapped halves.

CUDA Math API vRelease Version | 157

Modules

device __nv_bfloat162 __lows2bfloat162 (const

__nv_bfloat162 a, const __nv_bfloat162 b)
Extracts low 16 bits from each of the two nv_bfloat162 inputs and combines into one
nv_bfloat162 number.

Parameters
a
- nv_bfloat162. Is only being read.
b
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The low 16 bits of a and of b.

Description
Extracts low 16 bits from each of the two nv_bfloat162 inputs and combines into one
nv_bfloat162 number. Low 16 bits from input a are stored in low 16 bits of the return value,
low 16 bits from input b are stored in high 16 bits of the return value.

device __nv_bfloat16 __shfl_down_sync (const

unsigned mask, const __nv_bfloat16 var, const unsigned int
delta, const int width)
Exchange a variable between threads within a warp. Copy from a thread with higher ID relative
to the caller.

Parameters
mask
- unsigned int. Is only being read.
var
- nv_bfloat16. Is only being read.
delta
- unsigned int. Is only being read.
width
- int. Is only being read.

CUDA Math API vRelease Version | 158

Modules

Returns
Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. If the
source thread ID is out of range or the source thread has exited, the calling thread's own var is
returned.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

device __nv_bfloat162 __shfl_down_sync (const

unsigned mask, const __nv_bfloat162 var, const unsigned
int delta, const int width)
Exchange a variable between threads within a warp. Copy from a thread with higher ID relative
to the caller.

Parameters
mask
- unsigned int. Is only being read.
var
- nv_bfloat162. Is only being read.
delta
- unsigned int. Is only being read.
width
- int. Is only being read.

Returns
Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. If the
source thread ID is out of range or the source thread has exited, the calling thread's own var is
returned.

CUDA Math API vRelease Version | 159

Modules

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

device __nv_bfloat16 __shfl_sync (const unsigned

mask, const __nv_bfloat16 var, const int delta, const int
width)
Exchange a variable between threads within a warp. Direct copy from indexed thread.

Parameters
mask
- unsigned int. Is only being read.
var
- nv_bfloat16. Is only being read.
delta
- int. Is only being read.
width
- int. Is only being read.

CUDA Math API vRelease Version | 160

Modules

width must have a value which is a power of 2; results are undefined if width is not a power of 2,
or is a number greater than warpSize.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

device __nv_bfloat162 __shfl_sync (const unsigned

mask, const __nv_bfloat162 var, const int delta, const int
width)
Exchange a variable between threads within a warp. Direct copy from indexed thread.

Parameters
mask
- unsigned int. Is only being read.
var
- nv_bfloat162. Is only being read.
delta
- int. Is only being read.
width
- int. Is only being read.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

CUDA Math API vRelease Version | 161

Modules

device __nv_bfloat16 __shfl_up_sync (const unsigned

mask, const __nv_bfloat16 var, const unsigned int delta,
const int width)
Exchange a variable between threads within a warp. Copy from a thread with lower ID relative
to the caller.

Parameters
mask
- unsigned int. Is only being read.
var
- nv_bfloat16. Is only being read.
delta
- unsigned int. Is only being read.
width
- int. Is only being read.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

CUDA Math API vRelease Version | 162

Modules

device __nv_bfloat162 __shfl_up_sync (const unsigned

mask, const __nv_bfloat162 var, const unsigned int delta,
const int width)
Exchange a variable between threads within a warp. Copy from a thread with lower ID relative
to the caller.

Parameters
mask
- unsigned int. Is only being read.
var
- nv_bfloat162. Is only being read.
delta
- unsigned int. Is only being read.
width
- int. Is only being read.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

CUDA Math API vRelease Version | 163

Modules

device __nv_bfloat16 __shfl_xor_sync (const unsigned

mask, const __nv_bfloat16 var, const int delta, const int
width)
Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR
of own thread ID.

Parameters
mask
- unsigned int. Is only being read.
var
- nv_bfloat16. Is only being read.
delta
- int. Is only being read.
width
- int. Is only being read.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

CUDA Math API vRelease Version | 164

Modules

device __nv_bfloat162 __shfl_xor_sync (const unsigned

mask, const __nv_bfloat162 var, const int delta, const int
width)
Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR
of own thread ID.

Parameters
mask
- unsigned int. Is only being read.
var
- nv_bfloat162. Is only being read.
delta
- int. Is only being read.
width
- int. Is only being read.

Note:

For more details for this function see the Warp Shuffle Functions section in the CUDA C++
Programming Guide.

CUDA Math API vRelease Version | 165

Modules

device __nv_bfloat16 __short2bfloat16_rd (const short

int i)
Convert a signed short integer to a nv_bfloat16 in round-down mode.

Parameters
i
- short int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed short integer value i to a nv_bfloat16 floating-point value in round-down
mode.

hostdevice __nv_bfloat16 __short2bfloat16_rn

(const short int i)
Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even mode.

Parameters
i
- short int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed short integer value i to a nv_bfloat16 floating-point value in round-to-
nearest-even mode.

CUDA Math API vRelease Version | 166

Modules

device __nv_bfloat16 __short2bfloat16_ru (const short

int i)
Convert a signed short integer to a nv_bfloat16 in round-up mode.

Parameters
i
- short int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed short integer value i to a nv_bfloat16 floating-point value in round-up
mode.

device __nv_bfloat16 __short2bfloat16_rz (const short

int i)
Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode.

Parameters
i
- short int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the signed short integer value i to a nv_bfloat16 floating-point value in round-
towards-zero mode.

CUDA Math API vRelease Version | 167

Modules

device __nv_bfloat16 __short_as_bfloat16 (const short

int i)
Reinterprets bits in a signed short integer as a nv_bfloat16.

Parameters
i
- short int. Is only being read.

Returns
nv_bfloat16

‣ The reinterpreted value.

Description
Reinterprets the bits in the signed short integer i as a nv_bfloat16 floating-point number.

device void __stcg (const __nv_bfloat16 *ptr, const

__nv_bfloat16 value)
Generates a `st.global.cg` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

device void __stcg (const __nv_bfloat162 *ptr, const

__nv_bfloat162 value)
Generates a `st.global.cg` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

CUDA Math API vRelease Version | 168

Modules

device void __stcs (const __nv_bfloat16 *ptr, const

__nv_bfloat16 value)
Generates a `st.global.cs` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

device void __stcs (const __nv_bfloat162 *ptr, const

__nv_bfloat162 value)
Generates a `st.global.cs` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

device void __stwb (const __nv_bfloat16 *ptr, const

__nv_bfloat16 value)
Generates a `st.global.wb` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

device void __stwb (const __nv_bfloat162 *ptr, const

__nv_bfloat162 value)
Generates a `st.global.wb` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

CUDA Math API vRelease Version | 169

Modules

device void __stwt (const __nv_bfloat16 *ptr, const

__nv_bfloat16 value)
Generates a `st.global.wt` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

device void __stwt (const __nv_bfloat162 *ptr, const

__nv_bfloat162 value)
Generates a `st.global.wt` store instruction.

Parameters
ptr
- memory location
value
- the value to be stored

device __nv_bfloat16 __uint2bfloat16_rd (const

unsigned int i)
Convert an unsigned integer to a nv_bfloat16 in round-down mode.

Parameters
i
- unsigned int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned integer value i to a nv_bfloat16 floating-point value in round-down
mode.

CUDA Math API vRelease Version | 170

Modules

hostdevice __nv_bfloat16 __uint2bfloat16_rn

(const unsigned int i)
Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode.

Parameters
i
- unsigned int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned integer value i to a nv_bfloat16 floating-point value in round-to-nearest-
even mode.

device __nv_bfloat16 __uint2bfloat16_ru (const

unsigned int i)
Convert an unsigned integer to a nv_bfloat16 in round-up mode.

Parameters
i
- unsigned int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned integer value i to a nv_bfloat16 floating-point value in round-up mode.

CUDA Math API vRelease Version | 171

Modules

device __nv_bfloat16 __uint2bfloat16_rz (const

unsigned int i)
Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode.

Parameters
i
- unsigned int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned integer value i to a nv_bfloat16 floating-point value in round-towards-
zero mode.

device __nv_bfloat16 __ull2bfloat16_rd (const unsigned

long long int i)
Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode.

Parameters
i
- unsigned long long int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned 64-bit integer value i to a nv_bfloat16 floating-point value in round-
down mode.

CUDA Math API vRelease Version | 172

Modules

hostdevice __nv_bfloat16 __ull2bfloat16_rn (const

unsigned long long int i)
Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even mode.

Parameters
i
- unsigned long long int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned 64-bit integer value i to a nv_bfloat16 floating-point value in round-to-
nearest-even mode.

device __nv_bfloat16 __ull2bfloat16_ru (const unsigned

long long int i)
Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode.

Parameters
i
- unsigned long long int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned 64-bit integer value i to a nv_bfloat16 floating-point value in round-up
mode.

CUDA Math API vRelease Version | 173

Modules

device __nv_bfloat16 __ull2bfloat16_rz (const unsigned

long long int i)
Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero mode.

Parameters
i
- unsigned long long int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned 64-bit integer value i to a nv_bfloat16 floating-point value in round-
towards-zero mode.

device __nv_bfloat16 __ushort2bfloat16_rd (const

unsigned short int i)
Convert an unsigned short integer to a nv_bfloat16 in round-down mode.

Parameters
i
- unsigned short int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned short integer value i to a nv_bfloat16 floating-point value in round-down
mode.

CUDA Math API vRelease Version | 174

Modules

hostdevice __nv_bfloat16 __ushort2bfloat16_rn

(const unsigned short int i)
Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even mode.

Parameters
i
- unsigned short int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned short integer value i to a nv_bfloat16 floating-point value in round-to-
nearest-even mode.

device __nv_bfloat16 __ushort2bfloat16_ru (const

unsigned short int i)
Convert an unsigned short integer to a nv_bfloat16 in round-up mode.

Parameters
i
- unsigned short int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned short integer value i to a nv_bfloat16 floating-point value in round-up
mode.

CUDA Math API vRelease Version | 175

Modules

device __nv_bfloat16 __ushort2bfloat16_rz (const

unsigned short int i)
Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero mode.

Parameters
i
- unsigned short int. Is only being read.

Returns
nv_bfloat16

‣ i converted to nv_bfloat16.

Description
Convert the unsigned short integer value i to a nv_bfloat16 floating-point value in round-
towards-zero mode.

device __nv_bfloat16 __ushort_as_bfloat16 (const

unsigned short int i)
Reinterprets bits in an unsigned short integer as a nv_bfloat16.

Parameters
i
- unsigned short int. Is only being read.

Returns
nv_bfloat16

‣ The reinterpreted value.

Description
Reinterprets the bits in the unsigned short integer i as a nv_bfloat16 floating-point number.

1.2.6. Bfloat16 Math Functions

Bfloat16 Precision Intrinsics
To use these functions, include the header file cuda_bf16.h in your program.

CUDA Math API vRelease Version | 176

Modules

device __nv_bfloat16 hceil (const __nv_bfloat16 h)

Calculate ceiling of the input argument.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The smallest integer value not less than h.

Description
Compute the smallest integer value not less than h.

device __nv_bfloat16 hcos (const __nv_bfloat16 a)

Calculates nv_bfloat16 cosine in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The cosine of a.

Description
Calculates nv_bfloat16 cosine of input a in round-to-nearest-even mode.

device __nv_bfloat16 hexp (const __nv_bfloat16 a)

Calculates nv_bfloat16 natural exponential function in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The natural exponential function on a.

CUDA Math API vRelease Version | 177

Modules

Description
Calculates nv_bfloat16 natural exponential function of input a in round-to-nearest-even
mode.

device __nv_bfloat16 hexp10 (const __nv_bfloat16 a)

Calculates nv_bfloat16 decimal exponential function in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The decimal exponential function on a.

Description
Calculates nv_bfloat16 decimal exponential function of input a in round-to-nearest-even
mode.

device __nv_bfloat16 hexp2 (const __nv_bfloat16 a)

Calculates nv_bfloat16 binary exponential function in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The binary exponential function on a.

Description
Calculates nv_bfloat16 binary exponential function of input a in round-to-nearest-even
mode.

CUDA Math API vRelease Version | 178

Modules

device __nv_bfloat16 hfloor (const __nv_bfloat16 h)

Calculate the largest integer less than or equal to h.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The largest integer value which is less than or equal to h.

Description
Calculate the largest integer value which is less than or equal to h.

device __nv_bfloat16 hlog (const __nv_bfloat16 a)

Calculates nv_bfloat16 natural logarithm in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The natural logarithm of a.

Description
Calculates nv_bfloat16 natural logarithm of input a in round-to-nearest-even mode.

device __nv_bfloat16 hlog10 (const __nv_bfloat16 a)

Calculates nv_bfloat16 decimal logarithm in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The decimal logarithm of a.

CUDA Math API vRelease Version | 179

Modules

Description
Calculates nv_bfloat16 decimal logarithm of input a in round-to-nearest-even mode.

device __nv_bfloat16 hlog2 (const __nv_bfloat16 a)

Calculates nv_bfloat16 binary logarithm in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The binary logarithm of a.

Description
Calculates nv_bfloat16 binary logarithm of input a in round-to-nearest-even mode.

device __nv_bfloat16 hrcp (const __nv_bfloat16 a)

Calculates nv_bfloat16 reciprocal in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The reciprocal of a.

Description
Calculates nv_bfloat16 reciprocal of input a in round-to-nearest-even mode.

device __nv_bfloat16 hrint (const __nv_bfloat16 h)

Round input to nearest integer value in nv_bfloat16 floating-point number.

Parameters
h
- nv_bfloat16. Is only being read.

CUDA Math API vRelease Version | 180

Modules

Returns
nv_bfloat16

‣ The nearest integer to h.

Description
Round h to the nearest integer value in nv_bfloat16 floating-point format, with halfway
cases rounded to the nearest even integer value.

device __nv_bfloat16 hrsqrt (const __nv_bfloat16 a)

Calculates nv_bfloat16 reciprocal square root in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The reciprocal square root of a.

Description
Calculates nv_bfloat16 reciprocal square root of input a in round-to-nearest-even mode.

device __nv_bfloat16 hsin (const __nv_bfloat16 a)

Calculates nv_bfloat16 sine in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The sine of a.

Description
Calculates nv_bfloat16 sine of input a in round-to-nearest-even mode.

CUDA Math API vRelease Version | 181

Modules

device __nv_bfloat16 hsqrt (const __nv_bfloat16 a)

Calculates nv_bfloat16 square root in round-to-nearest-even mode.

Parameters
a
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The square root of a.

Description
Calculates nv_bfloat16 square root of input a in round-to-nearest-even mode.

device __nv_bfloat16 htrunc (const __nv_bfloat16 h)

Truncate input argument to the integral part.

Parameters
h
- nv_bfloat16. Is only being read.

Returns
nv_bfloat16

‣ The truncated integer value.

Description
Round h to the nearest integer value that does not exceed h in magnitude.

1.2.7. Bfloat162 Math Functions

Bfloat16 Precision Intrinsics
To use these functions, include the header file cuda_bf16.h in your program.

CUDA Math API vRelease Version | 182

Modules

device __nv_bfloat162 h2ceil (const __nv_bfloat162 h)

Calculate nv_bfloat162 vector ceiling of the input argument.

Parameters
h
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector of smallest integers not less than h.

Description
For each component of vector h compute the smallest integer value not less than h.

device __nv_bfloat162 h2cos (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector cosine in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise cosine on vector a.

Description
Calculates nv_bfloat162 cosine of input vector a in round-to-nearest-even mode.

device __nv_bfloat162 h2exp (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector exponential function in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise exponential function on vector a.

CUDA Math API vRelease Version | 183

Modules

Description
Calculates nv_bfloat162 exponential function of input vector a in round-to-nearest-even
mode.

device __nv_bfloat162 h2exp10 (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector decimal exponential function in round-to-nearest-even
mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise decimal exponential function on vector a.

Description
Calculates nv_bfloat162 decimal exponential function of input vector a in round-to-
nearest-even mode.

device __nv_bfloat162 h2exp2 (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector binary exponential function in round-to-nearest-even
mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise binary exponential function on vector a.

Description
Calculates nv_bfloat162 binary exponential function of input vector a in round-to-nearest-
even mode.

CUDA Math API vRelease Version | 184

Modules

device __nv_bfloat162 h2floor (const __nv_bfloat162 h)

Calculate the largest integer less than or equal to h.

Parameters
h
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector of largest integers which is less than or equal to h.

Description
For each component of vector h calculate the largest integer value which is less than or equal
to h.

device __nv_bfloat162 h2log (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector natural logarithm in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise natural logarithm on vector a.

Description
Calculates nv_bfloat162 natural logarithm of input vector a in round-to-nearest-even
mode.

device __nv_bfloat162 h2log10 (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector decimal logarithm in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

CUDA Math API vRelease Version | 185

Modules

Returns
nv_bfloat162

‣ The elementwise decimal logarithm on vector a.

Description
Calculates nv_bfloat162 decimal logarithm of input vector a in round-to-nearest-even
mode.

device __nv_bfloat162 h2log2 (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector binary logarithm in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise binary logarithm on vector a.

Description
Calculates nv_bfloat162 binary logarithm of input vector a in round-to-nearest-even mode.

device __nv_bfloat162 h2rcp (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector reciprocal in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise reciprocal on vector a.

Description
Calculates nv_bfloat162 reciprocal of input vector a in round-to-nearest-even mode.

CUDA Math API vRelease Version | 186

Modules

device __nv_bfloat162 h2rint (const __nv_bfloat162 h)

Round input to nearest integer value in nv_bfloat16 floating-point number.

Parameters
h
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The vector of rounded integer values.

Description
Round each component of nv_bfloat162 vector h to the nearest integer value in nv_bfloat16
floating-point format, with halfway cases rounded to the nearest even integer value.

device __nv_bfloat162 h2rsqrt (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector reciprocal square root in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise reciprocal square root on vector a.

Description
Calculates nv_bfloat162 reciprocal square root of input vector a in round-to-nearest-even
mode.

device __nv_bfloat162 h2sin (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector sine in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

CUDA Math API vRelease Version | 187

Modules

Returns
nv_bfloat162

‣ The elementwise sine on vector a.

Description
Calculates nv_bfloat162 sine of input vector a in round-to-nearest-even mode.

device __nv_bfloat162 h2sqrt (const __nv_bfloat162 a)

Calculates nv_bfloat162 vector square root in round-to-nearest-even mode.

Parameters
a
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The elementwise square root on vector a.

Description
Calculates nv_bfloat162 square root of input vector a in round-to-nearest-even mode.

device __nv_bfloat162 h2trunc (const __nv_bfloat162 h)

Truncate nv_bfloat162 vector input argument to the integral part.

Parameters
h
- nv_bfloat162. Is only being read.

Returns
nv_bfloat162

‣ The truncated h.

Description
Round each component of vector h to the nearest integer value that does not exceed h in
magnitude.

CUDA Math API vRelease Version | 188

Modules

1.3. Mathematical Functions

CUDA mathematical functions are always available in device code.
Host implementations of the common mathematical functions are mapped in a platform-
specific way to standard math library functions, provided by the host compiler and respective
host libm where available. Some functions, not available with the host compilers, are
implemented in crt/math_functions.hpp header file. For example, see erfinv(). Other, less
common functions, like rhypot(), cyl_bessel_i0() are only available in device code.
Note that many floating-point and integer function names are overloaded for different
argument types. For example, the log() function has the following prototypes:
double log(double x);
float log(float x);
float logf(float x);

1.4. Single Precision Mathematical

Functions
This section describes single precision mathematical functions. To use these functions you do
not need to include any additional header files in your program.

device float acosf (float x)

Calculate the arc cosine of the input argument.

Returns
Result will be in radians, in the interval [0, $\pi$] for x inside [-1, +1].

‣ acosf(1) returns +0.

‣ acosf(x) returns NaN for x outside [-1, +1].

Description
Calculate the principal value of the arc cosine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 189

Modules

device float acoshf (float x)

Calculate the nonnegative inverse hyperbolic cosine of the input argument.

Returns
Result will be in the interval [0, $+\infty$].

‣ acoshf(1) returns 0.
‣ acoshf(x) returns NaN for x in the interval [$-\infty$, 1).

‣ acoshf($+\infty$) returns $+\infty$.

Description
Calculate the nonnegative inverse hyperbolic cosine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float asinf (float x)

Calculate the arc sine of the input argument.

Returns
Result will be in radians, in the interval [$-\pi/2$, $+\pi/2$] for x inside [-1, +1].

‣ asinf($\pm 0$) returns $\pm 0$.

‣ asinf(x) returns NaN for x outside [-1, +1].

Description
Calculate the principal value of the arc sine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 190

Modules

device float asinhf (float x)

Calculate the inverse hyperbolic sine of the input argument.

Returns
‣ asinhf($\pm 0$) returns $\pm 0$.

‣ asinhf($\pm\infty$) returns $\pm\infty$.

Description
Calculate the inverse hyperbolic sine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float atan2f (float y, float x)

Calculate the arc tangent of the ratio of first and second input arguments.

Returns
Result will be in radians, in the interval [$-\pi$, $+\pi$].

‣ atan2f($\pm 0$, -0) returns $\pm\pi$.

‣ atan2f($\pm 0$, +0) returns $\pm 0$.

‣ atan2f($\pm 0$, x) returns $\pm\pi$ for x < 0.

‣ atan2f($\pm 0$, x) returns $\pm 0$ for x > 0.

‣ atan2f(y, $\pm 0$) returns $-\pi$/2 for y < 0.

‣ atan2f(y, $\pm 0$) returns $\pi$/2 for y > 0.

‣ atan2f($\pm y$, $-\infty$) returns $\pm\pi$ for finite y > 0.

‣ atan2f($\pm\infty$, x) returns $\pm\pi$/2 for finite x.

‣ atan2f($\pm\infty$, $+\infty$) returns $\pm\pi$/4.

CUDA Math API vRelease Version | 191

Modules

Description
Calculate the principal value of the arc tangent of the ratio of first and second input arguments
y / x. The quadrant of the result is determined by the signs of inputs y and x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float atanf (float x)

Calculate the arc tangent of the input argument.

Returns
Result will be in radians, in the interval [$-\pi/2$, $+\pi/2$].

‣ atanf($\pm 0$) returns $\pm 0$.

‣ atanf($\pm\infty$) returns $\pm\pi$/2.

Description
Calculate the principal value of the arc tangent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float atanhf (float x)

Calculate the inverse hyperbolic tangent of the input argument.

Returns
‣ atanhf($\pm 0$) returns $\pm 0$.

‣ atanhf($\pm 1$) returns $\pm\infty$.

‣ atanhf(x) returns NaN for x outside interval [-1, 1].

Description
Calculate the inverse hyperbolic tangent of the input argument x.

CUDA Math API vRelease Version | 192

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float cbrtf (float x)

Calculate the cube root of the input argument.

Returns
Returns $x^{1/3}$.

‣ cbrtf($\pm 0$) returns $\pm 0$.

Description
Calculate the cube root of x, $x^{1/3}$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float ceilf (float x)

Calculate ceiling of the input argument.

Returns
Returns $\lceil x \rceil$ expressed as a floating-point number.

‣ ceilf($\pm 0$) returns $\pm 0$.

Description
Compute the smallest integer value not less than x.

CUDA Math API vRelease Version | 193

Modules

device float copysignf (float x, float y)

Create value with given magnitude, copying sign of second value.

Returns
Returns a value with the magnitude of x and the sign of y.

Description
Create a floating-point value with the magnitude x and the sign of y.

device float cosf (float x)

Calculate the cosine of the input argument.

Returns
‣ cosf($\pm 0$) returns 1.

‣ cosf($\pm\infty$) returns NaN.

Description
Calculate the cosine of the input argument x (measured in radians).

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This function is affected by the --use_fast_math compiler flag. See the CUDA C++
Programming Guide, Mathematical Functions Appendix, Intrinsic Functions section for a
complete list of functions affected.

device float coshf (float x)

Calculate the hyperbolic cosine of the input argument.

Returns
‣ coshf($\pm 0$) returns 1.

‣ coshf($\pm\infty$) returns $+\infty$.

Description
Calculate the hyperbolic cosine of the input argument x.

CUDA Math API vRelease Version | 194

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float cospif (float x)

Calculate the cosine of the input argument $\times \pi$.

Returns
‣ cospif($\pm 0$) returns 1.

‣ cospif($\pm\infty$) returns NaN.

Description
Calculate the cosine of $x \times \pi$ (measured in radians), where x is the input argument.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float cyl_bessel_i0f (float x)

Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input
argument.

Returns
Returns the value of the regular modified cylindrical Bessel function of order 0.

Description
Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input
argument x, $I_0(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 195

Modules

device float cyl_bessel_i1f (float x)

Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input
argument.

Returns
Returns the value of the regular modified cylindrical Bessel function of order 1.

Description
Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input
argument x, $I_1(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float erfcf (float x)

Calculate the complementary error function of the input argument.

Returns
‣ erfcf($-\infty$) returns 2.

‣ erfcf($+\infty$) returns +0.

Description
Calculate the complementary error function of the input argument x, 1 - erf(x).

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float erfcinvf (float x)

Calculate the inverse complementary error function of the input argument.

Returns
‣ erfcinvf($\pm 0$) returns $+\infty$.

‣ erfcinvf(2) returns $-\infty$.

CUDA Math API vRelease Version | 196

Modules

‣ erfcinvf(x) returns NaN for x outside [0, 2].

Description
Calculate the inverse complementary error function $\operatorname{erfc}^{-1}(x)$, of the input argument x in the
interval [0, 2].

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float erfcxf (float x)

Calculate the scaled complementary error function of the input argument.

Returns
‣ erfcxf($-\infty$) returns $+\infty$.

‣ erfcxf($+\infty$) returns +0.

Description
Calculate the scaled complementary error function of the input argument x, $e^{x^2} \cdot \operatorname{erfc}(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float erff (float x)

Calculate the error function of the input argument.

Returns
‣ erff($\pm 0$) returns $\pm 0$.

‣ erff($\pm\infty$) returns $\pm 1$.

Description

Calculate the value of the error function for the input argument x, $\frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2}\,dt$.

CUDA Math API vRelease Version | 197

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float erfinvf (float x)

Calculate the inverse error function of the input argument.

Returns
‣ erfinvf($\pm 0$) returns $\pm 0$.

‣ erfinvf(1) returns $+\infty$.

‣ erfinvf(-1) returns $-\infty$.

‣ erfinvf(x) returns NaN for x outside [-1, +1].

Description
Calculate the inverse error function $\operatorname{erf}^{-1}(x)$, of the input argument x in the interval [-1, 1].

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float exp10f (float x)

Calculate the base 10 exponential of the input argument.

Returns
‣ exp10f($\pm 0$) returns 1.

‣ exp10f($-\infty$) returns +0.

‣ exp10f($+\infty$) returns $+\infty$.

Description
Calculate $10^x$, the base 10 exponential of the input argument x.

Note:

CUDA Math API vRelease Version | 198

Modules

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float exp2f (float x)

Calculate the base 2 exponential of the input argument.

Returns
‣ exp2f($\pm 0$) returns 1.

‣ exp2f($-\infty$) returns +0.

‣ exp2f($+\infty$) returns $+\infty$.

Description
Calculate $2^x$, the base 2 exponential of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float expf (float x)

Calculate the base $e$ exponential of the input argument.

Returns
‣ expf($\pm 0$) returns 1.

‣ expf($-\infty$) returns +0.

‣ expf($+\infty$) returns $+\infty$.

Description
Calculate $e^x$, the base $e$ exponential of the input argument x.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 199

Modules

device float expm1f (float x)

Calculate the base $e$ exponential of the input argument, minus 1.

Returns
‣ expm1f($\pm 0$) returns $\pm 0$.

‣ expm1f($-\infty$) returns -1.

‣ expm1f($+\infty$) returns $+\infty$.

Description
Calculate $e^x - 1$, the base $e$ exponential of the input argument x, minus 1.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float fabsf (float x)

Calculate the absolute value of its argument.

Returns
Returns the absolute value of its argument.

‣ fabsf($\pm\infty$) returns $+\infty$.

‣ fabsf($\pm 0$) returns +0.

‣ fabsf(NaN) returns an unspecified NaN.

Description
Calculate the absolute value of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 200

Modules

device float fdimf (float x, float y)

Compute the positive difference between x and y.

Returns
Returns the positive difference between x and y.

‣ fdimf(x, y) returns x - y if x > y.

‣ fdimf(x, y) returns +0 if x $\leq$ y.

Description
Compute the positive difference between x and y. The positive difference is x - y when x > y
and +0 otherwise.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float fdividef (float x, float y)

Divide two floating-point values.

Returns
Returns x / y.

Description
Compute x divided by y. If --use_fast_math is specified, use __fdividef() for higher
performance, otherwise use normal division.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 201

Modules

device float floorf (float x)

Calculate the largest integer less than or equal to x.

Returns
Returns $\lfloor x \rfloor$ expressed as a floating-point number.

‣ floorf($\pm 0$) returns $\pm 0$.

Description
Calculate the largest integer value which is less than or equal to x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float fmaf (float x, float y, float z)

Compute $x \times y + z$ as a single operation.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fmaf($\pm\infty$, $\pm 0$, z) returns NaN.

‣ fmaf(x, y, $\mp\infty$) returns NaN if $x \times y$ is an exact $\pm\infty$.

Description
Compute the value of $x \times y + z$ as a single ternary operation. After computing the value to
infinite precision, the value is rounded once.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 202

Modules

device float fmaxf (float x, float y)

Determine the maximum numeric value of the arguments.

Returns
Returns the maximum numeric values of the arguments x and y.

‣ If both arguments are NaN, returns NaN.

‣ If one argument is NaN, returns the numeric argument.

Description
Determines the maximum numeric value of the arguments x and y. Treats NaN arguments as
missing data. If one argument is a NaN and the other is legitimate numeric value, the numeric
value is chosen.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float fminf (float x, float y)

Determine the minimum numeric value of the arguments.

Returns
Returns the minimum numeric value of the arguments x and y.

‣ If both arguments are NaN, returns NaN.

‣ If one argument is NaN, returns the numeric argument.

Description
Determines the minimum numeric value of the arguments x and y. Treats NaN arguments as
missing data. If one argument is a NaN and the other is legitimate numeric value, the numeric
value is chosen.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 203

Modules

device float fmodf (float x, float y)

Calculate the floating-point remainder of x / y.

Returns
‣ Returns the floating-point remainder of x / y.
‣ fmodf($\pm 0$, y) returns $\pm 0$ if y is not zero.

‣ fmodf(x, $\pm\infty$) returns x if x is finite.

‣ fmodf(x, y) returns NaN if x is $\pm\infty$ or y is zero.

‣ If either argument is NaN, NaN is returned.

Description
Calculate the floating-point remainder of x / y. The floating-point remainder of the division
operation x / y calculated by this function is exactly the value x - n*y, where n is x / y
with its fractional part truncated. The computed value will have the same sign as x, and its
magnitude will be less than the magnitude of y.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float frexpf (float x, int *nptr)

Extract mantissa and exponent of a floating-point value.

Returns
Returns the fractional component m.

‣ frexpf($\pm 0$, nptr) returns $\pm 0$ and stores zero in the location pointed to by nptr.

‣ frexpf($\pm\infty$, nptr) returns $\pm\infty$ and stores an unspecified value in the location to which
nptr points.

‣ frexpf(NaN, y) returns a NaN and stores an unspecified value in the location to which
nptr points.

Description
Decomposes the floating-point value x into a component m for the normalized fraction element
and another term n for the exponent. The absolute value of m will be greater than or equal to

CUDA Math API vRelease Version | 204

Modules

0.5 and less than 1.0 or it will be equal to 0; $x = m \cdot 2^n$. The integer exponent n will be stored in
the location to which nptr points.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float hypotf (float x, float y)

Calculate the square root of the sum of squares of two arguments.

Returns
Returns the length of the hypotenuse $\sqrt{x^2 + y^2}$.

‣ hypotf(x,y), hypotf(y,x), and hypotf(x, -y) are equivalent.

‣ hypotf(x, $\pm 0$) is equivalent to fabsf(x).

‣ hypotf($\pm\infty$, y) returns $+\infty$, even if y is a NaN.

Description
Calculates the length of the hypotenuse of a right triangle whose two sides have lengths x and
y without undue overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device int ilogbf (float x)

Compute the unbiased integer exponent of the argument.

Returns
‣ If successful, returns the unbiased exponent of the argument.
‣ ilogbf($\pm 0$) returns INT_MIN.

‣ ilogbf(NaN) returns INT_MIN.

‣ ilogbf($\pm\infty$) returns INT_MAX.

‣ Note: above behavior does not take into account FP_ILOGB0 nor FP_ILOGBNAN.

CUDA Math API vRelease Version | 205

Modules

Description
Calculates the unbiased integer exponent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device __RETURN_TYPE isfinite (float a)

Determine whether argument is finite.

Returns
‣ With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns true if and only
if a is a finite value.

‣ With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value if and only if
a is a finite value.

Description
Determine whether the floating-point value a is a finite value (zero, subnormal, or normal and
not infinity or NaN).

device __RETURN_TYPE isinf (float a)

Determine whether argument is infinite.

Returns
‣ With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns true if and only
if a is an infinite value.

‣ With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value if and only if
a is an infinite value.

Description
Determine whether the floating-point value a is an infinite value (positive or negative).

CUDA Math API vRelease Version | 206

Modules

device __RETURN_TYPE isnan (float a)

Determine whether argument is a NaN.

Returns
‣ With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns true if and only
if a is a NaN value.

‣ With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value if and only if
a is a NaN value.

Description
Determine whether the floating-point value a is a NaN.

device float j0f (float x)

Calculate the value of the Bessel function of the first kind of order 0 for the input argument.

Returns
Returns the value of the Bessel function of the first kind of order 0.

‣ j0f($\pm\infty$) returns +0.

‣ j0f(NaN) returns NaN.

Description
Calculate the value of the Bessel function of the first kind of order 0 for the input argument x,
$J_0(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float j1f (float x)

Calculate the value of the Bessel function of the first kind of order 1 for the input argument.

Returns
Returns the value of the Bessel function of the first kind of order 1.

‣ j1f($\pm 0$) returns $\pm 0$.

CUDA Math API vRelease Version | 207

Modules

‣ j1f(NaN) returns NaN.

Description
Calculate the value of the Bessel function of the first kind of order 1 for the input argument x,
$J_1(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float jnf (int n, float x)

Calculate the value of the Bessel function of the first kind of order n for the input argument.

Returns
Returns the value of the Bessel function of the first kind of order n.

‣ jnf(n, NaN) returns NaN.

‣ jnf(n, x) returns NaN for n < 0.
‣ jnf(n, $+\infty$) returns +0.

Description
Calculate the value of the Bessel function of the first kind of order n for the input argument x,
$J_n(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float ldexpf (float x, int exp)

Calculate the value of $x \cdot 2^{exp}$.

Returns
‣ ldexpf(x, exp) is equivalent to scalbnf(x, exp).

Description
Calculate the value of $x \cdot 2^{exp}$ of the input arguments x and exp.

CUDA Math API vRelease Version | 208

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float lgammaf (float x)

Calculate the natural logarithm of the absolute value of the gamma function of the input
argument.

Returns
‣ lgammaf(1) returns +0.
‣ lgammaf(2) returns +0.
‣ lgammaf(x) returns $+\infty$ if x $\leq$ 0 and x is an integer.

‣ lgammaf($\pm\infty$) returns $+\infty$.

Description
Calculate the natural logarithm of the absolute value of the gamma function of the input
argument x, namely the value of $\log_e \left| \int_0^\infty e^{-t} t^{x-1} \, dt \right|$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device long long int llrintf (float x)

Round input to nearest integer value.

Returns
Returns rounded integer value.

Description
Round x to the nearest integer value, with halfway cases rounded to the nearest even integer
value. If the result is outside the range of the return type, the behavior is undefined.

CUDA Math API vRelease Version | 209

Modules

device long long int llroundf (float x)

Round to nearest integer value.

Returns
Returns rounded integer value.

Description
Round x to the nearest integer value, with halfway cases rounded away from zero. If the result
is outside the range of the return type, the behavior is undefined.

Note:

This function may be slower than alternate rounding methods. See llrintf().

device float log10f (float x)

Calculate the base 10 logarithm of the input argument.

Returns
‣ log10f($\pm 0$) returns $-\infty$.

‣ log10f(1) returns +0.

‣ log10f(x) returns NaN for x < 0.
‣ log10f($+\infty$) returns $+\infty$.

Description
Calculate the base 10 logarithm of the input argument x.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 210

Modules

device float log1pf (float x)

Calculate the value of $\log_e(1 + x)$.

Returns
‣ log1pf($\pm 0$) returns $\pm 0$.

‣ log1pf(-1) returns $-\infty$.

‣ log1pf(x) returns NaN for x < -1.

‣ log1pf($+\infty$) returns $+\infty$.

Description
Calculate the value of $\log_e(1 + x)$ of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float log2f (float x)

Calculate the base 2 logarithm of the input argument.

Returns
‣ log2f($\pm 0$) returns $-\infty$.

‣ log2f(1) returns +0.

‣ log2f(x) returns NaN for x < 0.
‣ log2f($+\infty$) returns $+\infty$.

Description
Calculate the base 2 logarithm of the input argument x.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 211

Modules

device float logbf (float x)

Calculate the floating-point representation of the exponent of the input argument.

Returns
‣ logbf($\pm 0$) returns $-\infty$.

‣ logbf($\pm\infty$) returns $+\infty$.

Description
Calculate the floating-point representation of the exponent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float logf (float x)

Calculate the natural logarithm of the input argument.

Returns
‣ logf($\pm 0$) returns $-\infty$.

‣ logf(1) returns +0.

‣ logf(x) returns NaN for x < 0.
‣ logf($+\infty$) returns $+\infty$.

Description
Calculate the natural logarithm of the input argument x.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 212

Modules

device long int lrintf (float x)

Round input to nearest integer value.

Returns
Returns rounded integer value.

Description
Round x to the nearest integer value, with halfway cases rounded to the nearest even integer
value. If the result is outside the range of the return type, the behavior is undefined.

device long int lroundf (float x)

Round to nearest integer value.

Returns
Returns rounded integer value.

Description
Round x to the nearest integer value, with halfway cases rounded away from zero. If the result
is outside the range of the return type, the behavior is undefined.

Note:

This function may be slower than alternate rounding methods. See lrintf().

device float max (const float a, const float b)

Calculate the maximum value of the input float arguments.

Description
Calculate the maximum value of the arguments a and b. Behavior is equivalent to fmaxf()
function.
Note: this differs from the std::max specification.

CUDA Math API vRelease Version | 213

Modules

device float min (const float a, const float b)

Calculate the minimum value of the input float arguments.

Description
Calculate the minimum value of the arguments a and b. Behavior is equivalent to fminf()
function.
Note: this differs from the std::min specification.

device float modff (float x, float *iptr)

Break down the input argument into fractional and integral parts.

Returns
‣ modff($\pm x$, iptr) returns a result with the same sign as x.

‣ modff($\pm\infty$, iptr) returns $\pm 0$ and stores $\pm\infty$ in the object pointed to by iptr.

‣ modff(NaN, iptr) stores a NaN in the object pointed to by iptr and returns a NaN.

Description
Break down the argument x into fractional and integral parts. The integral part is stored in the
argument iptr. Fractional and integral parts are given the same sign as the argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float nanf (const char *tagp)

Returns "Not a Number" value.

Returns
‣ nanf(tagp) returns NaN.

Description
Return a representation of a quiet NaN. Argument tagp selects one of the possible
representations.

CUDA Math API vRelease Version | 214

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float nearbyintf (float x)

Round the input argument to the nearest integer.

Returns
‣ nearbyintf($\pm 0$) returns $\pm 0$.

‣ nearbyintf($\pm\infty$) returns $\pm\infty$.

Description
Round argument x to an integer value in single precision floating-point format. Uses round to
nearest rounding, with ties rounding to even.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float nextafterf (float x, float y)

Return next representable single-precision floating-point value after argument x in the
direction of y.

Returns
‣ nextafterf(x, y) = y if x equals y.
‣ nextafterf(x, y) = NaN if either x or y are NaN.

Description
Calculate the next representable single-precision floating-point value following x in
the direction of y. For example, if y is greater than x, nextafterf() returns the smallest
representable number greater than x.

Note:

CUDA Math API vRelease Version | 215

Modules

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float norm3df (float a, float b, float c)

Calculate the square root of the sum of squares of three coordinates of the argument.

Returns
Returns the length of the 3D vector $\sqrt{a^2 + b^2 + c^2}$.

‣ In the presence of an exactly infinite coordinate $+\infty$ is returned, even if there are NaNs.

Description
Calculates the length of three dimensional vector in Euclidean space without undue overflow
or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float norm4df (float a, float b, float c, float d)

Calculate the square root of the sum of squares of four coordinates of the argument.

Returns
Returns the length of the 4D vector $\sqrt{a^2 + b^2 + c^2 + d^2}$.

‣ In the presence of an exactly infinite coordinate $+\infty$ is returned, even if there are NaNs.

Description
Calculates the length of four dimensional vector in Euclidean space without undue overflow or
underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 216

Modules

device float normcdff (float x)

Calculate the standard normal cumulative distribution function.

Returns
‣ normcdff($+\infty$) returns 1.

‣ normcdff($-\infty$) returns +0.

Description
Calculate the cumulative distribution function of the standard normal distribution for input
argument x, $\Phi(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float normcdfinvf (float x)

Calculate the inverse of the standard normal cumulative distribution function.

Returns
‣ normcdfinvf($\pm 0$) returns $-\infty$.

‣ normcdfinvf(1) returns $+\infty$.

‣ normcdfinvf(x) returns NaN if x is not in the interval [0,1].

Description
Calculate the inverse of the standard normal cumulative distribution function for input
argument x, $\Phi^{-1}(x)$. The function is defined for input values in the interval $(0, 1)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 217

Modules

device float normf (int dim, const float *p)

Calculate the square root of the sum of squares of any number of coordinates.

Returns
Returns the length of the dim-D vector $\sqrt{p_0^2 + p_1^2 + \cdots + p_{\mathrm{dim}-1}^2}$.

‣ In the presence of an exactly infinite coordinate $+\infty$ is returned, even if there are NaNs.

Description
Calculates the length of a vector p, dimension of which is passed as an argument without
undue overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float powf (float x, float y)

Calculate the value of first argument to the power of second argument.

Returns
‣ powf($\pm 0$, y) returns $\pm\infty$ for y an odd integer less than 0.

‣ powf($\pm 0$, y) returns $+\infty$ for y less than 0 and not an odd integer.

‣ powf($\pm 0$, y) returns $\pm 0$ for y an odd integer greater than 0.

‣ powf($\pm 0$, y) returns +0 for y > 0 and not an odd integer.

‣ powf(-1, $\pm\infty$) returns 1.

‣ powf(+1, y) returns 1 for any y, even a NaN.

‣ powf(x, $\pm 0$) returns 1 for any x, even a NaN.

‣ powf(x, y) returns a NaN for finite x < 0 and finite non-integer y.

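‣ powf(x, $-\infty$) returns $+\infty$ for $|x| < 1$.

‣ powf(x, $-\infty$) returns +0 for $|x| > 1$.

‣ powf(x, $+\infty$) returns +0 for $|x| < 1$.

‣ powf(x, $+\infty$) returns $+\infty$ for $|x| > 1$.

‣ powf($-\infty$, y) returns -0 for y an odd integer less than 0.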

CUDA Math API vRelease Version | 218

Modules

‣ powf($-\infty$, y) returns +0 for y < 0 and not an odd integer.

‣ powf($-\infty$, y) returns $-\infty$ for y an odd integer greater than 0.

‣ powf($-\infty$, y) returns $+\infty$ for y > 0 and not an odd integer.

‣ powf($+\infty$, y) returns +0 for y < 0.

‣ powf($+\infty$, y) returns $+\infty$ for y > 0.

Description
Calculate the value of x to the power of y.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float rcbrtf (float x)

Calculate reciprocal cube root function.

Returns
‣ rcbrtf($\pm 0$) returns $\pm\infty$.

‣ rcbrtf($\pm\infty$) returns $\pm 0$.

Description
Calculate reciprocal cube root function of x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 219

Modules

device float remainderf (float x, float y)

Compute single-precision floating-point remainder.

Returns
‣ remainderf(x, $\pm 0$) returns NaN.

‣ remainderf($\pm\infty$, y) returns NaN.

‣ remainderf(x, $\pm\infty$) returns x for finite x.

Description
Compute single-precision floating-point remainder r of dividing x by y for nonzero y. Thus
$r = x - n y$. The value n is the integer value nearest $x / y$. In the case when $|n - x / y| = \frac{1}{2}$, the even n
value is chosen.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float remquof (float x, float y, int *quo)

Compute single-precision floating-point remainder and part of quotient.

Returns
Returns the remainder.

‣ remquof(x, $\pm 0$, quo) returns NaN and stores an unspecified value in the location to which
quo points.

‣ remquof($\pm\infty$, y, quo) returns NaN and stores an unspecified value in the location to
which quo points.

‣ remquof(x, y, quo) returns NaN and stores an unspecified value in the location to which
quo points if either of x or y is NaN.

‣ remquof(x, $\pm\infty$, quo) returns x and stores zero in the location to which quo points for
finite x.

Description
Compute a single-precision floating-point remainder in the same way as the remainderf()
function. Argument quo returns part of quotient upon division of x by y. Value quo has the

CUDA Math API vRelease Version | 220

Modules

same sign as $x / y$ and may not be the exact quotient but agrees with the exact quotient in the
low order 3 bits.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float rhypotf (float x, float y)

Calculate one over the square root of the sum of squares of two arguments.

Returns
Returns one over the length of the hypotenuse $\frac{1}{\sqrt{x^2 + y^2}}$.

‣ rhypotf(x,y), rhypotf(y,x), and rhypotf(x, -y) are equivalent.

‣ rhypotf($\pm\infty$, y) returns +0, even if y is a NaN.

Description
Calculates one over the length of the hypotenuse of a right triangle whose two sides have
lengths x and y without undue overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float rintf (float x)

Round input to nearest integer value in floating-point.

Returns
Returns rounded integer value.

‣ rintf($\pm 0$) returns $\pm 0$.

Description
Round x to the nearest integer value in floating-point format, with halfway cases rounded to
the nearest even integer value.

CUDA Math API vRelease Version | 221

Modules

device float rnorm3df (float a, float b, float c)

Calculate one over the square root of the sum of squares of three coordinates.

Returns
Returns one over the length of the 3D vector $\frac{1}{\sqrt{a^2 + b^2 + c^2}}$.

‣ In the presence of an exactly infinite coordinate $+0$ is returned, even if there are NaNs.

Description
Calculates one over the length of three dimension vector in Euclidean space without undue
overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float rnorm4df (float a, float b, float c, float d)

Calculate one over the square root of the sum of squares of four coordinates.

Returns
Returns one over the length of the 4D vector $\frac{1}{\sqrt{a^2 + b^2 + c^2 + d^2}}$.

‣ In the presence of an exactly infinite coordinate $+0$ is returned, even if there are NaNs.

Description
Calculates one over the length of four dimension vector in Euclidean space without undue
overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 222

Modules

device float rnormf (int dim, const float *p)

Calculate the reciprocal of square root of the sum of squares of any number of coordinates.

Returns
Returns one over the length of the vector $\frac{1}{\sqrt{p_0^2 + p_1^2 + \cdots + p_{\mathrm{dim}-1}^2}}$.

‣ In the presence of an exactly infinite coordinate $+0$ is returned, even if there are NaNs.

Description
Calculates one over the length of vector p, dimension of which is passed as an argument, in
Euclidean space without undue overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float roundf (float x)

Round to nearest integer value in floating-point.

Returns
Returns rounded integer value.

‣ roundf($\pm 0$) returns $\pm 0$.

Description
Round x to the nearest integer value in floating-point format, with halfway cases rounded
away from zero.

Note:

This function may be slower than alternate rounding methods. See rintf().

CUDA Math API vRelease Version | 223

Modules

device float rsqrtf (float x)

Calculate the reciprocal of the square root of the input argument.

Returns
Returns $1 / \sqrt{x}$.

‣ rsqrtf($+\infty$) returns +0.

‣ rsqrtf($\pm 0$) returns $\pm\infty$.

‣ rsqrtf(x) returns NaN if x is less than 0.

Description

Calculate the reciprocal of the nonnegative square root of x, $1 / \sqrt{x}$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float scalblnf (float x, long int n)

Scale floating-point input by integer power of two.

Returns
Returns x * $2^n$.

‣ scalblnf($\pm 0$, n) returns $\pm 0$.

‣ scalblnf(x, 0) returns x.
‣ scalblnf($\pm\infty$, n) returns $\pm\infty$.

Description
Scale x by $2^n$ by efficient manipulation of the floating-point exponent.

device float scalbnf (float x, int n)

Scale floating-point input by integer power of two.

Returns
Returns x * $2^n$.

CUDA Math API vRelease Version | 224

Modules

‣ scalbnf($\pm 0$, n) returns $\pm 0$.

‣ scalbnf(x, 0) returns x.
‣ scalbnf($\pm\infty$, n) returns $\pm\infty$.

Description
Scale x by $2^n$ by efficient manipulation of the floating-point exponent.

device __RETURN_TYPE signbit (float a)

Return the sign bit of the input.

Returns
Reports the sign bit of all values including infinities, zeros, and NaNs.

‣ With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns true if and only
if a is negative.

‣ With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value if and only if
a is negative.

Description
Determine whether the floating-point value a is negative.

device void sincosf (float x, float *sptr, float *cptr)

Calculate the sine and cosine of the first input argument.

Returns
‣ none

Description
Calculate the sine and cosine of the first input argument x (measured in radians). The results
for sine and cosine are written into the second argument, sptr, and, respectively, third
argument, cptr.

CUDA Math API vRelease Version | 225

Modules

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device void sincospif (float x, float *sptr, float *cptr)

Calculate the sine and cosine of the first input argument $\times \pi$.

Returns
‣ none

Description
Calculate the sine and cosine of the first input argument, x (measured in radians), $\times \pi$. The
results for sine and cosine are written into the second argument, sptr, and, respectively,
third argument, cptr.

device float sinf (float x)

Calculate the sine of the input argument.

Returns
‣ sinf($\pm 0$) returns $\pm 0$.

‣ sinf($\pm\infty$) returns NaN.

Description
Calculate the sine of the input argument x (measured in radians).

CUDA Math API vRelease Version | 226

Modules

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float sinhf (float x)

Calculate the hyperbolic sine of the input argument.

Returns
‣ sinhf($\pm 0$) returns $\pm 0$.

‣ sinhf($\pm\infty$) returns $\pm\infty$.

Description
Calculate the hyperbolic sine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float sinpif (float x)

Calculate the sine of the input argument $\times \pi$.

Returns
‣ sinpif($\pm 0$) returns $\pm 0$.

‣ sinpif($\pm\infty$) returns NaN.

Description
Calculate the sine of $x \times \pi$ (measured in radians), where x is the input argument.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 227

Modules

device float sqrtf (float x)

Calculate the square root of the input argument.

Returns
Returns $\sqrt{x}$.

‣ sqrtf($\pm 0$) returns $\pm 0$.

‣ sqrtf(x) returns NaN if x is less than 0.

Description

Calculate the nonnegative square root of x, $\sqrt{x}$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float tanf (float x)

Calculate the tangent of the input argument.

Returns
‣ tanf($\pm 0$) returns $\pm 0$.

‣ tanf($\pm\infty$) returns NaN.

Description
Calculate the tangent of the input argument x (measured in radians).

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 228

Modules

device float tanhf (float x)

Calculate the hyperbolic tangent of the input argument.

Returns
‣ tanhf($\pm 0$) returns $\pm 0$.

‣ tanhf($\pm\infty$) returns $\pm 1$.

Description
Calculate the hyperbolic tangent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float tgammaf (float x)

Calculate the gamma function of the input argument.

Returns
‣ tgammaf($\pm 0$) returns $\pm\infty$.

‣ tgammaf(2) returns +1.

‣ tgammaf(x) returns NaN if x < 0 and x is an integer.
‣ tgammaf($-\infty$) returns NaN.

‣ tgammaf($+\infty$) returns $+\infty$.

Description

Calculate the gamma function of the input argument x, namely the value of $\int_0^\infty e^{-t} t^{x-1} \, dt$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 229

Modules

device float truncf (float x)

Truncate input argument to the integral part.

Returns
Returns truncated integer value.

‣ truncf($\pm 0$) returns $\pm 0$.

Description
Round x to the nearest integer value that does not exceed x in magnitude.

device float y0f (float x)

Calculate the value of the Bessel function of the second kind of order 0 for the input argument.

Returns
Returns the value of the Bessel function of the second kind of order 0.

‣ y0f($\pm 0$) returns $-\infty$.

‣ y0f(x) returns NaN for x < 0.

‣ y0f($+\infty$) returns +0.

‣ y0f(NaN) returns NaN.

Description
Calculate the value of the Bessel function of the second kind of order 0 for the input argument
x, $Y_0(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float y1f (float x)

Calculate the value of the Bessel function of the second kind of order 1 for the input argument.

Returns
Returns the value of the Bessel function of the second kind of order 1.

CUDA Math API vRelease Version | 230

Modules

‣ y1f($\pm 0$) returns $-\infty$.

‣ y1f(x) returns NaN for x < 0.

‣ y1f($+\infty$) returns +0.

‣ y1f(NaN) returns NaN.

Description
Calculate the value of the Bessel function of the second kind of order 1 for the input argument
x, $Y_1(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float ynf (int n, float x)

Calculate the value of the Bessel function of the second kind of order n for the input argument.

Returns
Returns the value of the Bessel function of the second kind of order n.

‣ ynf(n, x) returns NaN for n < 0.

‣ ynf(n, $\pm 0$) returns $-\infty$.

‣ ynf(n, x) returns NaN for x < 0.

‣ ynf(n, $+\infty$) returns +0.

‣ ynf(n, NaN) returns NaN.

Description
Calculate the value of the Bessel function of the second kind of order n for the input argument
x, $Y_{n}(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 231

Modules

1.5. Double Precision Mathematical Functions
This section describes double precision mathematical functions. To use these functions you do
not need to include any additional header files in your program.

device double acos (double x)

Calculate the arc cosine of the input argument.

Returns
Result will be in radians, in the interval [0, $\pi$] for x inside [-1, +1].

‣ acos(1) returns +0.

‣ acos(x) returns NaN for x outside [-1, +1].

Description
Calculate the principal value of the arc cosine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double acosh (double x)

Calculate the nonnegative inverse hyperbolic cosine of the input argument.

Returns
Result will be in the interval [0, $+\infty$].

‣ acosh(1) returns 0.
‣ acosh(x) returns NaN for x in the interval [$-\infty$, 1).

‣ acosh($+\infty$) returns $+\infty$.

Description
Calculate the nonnegative inverse hyperbolic cosine of the input argument x.

CUDA Math API vRelease Version | 232

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double asin (double x)

Calculate the arc sine of the input argument.

Returns
Result will be in radians, in the interval [$-\pi/2$, $+\pi/2$] for x inside [-1, +1].

‣ asin($\pm 0$) returns $\pm 0$.

‣ asin(x) returns NaN for x outside [-1, +1].

Description
Calculate the principal value of the arc sine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double asinh (double x)

Calculate the inverse hyperbolic sine of the input argument.

Returns
‣ asinh($\pm 0$) returns $\pm 0$.

‣ asinh($\pm \infty$) returns $\pm \infty$.

Description
Calculate the inverse hyperbolic sine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 233

Modules

device double atan (double x)

Calculate the arc tangent of the input argument.

Returns
Result will be in radians, in the interval [$-\pi/2$, $+\pi/2$].

‣ atan($\pm 0$) returns $\pm 0$.

‣ atan($\pm \infty$) returns $\pm \pi/2$.

Description
Calculate the principal value of the arc tangent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double atan2 (double y, double x)

Calculate the arc tangent of the ratio of first and second input arguments.

Returns
Result will be in radians, in the interval [$-\pi$, $+\pi$].

‣ atan2($\pm 0$, -0) returns $\pm \pi$.

‣ atan2($\pm 0$, +0) returns $\pm 0$.

‣ atan2($\pm 0$, x) returns $\pm \pi$ for x < 0.

‣ atan2($\pm 0$, x) returns $\pm 0$ for x > 0.

‣ atan2(y, $\pm 0$) returns $-\pi/2$ for y < 0.

‣ atan2(y, $\pm 0$) returns $\pi/2$ for y > 0.

‣ atan2($\pm y$, $-\infty$) returns $\pm \pi$ and atan2($\pm y$, $+\infty$) returns $\pm 0$ for finite y > 0.

‣ atan2($\pm \infty$, x) returns $\pm \pi/2$ for finite x.

‣ atan2($\pm \infty$, $+\infty$) returns $\pm \pi/4$, and atan2($\pm \infty$, $-\infty$) returns $\pm 3\pi/4$.

CUDA Math API vRelease Version | 234

Modules

Description
Calculate the principal value of the arc tangent of the ratio of first and second input arguments
y / x. The quadrant of the result is determined by the signs of inputs y and x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double atanh (double x)

Calculate the inverse hyperbolic tangent of the input argument.

Returns
‣ atanh($\pm 0$) returns $\pm 0$.

‣ atanh($\pm 1$) returns $\pm \infty$.

‣ atanh(x) returns NaN for x outside interval [-1, 1].

Description
Calculate the inverse hyperbolic tangent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double cbrt (double x)

Calculate the cube root of the input argument.

Returns
Returns $x^{1/3}$.

‣ cbrt($\pm 0$) returns $\pm 0$, and cbrt($\pm \infty$) returns $\pm \infty$.

Description
Calculate the cube root of x, $x^{1/3}$.

CUDA Math API vRelease Version | 235

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double ceil (double x)

Calculate ceiling of the input argument.

Returns
Returns $\lceil x \rceil$ expressed as a floating-point number.

‣ ceil($\pm 0$) returns $\pm 0$, and ceil($\pm \infty$) returns $\pm \infty$.

Description
Compute the smallest integer value not less than x.

device double copysign (double x, double y)

Create value with given magnitude, copying sign of second value.

Returns
Returns a value with the magnitude of x and the sign of y.

Description
Create a floating-point value with the magnitude x and the sign of y.

device double cos (double x)

Calculate the cosine of the input argument.

Returns
‣ cos($\pm 0$) returns 1.

‣ cos($\pm \infty$) returns NaN.

Description
Calculate the cosine of the input argument x (measured in radians).

CUDA Math API vRelease Version | 236

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double cosh (double x)

Calculate the hyperbolic cosine of the input argument.

Returns
‣ cosh($\pm 0$) returns 1.

‣ cosh($\pm \infty$) returns $+\infty$.

Description
Calculate the hyperbolic cosine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double cospi (double x)

Calculate the cosine of the input argument $\times \pi$.

Returns
‣ cospi($\pm 0$) returns 1.

‣ cospi($\pm \infty$) returns NaN.

Description
Calculate the cosine of x $\times \pi$ (measured in radians), where x is the input argument.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 237

Modules

device double cyl_bessel_i0 (double x)

Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input
argument.

Returns
Returns the value of the regular modified cylindrical Bessel function of order 0.

Description
Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input
argument x, $I_{0}(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double cyl_bessel_i1 (double x)

Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input
argument.

Returns
Returns the value of the regular modified cylindrical Bessel function of order 1.

Description
Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input
argument x, $I_{1}(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double erf (double x)

Calculate the error function of the input argument.

Returns
‣ erf($\pm 0$) returns $\pm 0$.

‣ erf($\pm \infty$) returns $\pm 1$.

CUDA Math API vRelease Version | 238

Modules

Description

Calculate the value of the error function for the input argument x, $\frac{2}{\sqrt{\pi}} \int_{0}^{x} e^{-t^{2}} \, dt$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double erfc (double x)

Calculate the complementary error function of the input argument.

Returns
‣ erfc($-\infty$) returns 2.

‣ erfc($+\infty$) returns +0.

Description
Calculate the complementary error function of the input argument x, 1 - erf(x).

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double erfcinv (double x)

Calculate the inverse complementary error function of the input argument.

Returns
‣ erfcinv($\pm 0$) returns $+\infty$.

‣ erfcinv(2) returns $-\infty$.

‣ erfcinv(x) returns NaN for x outside [0, 2].

Description
Calculate the inverse complementary error function $\mathrm{erfc}^{-1}(x)$, of the input argument x in the
interval [0, 2].

CUDA Math API vRelease Version | 239

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double erfcx (double x)

Calculate the scaled complementary error function of the input argument.

Returns
‣ erfcx($-\infty$) returns $+\infty$.

‣ erfcx($+\infty$) returns +0.

Description
Calculate the scaled complementary error function of the input argument x, $e^{x^{2}} \cdot \mathrm{erfc}(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double erfinv (double x)

Calculate the inverse error function of the input argument.

Returns
‣ erfinv($\pm 0$) returns $\pm 0$.

‣ erfinv(1) returns $+\infty$.

‣ erfinv(-1) returns $-\infty$.

‣ erfinv(x) returns NaN for x outside [-1, +1].

Description
Calculate the inverse error function $\mathrm{erf}^{-1}(x)$, of the input argument x in the interval [-1, 1].

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 240

Modules

device double exp (double x)

Calculate the base $e$ exponential of the input argument.

Returns
‣ exp($\pm 0$) returns 1.

‣ exp($-\infty$) returns +0.

‣ exp($+\infty$) returns $+\infty$.

Description
Calculate $e^{x}$, the base $e$ exponential of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double exp10 (double x)

Calculate the base 10 exponential of the input argument.

Returns
‣ exp10($\pm 0$) returns 1.

‣ exp10($-\infty$) returns +0.

‣ exp10($+\infty$) returns $+\infty$.

Description
Calculate $10^{x}$, the base 10 exponential of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 241

Modules

device double exp2 (double x)

Calculate the base 2 exponential of the input argument.

Returns
‣ exp2($\pm 0$) returns 1.

‣ exp2($-\infty$) returns +0.

‣ exp2($+\infty$) returns $+\infty$.

Description
Calculate $2^{x}$, the base 2 exponential of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double expm1 (double x)

Calculate the base $e$ exponential of the input argument, minus 1.

Returns
‣ expm1($\pm 0$) returns $\pm 0$.

‣ expm1($-\infty$) returns -1.

‣ expm1($+\infty$) returns $+\infty$.

Description
Calculate $e^{x} - 1$, the base $e$ exponential of the input argument x, minus 1.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 242

Modules

device double fabs (double x)

Calculate the absolute value of the input argument.

Returns
Returns the absolute value of the input argument.

‣ fabs($\pm \infty$) returns $+\infty$.

‣ fabs($\pm 0$) returns +0.

Description
Calculate the absolute value of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double fdim (double x, double y)

Compute the positive difference between x and y.

Returns
Returns the positive difference between x and y.

‣ fdim(x, y) returns x - y if x > y.

‣ fdim(x, y) returns +0 if x $\leq$ y.

Description
Compute the positive difference between x and y. The positive difference is x - y when x > y
and +0 otherwise.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 243

Modules

device double floor (double x)

Calculate the largest integer less than or equal to x.

Returns
Returns $\lfloor x \rfloor$ expressed as a floating-point number.

‣ floor($\pm 0$) returns $\pm 0$, and floor($\pm \infty$) returns $\pm \infty$.

Description
Calculates the largest integer value which is less than or equal to x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double fma (double x, double y, double z)

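Compute $x \times y + z$ as a single operation.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fma($\pm \infty$, $\pm 0$, z) and fma($\pm 0$, $\pm \infty$, z) return NaN.

‣ fma(x, y, $-\infty$) returns NaN if $x \times y$ is an exact $+\infty$, and fma(x, y, $+\infty$) returns NaN if $x \times y$ is an exact $-\infty$.

Description
Compute the value of $x \times y + z$ as a single ternary operation. After computing the value to
infinite precision, the value is rounded once.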

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 244

Modules

device double fmax (double x, double y)

Determine the maximum numeric value of the arguments.

Returns
Returns the maximum numeric value of the arguments x and y.

‣ If both arguments are NaN, returns NaN.

‣ If one argument is NaN, returns the numeric argument.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double fmin (double x, double y)

Determine the minimum numeric value of the arguments.

Returns
Returns the minimum numeric value of the arguments x and y.

‣ If both arguments are NaN, returns NaN.

‣ If one argument is NaN, returns the numeric argument.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 245

Modules

device double fmod (double x, double y)

Calculate the double-precision floating-point remainder of x / y.

Returns
‣ Returns the floating-point remainder of x / y.
‣ fmod($\pm 0$, y) returns $\pm 0$ if y is not zero.

‣ fmod(x, $\pm \infty$) returns x if x is finite.

‣ fmod(x, y) returns NaN if x is $\pm \infty$ or y is zero.

‣ If either argument is NaN, NaN is returned.

Description
Calculate the double-precision floating-point remainder of x / y. The floating-point remainder
of the division operation x / y calculated by this function is exactly the value x - n*y, where
n is x / y with its fractional part truncated. The computed value will have the same sign as x,
and its magnitude will be less than the magnitude of y.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double frexp (double x, int *nptr)

Extract mantissa and exponent of a floating-point value.

Returns
Returns the fractional component m.

‣ frexp($\pm 0$, nptr) returns $\pm 0$ and stores zero in the location pointed to by nptr.

‣ frexp($\pm \infty$, nptr) returns $\pm \infty$ and stores an unspecified value in the location to which
nptr points.

‣ frexp(NaN, nptr) returns a NaN and stores an unspecified value in the location to which nptr
points.

Description
Decompose the floating-point value x into a component m for the normalized fraction element
and another term n for the exponent. The absolute value of m will be greater than or equal to

CUDA Math API vRelease Version | 246

Modules

0.5 and less than 1.0 or it will be equal to 0; $x = m \cdot 2^{n}$. The integer exponent n will be stored in
the location to which nptr points.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double hypot (double x, double y)

Calculate the square root of the sum of squares of two arguments.

Returns
Returns the length of the hypotenuse $\sqrt{x^{2} + y^{2}}$.

‣ hypot(x,y), hypot(y,x), and hypot(x, -y) are equivalent.

‣ hypot(x, $\pm 0$) is equivalent to fabs(x).

‣ hypot($\pm \infty$, y) returns $+\infty$, even if y is a NaN.

Description
Calculate the length of the hypotenuse of a right triangle whose two sides have lengths x and
y without undue overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device int ilogb (double x)

Compute the unbiased integer exponent of the argument.

Returns
‣ If successful, returns the unbiased exponent of the argument.
‣ ilogb($\pm 0$) returns INT_MIN.

‣ ilogb(NaN) returns INT_MIN.

‣ ilogb($\pm \infty$) returns INT_MAX.

‣ Note: above behavior does not take into account FP_ILOGB0 nor FP_ILOGBNAN.

CUDA Math API vRelease Version | 247

Modules

Description
Calculates the unbiased integer exponent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device __RETURN_TYPE isfinite (double a)

Determine whether argument is finite.

Returns
‣ With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns true if and only
if a is a finite value.

‣ With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value if and only if
a is a finite value.

Description
Determine whether the floating-point value a is a finite value (zero, subnormal, or normal and
not infinity or NaN).

device __RETURN_TYPE isinf (double a)

Determine whether argument is infinite.

Returns
‣ With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns true if and only
if a is an infinite value.

‣ With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value if and only if
a is an infinite value.

Description
Determine whether the floating-point value a is an infinite value (positive or negative).

CUDA Math API vRelease Version | 248

Modules

device __RETURN_TYPE isnan (double a)

Determine whether argument is a NaN.

Returns
‣ With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns true if and only
if a is a NaN value.

‣ With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value if and only if
a is a NaN value.

Description
Determine whether the floating-point value a is a NaN.

device double j0 (double x)

Calculate the value of the Bessel function of the first kind of order 0 for the input argument.

Returns
Returns the value of the Bessel function of the first kind of order 0.

‣ j0($\pm \infty$) returns +0.

‣ j0(NaN) returns NaN.

Description
Calculate the value of the Bessel function of the first kind of order 0 for the input argument x,
$J_{0}(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double j1 (double x)

Calculate the value of the Bessel function of the first kind of order 1 for the input argument.

Returns
Returns the value of the Bessel function of the first kind of order 1.

‣ j1($\pm 0$) returns $\pm 0$.

CUDA Math API vRelease Version | 249

Modules

‣ j1(NaN) returns NaN.

Description
Calculate the value of the Bessel function of the first kind of order 1 for the input argument x,
$J_{1}(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double jn (int n, double x)

Calculate the value of the Bessel function of the first kind of order n for the input argument.

Returns
Returns the value of the Bessel function of the first kind of order n.

‣ jn(n, NaN) returns NaN.

‣ jn(n, x) returns NaN for n < 0.
‣ jn(n, $\pm \infty$) returns +0.

Description
Calculate the value of the Bessel function of the first kind of order n for the input argument x,
$J_{n}(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double ldexp (double x, int exp)

Calculate the value of $x \cdot 2^{exp}$.

Returns
‣ ldexp(x, exp) is equivalent to scalbn(x, exp).

Description
Calculate the value of $x \cdot 2^{exp}$ of the input arguments x and exp.

CUDA Math API vRelease Version | 250

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double lgamma (double x)

Calculate the natural logarithm of the absolute value of the gamma function of the input
argument.

Returns
‣ lgamma(1) returns +0.
‣ lgamma(2) returns +0.
‣ lgamma(x) returns $+\infty$ if x $\leq$ 0 and x is an integer.

‣ lgamma($\pm \infty$) returns $+\infty$.

Description
Calculate the natural logarithm of the absolute value of the gamma function of the input
argument x, namely the value of $\log_{e} \left| \int_{0}^{\infty} e^{-t} t^{x-1} \, dt \right|$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device long long int llrint (double x)

Round input to nearest integer value.

Returns
Returns rounded integer value.

Description
Round x to the nearest integer value, with halfway cases rounded to the nearest even integer
value. If the result is outside the range of the return type, the behavior is undefined.

CUDA Math API vRelease Version | 251

Modules

device long long int llround (double x)

Round to nearest integer value.

Returns
Returns rounded integer value.

Description
Round x to the nearest integer value, with halfway cases rounded away from zero. If the result
is outside the range of the return type, the behavior is undefined.

Note:

This function may be slower than alternate rounding methods. See llrint().

device double log (double x)

Calculate the base $e$ logarithm of the input argument.

Returns
‣ log($\pm 0$) returns $-\infty$.

‣ log(1) returns +0.

‣ log(x) returns NaN for x < 0.
‣ log($+\infty$) returns $+\infty$.

Description
Calculate the base $e$ logarithm of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double log10 (double x)

Calculate the base 10 logarithm of the input argument.

Returns
‣ log10($\pm 0$) returns $-\infty$.

CUDA Math API vRelease Version | 252

Modules

‣ log10(1) returns +0.

‣ log10(x) returns NaN for x < 0.
‣ log10($+\infty$) returns $+\infty$.

Description
Calculate the base 10 logarithm of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double log1p (double x)

Calculate the value of $\log_{e}(1 + x)$.

Returns
‣ log1p($\pm 0$) returns $\pm 0$.

‣ log1p(-1) returns $-\infty$.

‣ log1p(x) returns NaN for x < -1.

‣ log1p($+\infty$) returns $+\infty$.

Description
Calculate the value of $\log_{e}(1 + x)$ of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double log2 (double x)

Calculate the base 2 logarithm of the input argument.

Returns
‣ log2($\pm 0$) returns $-\infty$.

‣ log2(1) returns +0.

CUDA Math API vRelease Version | 253

Modules

‣ log2(x) returns NaN for x < 0.

‣ log2($+\infty$) returns $+\infty$.

Description
Calculate the base 2 logarithm of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double logb (double x)

Calculate the floating-point representation of the exponent of the input argument.

Returns
‣ logb($\pm 0$) returns $-\infty$.

‣ logb($\pm \infty$) returns $+\infty$.

Description
Calculate the floating-point representation of the exponent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device long int lrint (double x)

Round input to nearest integer value.

Returns
Returns rounded integer value.

Description
Round x to the nearest integer value, with halfway cases rounded to the nearest even integer
value. If the result is outside the range of the return type, the behavior is undefined.

CUDA Math API vRelease Version | 254

Modules

device long int lround (double x)

Round to nearest integer value.

Returns
Returns rounded integer value.

Description
Round x to the nearest integer value, with halfway cases rounded away from zero. If the result
is outside the range of the return type, the behavior is undefined.

Note:

This function may be slower than alternate rounding methods. See lrint().

device double max (const double a, const float b)

Calculate the maximum value of the input double and float arguments.

Description
Convert float argument b to double, followed by fmax().
Note, this is different from std:: specification

device double max (const float a, const double b)

Calculate the maximum value of the input float and double arguments.

Description
Convert float argument a to double, followed by fmax().
Note, this is different from std:: specification

device double max (const double a, const double b)

Calculate the maximum value of the input double arguments.

Description
Calculate the maximum value of the arguments a and b. Behavior is equivalent to fmax()
function.
Note, this is different from std:: specification

CUDA Math API vRelease Version | 255

Modules

device double min (const double a, const float b)

Calculate the minimum value of the input double and float arguments.

Description
Convert float argument b to double, followed by fmin().
Note, this is different from std:: specification

device double min (const float a, const double b)

Calculate the minimum value of the input float and double arguments.

Description
Convert float argument a to double, followed by fmin().
Note, this is different from std:: specification

device double min (const double a, const double b)

Calculate the minimum value of the input double arguments.

Description
Calculate the minimum value of the arguments a and b. Behavior is equivalent to fmin()
function.
Note, this is different from std:: specification

device double modf (double x, double *iptr)

Break down the input argument into fractional and integral parts.

Returns
‣ modf($\pm x$, iptr) returns a result with the same sign as x.

‣ modf($\pm \infty$, iptr) returns $\pm 0$ and stores $\pm \infty$ in the object pointed to by iptr.

‣ modf(NaN, iptr) stores a NaN in the object pointed to by iptr and returns a NaN.

Description
Break down the argument x into fractional and integral parts. The integral part is stored in the
argument iptr. Fractional and integral parts are given the same sign as the argument x.

CUDA Math API vRelease Version | 256

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double nan (const char *tagp)

Returns "Not a Number" value.

Returns
‣ nan(tagp) returns NaN.

Description
Return a representation of a quiet NaN. Argument tagp selects one of the possible
representations.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double nearbyint (double x)

Round the input argument to the nearest integer.

Returns
‣ nearbyint($\pm 0$) returns $\pm 0$.

‣ nearbyint($\pm \infty$) returns $\pm \infty$.

Description
Round argument x to an integer value in double precision floating-point format. Uses round to
nearest rounding, with ties rounding to even.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 257

Modules

device double nextafter (double x, double y)

Return next representable double-precision floating-point value after argument x in the
direction of y.

Returns
‣ nextafter(x, y) = y if x equals y.
‣ nextafter(x, y) = NaN if either x or y is NaN.

Description
Calculate the next representable double-precision floating-point value following x in
the direction of y. For example, if y is greater than x, nextafter() returns the smallest
representable number greater than x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double norm (int dim, const double *p)

Calculate the square root of the sum of squares of any number of coordinates.

Returns
Returns the length of the dim-D vector $\sqrt{p_{0}^{2} + p_{1}^{2} + \dots + p_{dim-1}^{2}}$.

‣ In the presence of an exactly infinite coordinate $+\infty$ is returned, even if there are NaNs.

Description
Calculate the length of a vector p, dimension of which is passed as an argument without
undue overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 258

Modules

device double norm3d (double a, double b, double c)

Calculate the square root of the sum of squares of three coordinates of the argument.

Returns
Returns the length of 3D vector $\sqrt{a^{2} + b^{2} + c^{2}}$.

‣ In the presence of an exactly infinite coordinate $+\infty$ is returned, even if there are NaNs.

Description
Calculate the length of three dimensional vector in Euclidean space without undue overflow or
underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double norm4d (double a, double b,

double c, double d)
Calculate the square root of the sum of squares of four coordinates of the argument.

Returns
Returns the length of 4D vector $\sqrt{a^2 + b^2 + c^2 + d^2}$.

‣ In the presence of an exactly infinite coordinate $+\infty$ is returned, even if there are NaNs.

Description
Calculate the length of four dimensional vector in Euclidean space without undue overflow or
underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 259

Modules

device double normcdf (double x)

Calculate the standard normal cumulative distribution function.

Returns
‣ normcdf($+\infty$) returns 1.

‣ normcdf($-\infty$) returns +0.

Description
Calculate the cumulative distribution function of the standard normal distribution for input
argument x, $\Phi(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double normcdfinv (double x)

Calculate the inverse of the standard normal cumulative distribution function.

Returns
‣ normcdfinv($\pm 0$) returns $-\infty$.

‣ normcdfinv(1) returns $+\infty$.

‣ normcdfinv(x) returns NaN if x is not in the interval [0,1].

Description
Calculate the inverse of the standard normal cumulative distribution function for input
argument x, $\Phi^{-1}(x)$. The function is defined for input values in the interval $(0, 1)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 260

Modules

device double pow (double x, double y)

Calculate the value of first argument to the power of second argument.

Returns
‣ pow($\pm 0$, y) returns $\pm\infty$ for y an odd integer less than 0.

‣ pow($\pm 0$, y) returns $+\infty$ for y less than 0 and not an odd integer.

‣ pow($\pm 0$, y) returns $\pm 0$ for y an odd integer greater than 0.

‣ pow($\pm 0$, y) returns +0 for y > 0 and not an odd integer.

‣ pow(-1, $\pm\infty$) returns 1.

‣ pow(+1, y) returns 1 for any y, even a NaN.

‣ pow(x, $\pm 0$) returns 1 for any x, even a NaN.

‣ pow(x, y) returns a NaN for finite x < 0 and finite non-integer y.

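‣ pow(x, $-\infty$) returns $+\infty$ for $|x| < 1$.

‣ pow(x, $-\infty$) returns +0 for $|x| > 1$.

‣ pow(x, $+\infty$) returns +0 for $|x| < 1$.

‣ pow(x, $+\infty$) returns $+\infty$ for $|x| > 1$.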

‣ pow($-\infty$, y) returns -0 for y an odd integer less than 0.

‣ pow($-\infty$, y) returns +0 for y < 0 and not an odd integer.

‣ pow($-\infty$, y) returns $-\infty$ for y an odd integer greater than 0.

‣ pow($-\infty$, y) returns $+\infty$ for y > 0 and not an odd integer.

‣ pow($+\infty$, y) returns +0 for y < 0.

‣ pow($+\infty$, y) returns $+\infty$ for y > 0.

Description
Calculate the value of x to the power of y.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 261

Modules

device double rcbrt (double x)

Calculate reciprocal cube root function.

Returns
‣ rcbrt($\pm 0$) returns $\pm\infty$.

‣ rcbrt($\pm\infty$) returns $\pm 0$.

Description
Calculate reciprocal cube root function of x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double remainder (double x, double y)

Compute double-precision floating-point remainder.

Returns
‣ remainder(x, $\pm 0$) returns NaN.

‣ remainder($\pm\infty$, y) returns NaN.

‣ remainder(x, $\pm\infty$) returns x for finite x.

Description
Compute double-precision floating-point remainder r of dividing x by y for nonzero y. Thus
$r = x - n y$. The value n is the integer value nearest $\frac{x}{y}$. In the case when
$|n - \frac{x}{y}| = \frac{1}{2}$, the even n value is chosen.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 262

Modules

device double remquo (double x, double y, int

*quo)
Compute double-precision floating-point remainder and part of quotient.

Returns
Returns the remainder.

‣ remquo(x, $\pm 0$, quo) returns NaN and stores an unspecified value in the location to which
quo points.

‣ remquo($\pm\infty$, y, quo) returns NaN and stores an unspecified value in the location to
which quo points.

‣ remquo(x, y, quo) returns NaN and stores an unspecified value in the location to which
quo points if either of x or y is NaN.

‣ remquo(x, $\pm\infty$, quo) returns x and stores zero in the location to which quo points for
finite x.

Description
Compute a double-precision floating-point remainder in the same way as the remainder()
function. Argument quo returns part of quotient upon division of x by y. Value quo has the
same sign as $\frac{x}{y}$ and may not be the exact quotient but agrees with the exact quotient in the
low order 3 bits.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double rhypot (double x, double y)

Calculate one over the square root of the sum of squares of two arguments.

Returns
Returns one over the length of the hypotenuse $\frac{1}{\sqrt{x^2 + y^2}}$.

‣ rhypot(x,y), rhypot(y,x), and rhypot(x, -y) are equivalent.

‣ rhypot($\pm\infty$, y) returns +0, even if y is a NaN.

CUDA Math API vRelease Version | 263

Modules

Description
Calculate one over the length of the hypotenuse of a right triangle whose two sides have
lengths x and y without undue overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double rint (double x)

Round to nearest integer value in floating-point.

Returns
Returns rounded integer value.

‣ rint($\pm 0$) returns $\pm 0$.

Description
Round x to the nearest integer value in floating-point format, with halfway cases rounded to
the nearest even integer value.

device double rnorm (int dim, const double *p)

Calculate the reciprocal of square root of the sum of squares of any number of coordinates.

Returns
Returns one over the length of the vector $\frac{1}{\sqrt{\sum_{i=0}^{dim-1} p_i^2}}$.

‣ In the presence of an exactly infinite coordinate +0 is returned, even if there are NaNs.

Description
Calculates one over the length of vector p, dimension of which is passed as an argument, in
Euclidean space without undue overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 264

Modules

device double rnorm3d (double a, double b,

double c)
Calculate one over the square root of the sum of squares of three coordinates.

Returns
Returns one over the length of the 3D vector $\frac{1}{\sqrt{a^2 + b^2 + c^2}}$.

‣ In the presence of an exactly infinite coordinate +0 is returned, even if there are NaNs.

Description
Calculate one over the length of three dimensional vector in Euclidean space without undue
overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double rnorm4d (double a, double b,

double c, double d)
Calculate one over the square root of the sum of squares of four coordinates.

Returns
Returns one over the length of the 4D vector $\frac{1}{\sqrt{a^2 + b^2 + c^2 + d^2}}$.

‣ In the presence of an exactly infinite coordinate +0 is returned, even if there are NaNs.

Description
Calculate one over the length of four dimensional vector in Euclidean space without undue
overflow or underflow.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 265

Modules

device double round (double x)

Round to nearest integer value in floating-point.

Returns
Returns rounded integer value.

‣ round($\pm 0$) returns $\pm 0$.

Description
Round x to the nearest integer value in floating-point format, with halfway cases rounded
away from zero.

Note:

This function may be slower than alternate rounding methods. See rint().

device double rsqrt (double x)

Calculate the reciprocal of the square root of the input argument.

Returns
Returns $\frac{1}{\sqrt{x}}$.

‣ rsqrt($+\infty$) returns +0.

‣ rsqrt($\pm 0$) returns $\pm\infty$.

‣ rsqrt(x) returns NaN if x is less than 0.

Description

Calculate the reciprocal of the nonnegative square root of x, $\frac{1}{\sqrt{x}}$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 266

Modules

device double scalbln (double x, long int n)

Scale floating-point input by integer power of two.

Returns
Returns x * $2^n$.

‣ scalbln($\pm 0$, n) returns $\pm 0$.

‣ scalbln(x, 0) returns x.
‣ scalbln($\pm\infty$, n) returns $\pm\infty$.

Description
Scale x by $2^n$ by efficient manipulation of the floating-point exponent.

device double scalbn (double x, int n)

Scale floating-point input by integer power of two.

Returns
Returns x * $2^n$.

‣ scalbn($\pm 0$, n) returns $\pm 0$.

‣ scalbn(x, 0) returns x.
‣ scalbn($\pm\infty$, n) returns $\pm\infty$.

Description
Scale x by $2^n$ by efficient manipulation of the floating-point exponent.

device __RETURN_TYPE signbit (double a)

Return the sign bit of the input.

Returns
Reports the sign bit of all values including infinities, zeros, and NaNs.

‣ With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns true if and only
if a is negative.

‣ With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value if and only if
a is negative.

CUDA Math API vRelease Version | 267

Modules

Description
Determine whether the floating-point value a is negative.

device double sin (double x)

Calculate the sine of the input argument.

Returns
‣ sin($\pm 0$) returns $\pm 0$.

‣ sin($\pm\infty$) returns NaN.

Description
Calculate the sine of the input argument x (measured in radians).

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device void sincos (double x, double *sptr, double

*cptr)
Calculate the sine and cosine of the first input argument.

Returns
‣ none

CUDA Math API vRelease Version | 268

Modules

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device void sincospi (double x, double *sptr,

double *cptr)
Calculate the sine and cosine of the first input argument $\times \pi$.

Returns
‣ none

device double sinh (double x)

Calculate the hyperbolic sine of the input argument.

Returns
‣ sinh($\pm 0$) returns $\pm 0$.

‣ sinh($\pm\infty$) returns $\pm\infty$.

Description
Calculate the hyperbolic sine of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 269

Modules

device double sinpi (double x)

Calculate the sine of the input argument $\times \pi$.

Returns
‣ sinpi($\pm 0$) returns $\pm 0$.

‣ sinpi($\pm\infty$) returns NaN.

Description
Calculate the sine of $x \times \pi$ (measured in radians), where x is the input argument.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double sqrt (double x)

Calculate the square root of the input argument.

Returns
Returns $\sqrt{x}$.

‣ sqrt($\pm 0$) returns $\pm 0$.

‣ sqrt(x) returns NaN if x is less than 0.

Description

Calculate the nonnegative square root of x, $\sqrt{x}$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 270

Modules

device double tan (double x)

Calculate the tangent of the input argument.

Returns
‣ tan($\pm 0$) returns $\pm 0$.

‣ tan($\pm\infty$) returns NaN.

Description
Calculate the tangent of the input argument x (measured in radians).

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double tanh (double x)

Calculate the hyperbolic tangent of the input argument.

Returns
‣ tanh($\pm 0$) returns $\pm 0$.

‣ tanh($\pm\infty$) returns $\pm 1$.

Description
Calculate the hyperbolic tangent of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double tgamma (double x)

Calculate the gamma function of the input argument.

Returns
‣ tgamma($\pm 0$) returns $\pm\infty$.

‣ tgamma(2) returns +1.

CUDA Math API vRelease Version | 271

Modules

‣ tgamma(x) returns NaN if x < 0 and x is an integer.

‣ tgamma($-\infty$) returns NaN.

‣ tgamma($+\infty$) returns $+\infty$.

Description

Calculate the gamma function of the input argument x, namely the value of $\int_{0}^{\infty} e^{-t} t^{x-1} \, dt$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double trunc (double x)

Truncate input argument to the integral part.

Returns
Returns truncated integer value.

‣ trunc($\pm 0$) returns $\pm 0$.

Description
Round x to the nearest integer value that does not exceed x in magnitude.

device double y0 (double x)

Calculate the value of the Bessel function of the second kind of order 0 for the input argument.

Returns
Returns the value of the Bessel function of the second kind of order 0.

‣ y0($\pm 0$) returns $-\infty$.

‣ y0(x) returns NaN for x < 0.

‣ y0($+\infty$) returns +0.

‣ y0(NaN) returns NaN.

CUDA Math API vRelease Version | 272

Modules

Description
Calculate the value of the Bessel function of the second kind of order 0 for the input argument
x, $Y_0(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double y1 (double x)

Calculate the value of the Bessel function of the second kind of order 1 for the input argument.

Returns
Returns the value of the Bessel function of the second kind of order 1.

‣ y1($\pm 0$) returns $-\infty$.

‣ y1(x) returns NaN for x < 0.

‣ y1($+\infty$) returns +0.

‣ y1(NaN) returns NaN.

Description
Calculate the value of the Bessel function of the second kind of order 1 for the input argument
x, $Y_1(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double yn (int n, double x)

Calculate the value of the Bessel function of the second kind of order n for the input argument.

Returns
Returns the value of the Bessel function of the second kind of order n.

‣ yn(n, x) returns NaN for n < 0.

‣ yn(n, $\pm 0$) returns $-\infty$.

CUDA Math API vRelease Version | 273

Modules

‣ yn(n, x) returns NaN for x < 0.

‣ yn(n, $+\infty$) returns +0.

‣ yn(n, NaN) returns NaN.

Description
Calculate the value of the Bessel function of the second kind of order n for the input argument
x, $Y_n(x)$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

1.6. Integer Mathematical Functions

This section describes integer mathematical functions. To use these functions you do not need
to include any additional header files in your program.

device int abs (int a)

Calculate the absolute value of the input int argument.

Description
Calculate the absolute value of the input argument a.

device long int labs (long int a)

Calculate the absolute value of the input long int argument.

Description
Calculate the absolute value of the input argument a.

device long long int llabs (long long int a)

Calculate the absolute value of the input long long int argument.

Description
Calculate the absolute value of the input argument a.

CUDA Math API vRelease Version | 274

Modules

device long long int llmax (const long long int a,

const long long int b)
Calculate the maximum value of the input long long int arguments.

Description
Calculate the maximum value of the arguments a and b.

device long long int llmin (const long long int a,

const long long int b)
Calculate the minimum value of the input long long int arguments.

Description
Calculate the minimum value of the arguments a and b.

device unsigned long long int max (const

unsigned long long int a, const long long int b)
Calculate the maximum value of the input unsigned long long int and long long int
arguments.

Description
Calculate the maximum value of the arguments a and b, perform integer promotion first.

device unsigned long long int max (const long

long int a, const unsigned long long int b)
Calculate the maximum value of the input long long int and unsigned long long int
arguments.

Description
Calculate the maximum value of the arguments a and b, perform integer promotion first.

CUDA Math API vRelease Version | 275

Modules

device unsigned long long int max (const

unsigned long long int a, const unsigned long long int
b)
Calculate the maximum value of the input unsigned long long int arguments.

Description
Calculate the maximum value of the arguments a and b.

device long long int max (const long long int a,

const long long int b)
Calculate the maximum value of the input long long int arguments.

Description
Calculate the maximum value of the arguments a and b.

device unsigned long int max (const unsigned

long int a, const long int b)
Calculate the maximum value of the input unsigned long int and long int arguments.

Description
Calculate the maximum value of the arguments a and b, perform integer promotion first.

device unsigned long int max (const long int a,

const unsigned long int b)
Calculate the maximum value of the input long int and unsigned long int arguments.

Description
Calculate the maximum value of the arguments a and b, perform integer promotion first.

device unsigned long int max (const unsigned

long int a, const unsigned long int b)
Calculate the maximum value of the input unsigned long int arguments.

Description
Calculate the maximum value of the arguments a and b.

CUDA Math API vRelease Version | 276

Modules

device long int max (const long int a, const long

int b)
Calculate the maximum value of the input long int arguments.

Description
Calculate the maximum value of the arguments a and b.

device unsigned int max (const unsigned int a,

const int b)
Calculate the maximum value of the input unsigned int and int arguments.

Description
Calculate the maximum value of the arguments a and b, perform integer promotion first.

device unsigned int max (const int a, const

unsigned int b)
Calculate the maximum value of the input int and unsigned int arguments.

Description
Calculate the maximum value of the arguments a and b, perform integer promotion first.

device unsigned int max (const unsigned int a,

const unsigned int b)
Calculate the maximum value of the input unsigned int arguments.

Description
Calculate the maximum value of the arguments a and b.

device int max (const int a, const int b)

Calculate the maximum value of the input int arguments.

Description
Calculate the maximum value of the arguments a and b.

CUDA Math API vRelease Version | 277

Modules

device unsigned long long int min (const

unsigned long long int a, const long long int b)
Calculate the minimum value of the input unsigned long long int and long long int
arguments.

Description
Calculate the minimum value of the arguments a and b, perform integer promotion first.

device unsigned long long int min (const long

long int a, const unsigned long long int b)
Calculate the minimum value of the input long long int and unsigned long long int
arguments.

Description
Calculate the minimum value of the arguments a and b, perform integer promotion first.

device unsigned long long int min (const

unsigned long long int a, const unsigned long long int
b)
Calculate the minimum value of the input unsigned long long int arguments.

Description
Calculate the minimum value of the arguments a and b.

device long long int min (const long long int a,

const long long int b)
Calculate the minimum value of the input long long int arguments.

Description
Calculate the minimum value of the arguments a and b.

CUDA Math API vRelease Version | 278

Modules

device unsigned long int min (const unsigned

long int a, const long int b)
Calculate the minimum value of the input unsigned long int and long int arguments.

Description
Calculate the minimum value of the arguments a and b, perform integer promotion first.

device unsigned long int min (const long int a,

const unsigned long int b)
Calculate the minimum value of the input long int and unsigned long int arguments.

Description
Calculate the minimum value of the arguments a and b, perform integer promotion first.

device unsigned long int min (const unsigned

long int a, const unsigned long int b)
Calculate the minimum value of the input unsigned long int arguments.

Description
Calculate the minimum value of the arguments a and b.

device long int min (const long int a, const long

int b)
Calculate the minimum value of the input long int arguments.

Description
Calculate the minimum value of the arguments a and b.

device unsigned int min (const unsigned int a,

const int b)
Calculate the minimum value of the input unsigned int and int arguments.

Description
Calculate the minimum value of the arguments a and b, perform integer promotion first.

CUDA Math API vRelease Version | 279

Modules

device unsigned int min (const int a, const

unsigned int b)
Calculate the minimum value of the input int and unsigned int arguments.

Description
Calculate the minimum value of the arguments a and b, perform integer promotion first.

device unsigned int min (const unsigned int a,

const unsigned int b)
Calculate the minimum value of the input unsigned int arguments.

Description
Calculate the minimum value of the arguments a and b.

device int min (const int a, const int b)

Calculate the minimum value of the input int arguments.

Description
Calculate the minimum value of the arguments a and b.

device unsigned long long int ullmax (const

unsigned long long int a, const unsigned long long int
b)
Calculate the maximum value of the input unsigned long long int arguments.

Description
Calculate the maximum value of the arguments a and b.

device unsigned long long int ullmin (const

unsigned long long int a, const unsigned long long int
b)
Calculate the minimum value of the input unsigned long long int arguments.

Description
Calculate the minimum value of the arguments a and b.

CUDA Math API vRelease Version | 280

Modules

device unsigned int umax (const unsigned int a,

const unsigned int b)
Calculate the maximum value of the input unsigned int arguments.

Description
Calculate the maximum value of the arguments a and b.

device unsigned int umin (const unsigned int a,

const unsigned int b)
Calculate the minimum value of the input unsigned int arguments.

Description
Calculate the minimum value of the arguments a and b.

1.7. Single Precision Intrinsics

This section describes single precision intrinsic functions that are only supported in device
code. To use these functions you do not need to include any additional header files in your
program.

device float __cosf (float x)

Calculate the fast approximate cosine of the input argument.

Returns
Returns the approximate cosine of x.

Description
Calculate the fast approximate cosine of the input argument x, measured in radians.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

CUDA Math API vRelease Version | 281

Modules

device float __exp10f (float x)

Calculate the fast approximate base 10 exponential of the input argument.

Returns
Returns an approximation to $10^x$.

Description
Calculate the fast approximate base 10 exponential of the input argument x, $10^x$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

device float __expf (float x)

Calculate the fast approximate base $e$ exponential of the input argument.

Returns
Returns an approximation to $e^x$.

Description
Calculate the fast approximate base $e$ exponential of the input argument x, $e^x$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

device float __fadd_rd (float x, float y)

Add two floating-point values in round-down mode.

Returns
Returns x + y.

Description
Compute the sum of x and y in round-down (to negative infinity) mode.

CUDA Math API vRelease Version | 282

Modules

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __fadd_rn (float x, float y)

Add two floating-point values in round-to-nearest-even mode.

Returns
Returns x + y.

Description
Compute the sum of x and y in round-to-nearest-even rounding mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __fadd_ru (float x, float y)

Add two floating-point values in round-up mode.

Returns
Returns x + y.

Description
Compute the sum of x and y in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

CUDA Math API vRelease Version | 283

Modules

device float __fadd_rz (float x, float y)

Add two floating-point values in round-towards-zero mode.

Returns
Returns x + y.

Description
Compute the sum of x and y in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __fdiv_rd (float x, float y)

Divide two floating-point values in round-down mode.

Returns
Returns x / y.

Description
Divide two floating-point values x by y in round-down (to negative infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fdiv_rn (float x, float y)

Divide two floating-point values in round-to-nearest-even mode.

Returns
Returns x / y.

Description
Divide two floating-point values x by y in round-to-nearest-even mode.

CUDA Math API vRelease Version | 284

Modules

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fdiv_ru (float x, float y)

Divide two floating-point values in round-up mode.

Returns
Returns x / y.

Description
Divide two floating-point values x by y in round-up (to positive infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fdiv_rz (float x, float y)

Divide two floating-point values in round-towards-zero mode.

Returns
Returns x / y.

Description
Divide two floating-point values x by y in round-towards-zero mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fdividef (float x, float y)

Calculate the fast approximate division of the input arguments.

Returns
Returns x / y.

CUDA Math API vRelease Version | 285

Modules

‣ __fdividef($\infty$, y) returns NaN for $2^{126} < |y| < 2^{128}$.

‣ __fdividef(x, y) returns 0 for $2^{126} < |y| < 2^{128}$ and finite x.

Description
Calculate the fast approximate division of x by y.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

device float __fmaf_ieee_rd (float x, float y, float

z)
Compute fused multiply-add operation in round-down mode, ignore -ftz=true compiler
flag.

Description
Behavior is the same as __fmaf_rd(x, y, z), the difference is in handling denormalized inputs
and outputs: -ftz compiler flag has no effect.

device float __fmaf_ieee_rn (float x, float y, float

z)
Compute fused multiply-add operation in round-to-nearest-even mode, ignore -ftz=true
compiler flag.

Description
Behavior is the same as __fmaf_rn(x, y, z), the difference is in handling denormalized inputs
and outputs: -ftz compiler flag has no effect.

device float __fmaf_ieee_ru (float x, float y, float

z)
Compute fused multiply-add operation in round-up mode, ignore -ftz=true compiler flag.

Description
Behavior is the same as __fmaf_ru(x, y, z), the difference is in handling denormalized inputs
and outputs: -ftz compiler flag has no effect.

CUDA Math API vRelease Version | 286

Modules

device float __fmaf_ieee_rz (float x, float y, float

z)
Compute fused multiply-add operation in round-towards-zero mode, ignore -ftz=true
compiler flag.

Description
Behavior is the same as __fmaf_rz(x, y, z), the difference is in handling denormalized inputs
and outputs: -ftz compiler flag has no effect.

device float __fmaf_rd (float x, float y, float z)

Compute $x \times y + z$ as a single operation, in round-down mode.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fmaf($\pm\infty$, $\pm 0$, z) returns NaN.

‣ fmaf(x, y, $\mp\infty$) returns NaN if $x \times y$ is an exact $\pm\infty$.

Description
Computes the value of $x \times y + z$ as a single ternary operation, rounding the result once in
round-down (to negative infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fmaf_rn (float x, float y, float z)

Compute $x \times y + z$ as a single operation, in round-to-nearest-even mode.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fmaf($\pm\infty$, $\pm 0$, z) returns NaN.

CUDA Math API vRelease Version | 287

Modules

‣ fmaf(x, y, $\mp\infty$) returns NaN if $x \times y$ is an exact $\pm\infty$.

Description
Computes the value of $x \times y + z$ as a single ternary operation, rounding the result once in
round-to-nearest-even mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fmaf_ru (float x, float y, float z)

Compute $x \times y + z$ as a single operation, in round-up mode.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fmaf($\pm\infty$, $\pm 0$, z) returns NaN.

‣ fmaf(x, y, $\mp\infty$) returns NaN if $x \times y$ is an exact $\pm\infty$.

Description
Computes the value of $x \times y + z$ as a single ternary operation, rounding the result once in
round-up (to positive infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fmaf_rz (float x, float y, float z)

Compute $x \times y + z$ as a single operation, in round-towards-zero mode.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fmaf($\pm\infty$, $\pm 0$, z) returns NaN.

CUDA Math API vRelease Version | 288

Modules

‣ fmaf($\pm 0$, $\pm\infty$, z) returns NaN.

‣ fmaf(x, y, $\mp\infty$) returns NaN if $x \times y$ is an exact $\pm\infty$.

Description
Computes the value of $x \times y + z$ as a single ternary operation, rounding the result once in
round-towards-zero mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fmul_rd (float x, float y)

Multiply two floating-point values in round-down mode.

Returns
Returns x * y.

Description
Compute the product of x and y in round-down (to negative infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __fmul_rn (float x, float y)

Multiply two floating-point values in round-to-nearest-even mode.

Returns
Returns x * y.

Description
Compute the product of x and y in round-to-nearest-even mode.

CUDA Math API vRelease Version | 289

Modules

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __fmul_ru (float x, float y)

Multiply two floating-point values in round-up mode.

Returns
Returns x * y.

Description
Compute the product of x and y in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __fmul_rz (float x, float y)

Multiply two floating-point values in round-towards-zero mode.

Returns
Returns x * y.

Description
Compute the product of x and y in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

CUDA Math API vRelease Version | 290

Modules

device float __frcp_rd (float x)

Compute $1/x$ in round-down mode.

Returns
Returns $1/x$.

Description
Compute the reciprocal of x in round-down (to negative infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __frcp_rn (float x)

Compute $1/x$ in round-to-nearest-even mode.

Returns
Returns $1/x$.

Description
Compute the reciprocal of x in round-to-nearest-even mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __frcp_ru (float x)

Compute $1/x$ in round-up mode.

Returns
Returns $1/x$.

CUDA Math API vRelease Version | 291

Modules

Description
Compute the reciprocal of x in round-up (to positive infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __frcp_rz (float x)

Compute $1/x$ in round-towards-zero mode.

Returns
Returns $1/x$.

Description
Compute the reciprocal of x in round-towards-zero mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __frsqrt_rn (float x)

Compute $1/\sqrt{x}$ in round-to-nearest-even mode.

Returns
Returns $1/\sqrt{x}$.

Description
Compute the reciprocal square root of x in round-to-nearest-even mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 292

Modules

device float __fsqrt_rd (float x)

Compute $\sqrt{x}$ in round-down mode.

Returns
Returns $\sqrt{x}$.

Description
Compute the square root of x in round-down (to negative infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fsqrt_rn (float x)

Compute $\sqrt{x}$ in round-to-nearest-even mode.

Returns
Returns $\sqrt{x}$.

Description
Compute the square root of x in round-to-nearest-even mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fsqrt_ru (float x)

Compute $\sqrt{x}$ in round-up mode.

Returns
Returns $\sqrt{x}$.

CUDA Math API vRelease Version | 293

Modules

Description
Compute the square root of x in round-up (to positive infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fsqrt_rz (float x)

Compute $\sqrt{x}$ in round-towards-zero mode.

Returns
Returns $\sqrt{x}$.

Description
Compute the square root of x in round-towards-zero mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

device float __fsub_rd (float x, float y)

Subtract two floating-point values in round-down mode.

Returns
Returns x - y.

Description
Compute the difference of x and y in round-down (to negative infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

CUDA Math API vRelease Version | 294

Modules

device float __fsub_rn (float x, float y)

Subtract two floating-point values in round-to-nearest-even mode.

Returns
Returns x - y.

Description
Compute the difference of x and y in round-to-nearest-even mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __fsub_ru (float x, float y)

Subtract two floating-point values in round-up mode.

Returns
Returns x - y.

Description
Compute the difference of x and y in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __fsub_rz (float x, float y)

Subtract two floating-point values in round-towards-zero mode.

Returns
Returns x - y.

CUDA Math API vRelease Version | 295

Modules

Description
Compute the difference of x and y in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Single-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device float __log10f (float x)

Calculate the fast approximate base 10 logarithm of the input argument.

Returns
Returns an approximation to $\log_{10}(x)$.

Description
Calculate the fast approximate base 10 logarithm of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

device float __log2f (float x)

Calculate the fast approximate base 2 logarithm of the input argument.

Returns
Returns an approximation to $\log_{2}(x)$.

Description
Calculate the fast approximate base 2 logarithm of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

CUDA Math API vRelease Version | 296

Modules

device float __logf (float x)

Calculate the fast approximate base $e$ logarithm of the input argument.

Returns
Returns an approximation to $\log_{e}(x)$.

Description
Calculate the fast approximate base $e$ logarithm of the input argument x.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

device float __powf (float x, float y)

Calculate the fast approximate of $x^{y}$.

Returns
Returns an approximation to $x^{y}$.

Description
Calculate the fast approximate of x, the first input argument, raised to the power of y, the
second input argument, $x^{y}$.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

device float __saturatef (float x)

Clamp the input argument to [+0.0, 1.0].

Returns
‣ __saturatef(x) returns 0 if x < 0.
‣ __saturatef(x) returns 1 if x > 1.
‣ __saturatef(x) returns x if $0 \le x \le 1$.

CUDA Math API vRelease Version | 297

Modules

‣ __saturatef(NaN) returns 0.

Description
Clamp the input argument x to be within the interval [+0.0, 1.0].

device void __sincosf (float x, float *sptr, float

*cptr)
Calculate the fast approximate of sine and cosine of the first input argument.

Returns
‣ none

Description
Calculate the fast approximate of sine and cosine of the first input argument x (measured in
radians). The results for sine and cosine are written into the second argument, sptr, and,
respectively, third argument, cptr.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

‣ Denorm input/output is flushed to sign preserving 0.0.

device float __sinf (float x)

Calculate the fast approximate sine of the input argument.

Returns
Returns the approximate sine of x.

Description
Calculate the fast approximate sine of the input argument x, measured in radians.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

‣ Output in the denormal range is flushed to sign preserving 0.0.

CUDA Math API vRelease Version | 298

Modules

device float __tanf (float x)

Calculate the fast approximate tangent of the input argument.

Returns
Returns the approximate tangent of x.

Description
Calculate the fast approximate tangent of the input argument x, measured in radians.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Intrinsic Functions section.

‣ The result is computed as the fast divide of __sinf() by __cosf(). Denormal output is flushed
to sign-preserving 0.0.

1.8. Double Precision Intrinsics

This section describes double precision intrinsic functions that are only supported in device
code. To use these functions you do not need to include any additional header files in your
program.

device double __dadd_rd (double x, double y)

Add two floating-point values in round-down mode.

Returns
Returns x + y.

Description
Adds two floating-point values x and y in round-down (to negative infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

CUDA Math API vRelease Version | 299

Modules

device double __dadd_rn (double x, double y)

Add two floating-point values in round-to-nearest-even mode.

Returns
Returns x + y.

Description
Adds two floating-point values x and y in round-to-nearest-even mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __dadd_ru (double x, double y)

Add two floating-point values in round-up mode.

Returns
Returns x + y.

Description
Adds two floating-point values x and y in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __dadd_rz (double x, double y)

Add two floating-point values in round-towards-zero mode.

Returns
Returns x + y.

CUDA Math API vRelease Version | 300

Modules

Description
Adds two floating-point values x and y in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __ddiv_rd (double x, double y)

Divide two floating-point values in round-down mode.

Returns
Returns x / y.

Description
Divides two floating-point values x by y in round-down (to negative infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __ddiv_rn (double x, double y)

Divide two floating-point values in round-to-nearest-even mode.

Returns
Returns x / y.

Description
Divides two floating-point values x by y in round-to-nearest-even mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 301

Modules

‣ Requires compute capability >= 2.0.

device double __ddiv_ru (double x, double y)

Divide two floating-point values in round-up mode.

Returns
Returns x / y.

Description
Divides two floating-point values x by y in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __ddiv_rz (double x, double y)

Divide two floating-point values in round-towards-zero mode.

Returns
Returns x / y.

Description
Divides two floating-point values x by y in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __dmul_rd (double x, double y)

Multiply two floating-point values in round-down mode.

Returns
Returns x * y.

CUDA Math API vRelease Version | 302

Modules

Description
Multiplies two floating-point values x and y in round-down (to negative infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __dmul_rn (double x, double y)

Multiply two floating-point values in round-to-nearest-even mode.

Returns
Returns x * y.

Description
Multiplies two floating-point values x and y in round-to-nearest-even mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __dmul_ru (double x, double y)

Multiply two floating-point values in round-up mode.

Returns
Returns x * y.

Description
Multiplies two floating-point values x and y in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 303

Modules

‣ This operation will never be merged into a single multiply-add instruction.

device double __dmul_rz (double x, double y)

Multiply two floating-point values in round-towards-zero mode.

Returns
Returns x * y.

Description
Multiplies two floating-point values x and y in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __drcp_rd (double x)

Compute $1/x$ in round-down mode.

Returns
Returns $1/x$.

Description
Compute the reciprocal of x in round-down (to negative infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

CUDA Math API vRelease Version | 304

Modules

device double __drcp_rn (double x)

Compute $1/x$ in round-to-nearest-even mode.

Returns
Returns $1/x$.

Description
Compute the reciprocal of x in round-to-nearest-even mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __drcp_ru (double x)

Compute $1/x$ in round-up mode.

Returns
Returns $1/x$.

Description
Compute the reciprocal of x in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __drcp_rz (double x)

Compute $1/x$ in round-towards-zero mode.

Returns
Returns $1/x$.

CUDA Math API vRelease Version | 305

Modules

Description
Compute the reciprocal of x in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __dsqrt_rd (double x)

Compute $\sqrt{x}$ in round-down mode.

Returns
Returns $\sqrt{x}$.

Description
Compute the square root of x in round-down (to negative infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __dsqrt_rn (double x)

Compute $\sqrt{x}$ in round-to-nearest-even mode.

Returns
Returns $\sqrt{x}$.

Description
Compute the square root of x in round-to-nearest-even mode.

Note:

CUDA Math API vRelease Version | 306

Modules

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __dsqrt_ru (double x)

Compute $\sqrt{x}$ in round-up mode.

Returns
Returns $\sqrt{x}$.

Description
Compute the square root of x in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

device double __dsqrt_rz (double x)

Compute $\sqrt{x}$ in round-towards-zero mode.

Returns
Returns $\sqrt{x}$.

Description
Compute the square root of x in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ Requires compute capability >= 2.0.

CUDA Math API vRelease Version | 307

Modules

device double __dsub_rd (double x, double y)

Subtract two floating-point values in round-down mode.

Returns
Returns x - y.

Description
Subtracts two floating-point values x and y in round-down (to negative infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __dsub_rn (double x, double y)

Subtract two floating-point values in round-to-nearest-even mode.

Returns
Returns x - y.

Description
Subtracts two floating-point values x and y in round-to-nearest-even mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __dsub_ru (double x, double y)

Subtract two floating-point values in round-up mode.

Returns
Returns x - y.

CUDA Math API vRelease Version | 308

Modules

Description
Subtracts two floating-point values x and y in round-up (to positive infinity) mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __dsub_rz (double x, double y)

Subtract two floating-point values in round-towards-zero mode.

Returns
Returns x - y.

Description
Subtracts two floating-point values x and y in round-towards-zero mode.

Note:

‣ For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

‣ This operation will never be merged into a single multiply-add instruction.

device double __fma_rd (double x, double y,

double z)
Compute $x \times y + z$ as a single operation in round-down mode.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fma($\pm\infty$, $\pm 0$, z) returns NaN.

‣ fma($\pm 0$, $\pm\infty$, z) returns NaN.

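‣ fma(x, y, $-\infty$) returns NaN if $x \times y$ is an exact $+\infty$.

‣ fma(x, y, $+\infty$) returns NaN if $x \times y$ is an exact $-\infty$.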

CUDA Math API vRelease Version | 309

Modules

Description
Computes the value of $x \times y + z$ as a single ternary operation, rounding the result once in
round-down (to negative infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double __fma_rn (double x, double y,

double z)
Compute $x \times y + z$ as a single operation in round-to-nearest-even mode.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fma($\pm\infty$, $\pm 0$, z) returns NaN.

‣ fma($\pm 0$, $\pm\infty$, z) returns NaN.

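‣ fma(x, y, $-\infty$) returns NaN if $x \times y$ is an exact $+\infty$.

‣ fma(x, y, $+\infty$) returns NaN if $x \times y$ is an exact $-\infty$.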

Description
Computes the value of $x \times y + z$ as a single ternary operation, rounding the result once in
round-to-nearest-even mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double __fma_ru (double x, double y,

double z)
Compute $x \times y + z$ as a single operation in round-up mode.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

CUDA Math API vRelease Version | 310

Modules

‣ fma($\pm\infty$, $\pm 0$, z) returns NaN.

‣ fma($\pm 0$, $\pm\infty$, z) returns NaN.

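‣ fma(x, y, $-\infty$) returns NaN if $x \times y$ is an exact $+\infty$.

‣ fma(x, y, $+\infty$) returns NaN if $x \times y$ is an exact $-\infty$.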

Description
Computes the value of $x \times y + z$ as a single ternary operation, rounding the result once in
round-up (to positive infinity) mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

device double __fma_rz (double x, double y,

double z)
Compute $x \times y + z$ as a single operation in round-towards-zero mode.

Returns
Returns the rounded value of $x \times y + z$ as a single operation.

‣ fma($\pm\infty$, $\pm 0$, z) returns NaN.

‣ fma($\pm 0$, $\pm\infty$, z) returns NaN.

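‣ fma(x, y, $-\infty$) returns NaN if $x \times y$ is an exact $+\infty$.

‣ fma(x, y, $+\infty$) returns NaN if $x \times y$ is an exact $-\infty$.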

Description
Computes the value of $x \times y + z$ as a single ternary operation, rounding the result once in
round-towards-zero mode.

Note:

For accuracy information see the CUDA C++ Programming Guide, Mathematical Functions
Appendix, Double-Precision Floating-Point Functions section.

CUDA Math API vRelease Version | 311

Modules

1.9. Integer Intrinsics

This section describes integer intrinsic functions that are only supported in device code. To
use these functions you do not need to include any additional header files in your program.

device unsigned int __brev (unsigned int x)

Reverse the bit order of a 32-bit unsigned integer.

Returns
Returns the bit-reversed value of x, i.e., bit N of the return value corresponds to bit 31-N of x.

Description
Reverses the bit order of the 32-bit unsigned integer x.

device unsigned long long int __brevll (unsigned

long long int x)
Reverse the bit order of a 64-bit unsigned integer.

Returns
Returns the bit-reversed value of x, i.e., bit N of the return value corresponds to bit 63-N of x.

Description
Reverses the bit order of the 64-bit unsigned integer x.

device unsigned int __byte_perm (unsigned int x,

unsigned int y, unsigned int s)
Return selected bytes from two 32-bit unsigned integers.

Returns
The returned value r is computed to be: result[n] := input[selector[n]] where
result[n] is the nth byte of r.

Description
__byte_perm(x,y,s) returns a 32-bit integer consisting of four bytes from eight input bytes
provided in the two input integers x and y, as specified by a selector, s.

CUDA Math API vRelease Version | 312

Modules

The input bytes are indexed as follows:

input[0] = x<7:0>    input[1] = x<15:8>
input[2] = x<23:16>  input[3] = x<31:24>
input[4] = y<7:0>    input[5] = y<15:8>
input[6] = y<23:16>  input[7] = y<31:24>

The selector indices are as follows (the upper 16 bits of the selector are not used):

selector[0] = s<2:0>    selector[1] = s<6:4>
selector[2] = s<10:8>   selector[3] = s<14:12>

device int __clz (int x)

Return the number of consecutive high-order zero bits in a 32-bit integer.

Returns
Returns a value between 0 and 32 inclusive representing the number of zero bits.

Description
Count the number of consecutive leading zero bits, starting at the most significant bit (bit 31)
of x.

device int __clzll (long long int x)

Count the number of consecutive high-order zero bits in a 64-bit integer.

Returns
Returns a value between 0 and 64 inclusive representing the number of zero bits.

Description
Count the number of consecutive leading zero bits, starting at the most significant bit (bit 63)
of x.

device int __ffs (int x)

Find the position of the least significant bit set to 1 in a 32-bit integer.

Returns
Returns a value between 0 and 32 inclusive representing the position of the first bit set.

‣ __ffs(0) returns 0.

Description
Find the position of the first (least significant) bit set to 1 in x, where the least significant bit
position is 1.

CUDA Math API vRelease Version | 313

Modules

device int __ffsll (long long int x)

Find the position of the least significant bit set to 1 in a 64-bit integer.

Returns
Returns a value between 0 and 64 inclusive representing the position of the first bit set.

‣ __ffsll(0) returns 0.

Description
Find the position of the first (least significant) bit set to 1 in x, where the least significant bit
position is 1.

device unsigned int __funnelshift_l (unsigned int

lo, unsigned int hi, unsigned int shift)
Concatenate hi : lo, shift left by shift & 31 bits, return the most significant 32 bits.

Returns
Returns the most significant 32 bits of the shifted 64-bit value.

Description
Shift the 64-bit value formed by concatenating argument lo and hi left by the amount
specified by the argument shift. Argument lo holds bits 31:0 and argument hi holds bits
63:32 of the 64-bit source value. The source is shifted left by the wrapped value of shift
(shift & 31). The most significant 32-bits of the result are returned.

device unsigned int __funnelshift_lc (unsigned

int lo, unsigned int hi, unsigned int shift)
Concatenate hi : lo, shift left by min(shift, 32) bits, return the most significant 32 bits.

Returns
Returns the most significant 32 bits of the shifted 64-bit value.

Description
Shift the 64-bit value formed by concatenating argument lo and hi left by the amount
specified by the argument shift. Argument lo holds bits 31:0 and argument hi holds bits
63:32 of the 64-bit source value. The source is shifted left by the clamped value of shift
(min(shift, 32)). The most significant 32-bits of the result are returned.

CUDA Math API vRelease Version | 314

Modules

device unsigned int __funnelshift_r (unsigned int

lo, unsigned int hi, unsigned int shift)
Concatenate hi : lo, shift right by shift & 31 bits, return the least significant 32 bits.

Returns
Returns the least significant 32 bits of the shifted 64-bit value.

Description
Shift the 64-bit value formed by concatenating argument lo and hi right by the amount
specified by the argument shift. Argument lo holds bits 31:0 and argument hi holds bits
63:32 of the 64-bit source value. The source is shifted right by the wrapped value of shift
(shift & 31). The least significant 32-bits of the result are returned.

device unsigned int __funnelshift_rc (unsigned

int lo, unsigned int hi, unsigned int shift)
Concatenate hi : lo, shift right by min(shift, 32) bits, return the least significant 32 bits.

Returns
Returns the least significant 32 bits of the shifted 64-bit value.

Description
Shift the 64-bit value formed by concatenating argument lo and hi right by the amount
specified by the argument shift. Argument lo holds bits 31:0 and argument hi holds bits
63:32 of the 64-bit source value. The source is shifted right by the clamped value of shift
(min(shift, 32)). The least significant 32-bits of the result are returned.

device int __hadd (int x, int y)

Compute average of signed input arguments, avoiding overflow in the intermediate sum.

Returns
Returns a signed integer value representing the signed average value of the two inputs.

Description
Compute average of signed input arguments x and y as ( x + y ) >> 1, avoiding overflow in the
intermediate sum.

CUDA Math API vRelease Version | 315

Modules

device int __mul24 (int x, int y)

Calculate the least significant 32 bits of the product of the least significant 24 bits of two
integers.

Returns
Returns the least significant 32 bits of the product x * y.

Description
Calculate the least significant 32 bits of the product of the least significant 24 bits of x and y.
The high order 8 bits of x and y are ignored.

device long long int __mul64hi (long long int x,

long long int y)
Calculate the most significant 64 bits of the product of the two 64-bit integers.

Returns
Returns the most significant 64 bits of the product x * y.

Description
Calculate the most significant 64 bits of the 128-bit product x * y, where x and y are 64-bit
integers.

device int __mulhi (int x, int y)

Calculate the most significant 32 bits of the product of the two 32-bit integers.

Returns
Returns the most significant 32 bits of the product x * y.

Description
Calculate the most significant 32 bits of the 64-bit product x * y, where x and y are 32-bit
integers.

device int __popc (unsigned int x)

Count the number of bits that are set to 1 in a 32-bit integer.

Returns
Returns a value between 0 and 32 inclusive representing the number of set bits.

CUDA Math API vRelease Version | 316

Modules

Description
Count the number of bits that are set to 1 in x.

device int __popcll (unsigned long long int x)

Count the number of bits that are set to 1 in a 64-bit integer.

Returns
Returns a value between 0 and 64 inclusive representing the number of set bits.

Description
Count the number of bits that are set to 1 in x.

device int __rhadd (int x, int y)

Compute rounded average of signed input arguments, avoiding overflow in the intermediate
sum.

Returns
Returns a signed integer value representing the signed rounded average value of the two
inputs.

Description
Compute average of signed input arguments x and y as ( x + y + 1 ) >> 1, avoiding overflow in
the intermediate sum.

device unsigned int __sad (int x, int y, unsigned

int z)
Calculate $|x - y| + z$, the sum of absolute difference.

Returns
Returns $|x - y| + z$.

Description
Calculate $|x - y| + z$, the 32-bit sum of the third argument z plus the absolute value of the
difference between the first argument, x, and second argument, y.
Inputs x and y are signed 32-bit integers, input z is a 32-bit unsigned integer.

CUDA Math API vRelease Version | 317

Modules

device unsigned int __uhadd (unsigned int x,

unsigned int y)
Compute average of unsigned input arguments, avoiding overflow in the intermediate sum.

Returns
Returns an unsigned integer value representing the unsigned average value of the two inputs.

Description
Compute average of unsigned input arguments x and y as ( x + y ) >> 1, avoiding overflow in
the intermediate sum.

device unsigned int __umul24 (unsigned int x,

unsigned int y)
Calculate the least significant 32 bits of the product of the least significant 24 bits of two
unsigned integers.

Returns
Returns the least significant 32 bits of the product x * y.

Description
Calculate the least significant 32 bits of the product of the least significant 24 bits of x and y.
The high order 8 bits of x and y are ignored.

device unsigned long long int __umul64hi

(unsigned long long int x, unsigned long long int y)
Calculate the most significant 64 bits of the product of the two 64-bit unsigned integers.

Returns
Returns the most significant 64 bits of the product x * y.

Description
Calculate the most significant 64 bits of the 128-bit product x * y, where x and y are 64-bit
unsigned integers.

CUDA Math API vRelease Version | 318

Modules

device unsigned int __umulhi (unsigned int x,

unsigned int y)
Calculate the most significant 32 bits of the product of the two 32-bit unsigned integers.

Returns
Returns the most significant 32 bits of the product x * y.

Description
Calculate the most significant 32 bits of the 64-bit product x * y, where x and y are 32-bit
unsigned integers.

device unsigned int __urhadd (unsigned int x,

unsigned int y)
Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate
sum.

Returns
Returns an unsigned integer value representing the unsigned rounded average value of the
two inputs.

Description
Compute average of unsigned input arguments x and y as ( x + y + 1 ) >> 1, avoiding overflow
in the intermediate sum.

device unsigned int __usad (unsigned int x,

unsigned int y, unsigned int z)
Calculate |x - y| + z, the sum of absolute difference.

Returns
Returns |x - y| + z.

Description
Calculate |x - y| + z, the 32-bit sum of the third argument z plus the absolute value of the
difference between the first argument, x, and second argument, y.
Inputs x, y, and z are unsigned 32-bit integers.

CUDA Math API vRelease Version | 319

Modules

1.10. Type Casting Intrinsics

This section describes type casting intrinsic functions that are only supported in device code.
To use these functions you do not need to include any additional header files in your program.

device float __double2float_rd (double x)

Convert a double to a float in round-down mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a single-precision floating-point value in
round-down (to negative infinity) mode.

device float __double2float_rn (double x)

Convert a double to a float in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a single-precision floating-point value in
round-to-nearest-even mode.

device float __double2float_ru (double x)

Convert a double to a float in round-up mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a single-precision floating-point value in
round-up (to positive infinity) mode.

CUDA Math API vRelease Version | 320

Modules

device float __double2float_rz (double x)

Convert a double to a float in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a single-precision floating-point value in
round-towards-zero mode.

device int __double2hiint (double x)

Reinterpret high 32 bits in a double as a signed integer.

Returns
Returns reinterpreted value.

Description
Reinterpret the high 32 bits in the double-precision floating-point value x as a signed integer.

device int __double2int_rd (double x)

Convert a double to a signed int in round-down mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a signed integer value in round-down (to
negative infinity) mode.

device int __double2int_rn (double x)

Convert a double to a signed int in round-to-nearest-even mode.

Returns
Returns converted value.

CUDA Math API vRelease Version | 321

Modules

Description
Convert the double-precision floating-point value x to a signed integer value in round-to-
nearest-even mode.

device int __double2int_ru (double x)

Convert a double to a signed int in round-up mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a signed integer value in round-up (to
positive infinity) mode.

device int __double2int_rz (double x)

Convert a double to a signed int in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a signed integer value in round-
towards-zero mode.

device long long int __double2ll_rd (double x)

Convert a double to a signed 64-bit int in round-down mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a signed 64-bit integer value in round-
down (to negative infinity) mode.

CUDA Math API vRelease Version | 322

Modules

device long long int __double2ll_rn (double x)

Convert a double to a signed 64-bit int in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a signed 64-bit integer value in round-
to-nearest-even mode.

device long long int __double2ll_ru (double x)

Convert a double to a signed 64-bit int in round-up mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a signed 64-bit integer value in round-
up (to positive infinity) mode.

device long long int __double2ll_rz (double x)

Convert a double to a signed 64-bit int in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to a signed 64-bit integer value in round-
towards-zero mode.

device int __double2loint (double x)

Reinterpret low 32 bits in a double as a signed integer.

Returns
Returns reinterpreted value.

CUDA Math API vRelease Version | 323

Modules

Description
Reinterpret the low 32 bits in the double-precision floating-point value x as a signed integer.

device unsigned int __double2uint_rd (double x)

Convert a double to an unsigned int in round-down mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to an unsigned integer value in round-
down (to negative infinity) mode.

device unsigned int __double2uint_rn (double x)

Convert a double to an unsigned int in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to an unsigned integer value in round-to-
nearest-even mode.

device unsigned int __double2uint_ru (double x)

Convert a double to an unsigned int in round-up mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to an unsigned integer value in round-up
(to positive infinity) mode.

CUDA Math API vRelease Version | 324

Modules

device unsigned int __double2uint_rz (double x)

Convert a double to an unsigned int in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to an unsigned integer value in round-
towards-zero mode.

device unsigned long long int __double2ull_rd

(double x)
Convert a double to an unsigned 64-bit int in round-down mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to an unsigned 64-bit integer value in
round-down (to negative infinity) mode.

device unsigned long long int __double2ull_rn

(double x)
Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to an unsigned 64-bit integer value in
round-to-nearest-even mode.

CUDA Math API vRelease Version | 325

Modules

device unsigned long long int __double2ull_ru

(double x)
Convert a double to an unsigned 64-bit int in round-up mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to an unsigned 64-bit integer value in
round-up (to positive infinity) mode.

device unsigned long long int __double2ull_rz

(double x)
Convert a double to an unsigned 64-bit int in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the double-precision floating-point value x to an unsigned 64-bit integer value in
round-towards-zero mode.

device long long int __double_as_longlong

(double x)
Reinterpret bits in a double as a 64-bit signed integer.

Returns
Returns reinterpreted value.

Description
Reinterpret the bits in the double-precision floating-point value x as a signed 64-bit integer.

CUDA Math API vRelease Version | 326

Modules

device int __float2int_rd (float x)

Convert a float to a signed integer in round-down mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to a signed integer in round-down (to
negative infinity) mode.

device int __float2int_rn (float x)

Convert a float to a signed integer in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to a signed integer in round-to-nearest-
even mode.

device int __float2int_ru (float x)

Convert a float to a signed integer in round-up mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to a signed integer in round-up (to positive
infinity) mode.

device int __float2int_rz (float x)

Convert a float to a signed integer in round-towards-zero mode.

Returns
Returns converted value.

CUDA Math API vRelease Version | 327

Modules

Description
Convert the single-precision floating-point value x to a signed integer in round-towards-zero
mode.

device long long int __float2ll_rd (float x)

Convert a float to a signed 64-bit integer in round-down mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to a signed 64-bit integer in round-down (to
negative infinity) mode.

device long long int __float2ll_rn (float x)

Convert a float to a signed 64-bit integer in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to a signed 64-bit integer in round-to-
nearest-even mode.

device long long int __float2ll_ru (float x)

Convert a float to a signed 64-bit integer in round-up mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to a signed 64-bit integer in round-up (to
positive infinity) mode.

CUDA Math API vRelease Version | 328

Modules

device long long int __float2ll_rz (float x)

Convert a float to a signed 64-bit integer in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to a signed 64-bit integer in round-
towards-zero mode.

device unsigned int __float2uint_rd (float x)

Convert a float to an unsigned integer in round-down mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to an unsigned integer in round-down (to
negative infinity) mode.

device unsigned int __float2uint_rn (float x)

Convert a float to an unsigned integer in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to an unsigned integer in round-to-nearest-
even mode.

device unsigned int __float2uint_ru (float x)

Convert a float to an unsigned integer in round-up mode.

Returns
Returns converted value.

CUDA Math API vRelease Version | 329

Modules

Description
Convert the single-precision floating-point value x to an unsigned integer in round-up (to
positive infinity) mode.

device unsigned int __float2uint_rz (float x)

Convert a float to an unsigned integer in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to an unsigned integer in round-towards-
zero mode.

device unsigned long long int __float2ull_rd (float

x)
Convert a float to an unsigned 64-bit integer in round-down mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to an unsigned 64-bit integer in round-
down (to negative infinity) mode.

device unsigned long long int __float2ull_rn

(float x)
Convert a float to an unsigned 64-bit integer in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to an unsigned 64-bit integer in round-to-
nearest-even mode.

CUDA Math API vRelease Version | 330

Modules

device unsigned long long int __float2ull_ru

(float x)
Convert a float to an unsigned 64-bit integer in round-up mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to an unsigned 64-bit integer in round-up
(to positive infinity) mode.

device unsigned long long int __float2ull_rz (float

x)
Convert a float to an unsigned 64-bit integer in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the single-precision floating-point value x to an unsigned 64-bit integer in round-
towards-zero mode.

device int __float_as_int (float x)

Reinterpret bits in a float as a signed integer.

Returns
Returns reinterpreted value.

Description
Reinterpret the bits in the single-precision floating-point value x as a signed integer.

device unsigned int __float_as_uint (float x)

Reinterpret bits in a float as an unsigned integer.

Returns
Returns reinterpreted value.

CUDA Math API vRelease Version | 331

Modules

Description
Reinterpret the bits in the single-precision floating-point value x as an unsigned integer.

device double __hiloint2double (int hi, int lo)

Reinterpret high and low 32-bit integer values as a double.

Returns
Returns reinterpreted value.

Description
Reinterpret the integer value of hi as the high 32 bits of a double-precision floating-point
value and the integer value of lo as the low 32 bits of the same double-precision floating-point
value.

device double __int2double_rn (int x)

Convert a signed int to a double.

Returns
Returns converted value.

Description
Convert the signed integer value x to a double-precision floating-point value.

device float __int2float_rd (int x)

Convert a signed integer to a float in round-down mode.

Returns
Returns converted value.

Description
Convert the signed integer value x to a single-precision floating-point value in round-down (to
negative infinity) mode.

CUDA Math API vRelease Version | 332

Modules

device float __int2float_rn (int x)

Convert a signed integer to a float in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the signed integer value x to a single-precision floating-point value in round-to-
nearest-even mode.

device float __int2float_ru (int x)

Convert a signed integer to a float in round-up mode.

Returns
Returns converted value.

Description
Convert the signed integer value x to a single-precision floating-point value in round-up (to
positive infinity) mode.

device float __int2float_rz (int x)

Convert a signed integer to a float in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the signed integer value x to a single-precision floating-point value in round-towards-
zero mode.

device float __int_as_float (int x)

Reinterpret bits in an integer as a float.

Returns
Returns reinterpreted value.

CUDA Math API vRelease Version | 333

Modules

Description
Reinterpret the bits in the signed integer value x as a single-precision floating-point value.

device double __ll2double_rd (long long int x)

Convert a signed 64-bit int to a double in round-down mode.

Returns
Returns converted value.

Description
Convert the signed 64-bit integer value x to a double-precision floating-point value in round-
down (to negative infinity) mode.

device double __ll2double_rn (long long int x)

Convert a signed 64-bit int to a double in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the signed 64-bit integer value x to a double-precision floating-point value in round-
to-nearest-even mode.

device double __ll2double_ru (long long int x)

Convert a signed 64-bit int to a double in round-up mode.

Returns
Returns converted value.

Description
Convert the signed 64-bit integer value x to a double-precision floating-point value in round-
up (to positive infinity) mode.

CUDA Math API vRelease Version | 334

Modules

device double __ll2double_rz (long long int x)

Convert a signed 64-bit int to a double in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the signed 64-bit integer value x to a double-precision floating-point value in round-
towards-zero mode.

device float __ll2float_rd (long long int x)

Convert a signed 64-bit integer to a float in round-down mode.

Returns
Returns converted value.

Description
Convert the signed 64-bit integer value x to a single-precision floating-point value in round-down (to
negative infinity) mode.

device float __ll2float_rn (long long int x)

Convert a signed 64-bit integer to a float in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the signed 64-bit integer value x to a single-precision floating-point value in round-to-
nearest-even mode.

device float __ll2float_ru (long long int x)

Convert a signed 64-bit integer to a float in round-up mode.

Returns
Returns converted value.

CUDA Math API vRelease Version | 335

Modules

Description
Convert the signed 64-bit integer value x to a single-precision floating-point value in round-up (to
positive infinity) mode.

device float __ll2float_rz (long long int x)

Convert a signed 64-bit integer to a float in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the signed 64-bit integer value x to a single-precision floating-point value in round-towards-
zero mode.

device double __longlong_as_double (long long

int x)
Reinterpret bits in a 64-bit signed integer as a double.

Returns
Returns reinterpreted value.

Description
Reinterpret the bits in the 64-bit signed integer value x as a double-precision floating-point
value.

device double __uint2double_rn (unsigned int x)

Convert an unsigned int to a double.

Returns
Returns converted value.

Description
Convert the unsigned integer value x to a double-precision floating-point value.

CUDA Math API vRelease Version | 336

Modules

device float __uint2float_rd (unsigned int x)

Convert an unsigned integer to a float in round-down mode.

Returns
Returns converted value.

Description
Convert the unsigned integer value x to a single-precision floating-point value in round-down
(to negative infinity) mode.

device float __uint2float_rn (unsigned int x)

Convert an unsigned integer to a float in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the unsigned integer value x to a single-precision floating-point value in round-to-
nearest-even mode.

device float __uint2float_ru (unsigned int x)

Convert an unsigned integer to a float in round-up mode.

Returns
Returns converted value.

Description
Convert the unsigned integer value x to a single-precision floating-point value in round-up (to
positive infinity) mode.

device float __uint2float_rz (unsigned int x)

Convert an unsigned integer to a float in round-towards-zero mode.

Returns
Returns converted value.

CUDA Math API vRelease Version | 337

Modules

Description
Convert the unsigned integer value x to a single-precision floating-point value in round-
towards-zero mode.

device float __uint_as_float (unsigned int x)

Reinterpret bits in an unsigned integer as a float.

Returns
Returns reinterpreted value.

Description
Reinterpret the bits in the unsigned integer value x as a single-precision floating-point value.

device double __ull2double_rd (unsigned long

long int x)
Convert an unsigned 64-bit int to a double in round-down mode.

Returns
Returns converted value.

Description
Convert the unsigned 64-bit integer value x to a double-precision floating-point value in
round-down (to negative infinity) mode.

device double __ull2double_rn (unsigned long

long int x)
Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the unsigned 64-bit integer value x to a double-precision floating-point value in
round-to-nearest-even mode.

CUDA Math API vRelease Version | 338

Modules

device double __ull2double_ru (unsigned long

long int x)
Convert an unsigned 64-bit int to a double in round-up mode.

Returns
Returns converted value.

Description
Convert the unsigned 64-bit integer value x to a double-precision floating-point value in
round-up (to positive infinity) mode.

device double __ull2double_rz (unsigned long

long int x)
Convert an unsigned 64-bit int to a double in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the unsigned 64-bit integer value x to a double-precision floating-point value in
round-towards-zero mode.

device float __ull2float_rd (unsigned long long int

x)
Convert an unsigned 64-bit integer to a float in round-down mode.

Returns
Returns converted value.

Description
Convert the unsigned 64-bit integer value x to a single-precision floating-point value in round-down
(to negative infinity) mode.

CUDA Math API vRelease Version | 339

Modules

device float __ull2float_rn (unsigned long long

int x)
Convert an unsigned 64-bit integer to a float in round-to-nearest-even mode.

Returns
Returns converted value.

Description
Convert the unsigned 64-bit integer value x to a single-precision floating-point value in round-to-
nearest-even mode.

device float __ull2float_ru (unsigned long long

int x)
Convert an unsigned 64-bit integer to a float in round-up mode.

Returns
Returns converted value.

Description
Convert the unsigned 64-bit integer value x to a single-precision floating-point value in round-up (to
positive infinity) mode.

device float __ull2float_rz (unsigned long long int

x)
Convert an unsigned 64-bit integer to a float in round-towards-zero mode.

Returns
Returns converted value.

Description
Convert the unsigned 64-bit integer value x to a single-precision floating-point value in round-
towards-zero mode.

CUDA Math API vRelease Version | 340

Modules

1.11. SIMD Intrinsics

This section describes SIMD intrinsic functions that are only supported in device code. To use
these functions you do not need to include any additional header files in your program.

device unsigned int __vabs2 (unsigned int a)

Computes per-halfword absolute value.

Returns
Returns computed value.

Description
Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes, then computes absolute
value for each of the parts. Partial results are recombined and returned as unsigned int.

device unsigned int __vabs4 (unsigned int a)

Computes per-byte absolute value.

Returns
Returns computed value.

Description
Splits argument by bytes. Computes absolute value of each byte. Partial results are
recombined and returned as unsigned int.

device unsigned int __vabsdiffs2 (unsigned int a,

unsigned int b)
Computes per-halfword absolute difference of signed integer.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding parts function
computes absolute difference. Partial results are recombined and returned as unsigned int.

CUDA Math API vRelease Version | 341

Modules

device unsigned int __vabsdiffs4 (unsigned int a,

unsigned int b)
Computes per-byte absolute difference of signed integer.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts function
computes absolute difference. Partial results are recombined and returned as unsigned int.

device unsigned int __vabsdiffu2 (unsigned int a,

unsigned int b)
Performs per-halfword absolute difference of unsigned integer computation: |a - b|.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function computes absolute difference. Partial results are recombined and returned as
unsigned int.

device unsigned int __vabsdiffu4 (unsigned int a,

unsigned int b)
Computes per-byte absolute difference of unsigned integer.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts function computes absolute difference. Partial results are recombined and returned as
unsigned int.

CUDA Math API vRelease Version | 342

Modules

device unsigned int __vabsss2 (unsigned int a)

Computes per-halfword absolute value with signed saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes, then computes absolute
value with signed saturation for each of the parts. Partial results are recombined and returned as
unsigned int.

device unsigned int __vabsss4 (unsigned int a)

Computes per-byte absolute value with signed saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of argument into 4 parts, each consisting of 1 byte, then computes absolute
value with signed saturation for each of the parts. Partial results are recombined and returned as
unsigned int.

device unsigned int __vadd2 (unsigned int a,

unsigned int b)
Performs per-halfword (un)signed addition, with wrap-around: a + b.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, then performs
unsigned addition on corresponding parts. Partial results are recombined and returned as
unsigned int.

CUDA Math API vRelease Version | 343

Modules

device unsigned int __vadd4 (unsigned int a,

unsigned int b)
Performs per-byte (un)signed addition.

Returns
Returns computed value.

Description
Splits 'a' into 4 bytes, then performs unsigned addition on each of these bytes with the
corresponding byte from 'b', ignoring overflow. Partial results are recombined and returned as
unsigned int.

device unsigned int __vaddss2 (unsigned int a,

unsigned int b)
Performs per-halfword addition with signed saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, then performs addition
with signed saturation on corresponding parts. Partial results are recombined and returned as
unsigned int.

device unsigned int __vaddss4 (unsigned int a,

unsigned int b)
Performs per-byte addition with signed saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte, then performs addition
with signed saturation on corresponding parts. Partial results are recombined and returned as
unsigned int.

CUDA Math API vRelease Version | 344

Modules

device unsigned int __vaddus2 (unsigned int a,

unsigned int b)
Performs per-halfword addition with unsigned saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, then performs addition
with unsigned saturation on corresponding parts.

device unsigned int __vaddus4 (unsigned int a,

unsigned int b)
Performs per-byte addition with unsigned saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte, then performs addition
with unsigned saturation on corresponding parts.

device unsigned int __vavgs2 (unsigned int a,

unsigned int b)
Performs per-halfword signed rounded average computation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, then computes signed
rounded average of corresponding parts. Partial results are recombined and returned as
unsigned int.

CUDA Math API vRelease Version | 345

Modules

device unsigned int __vavgs4 (unsigned int a,

unsigned int b)
Computes per-byte signed rounded average.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte, then computes signed
rounded average of corresponding parts. Partial results are recombined and returned as
unsigned int.

device unsigned int __vavgu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned rounded average computation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, then computes
unsigned rounded average of corresponding parts. Partial results are recombined and
returned as unsigned int.

device unsigned int __vavgu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned rounded average.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte, then computes
unsigned rounded average of corresponding parts. Partial results are recombined and
returned as unsigned int.

CUDA Math API vRelease Version | 346

Modules

device unsigned int __vcmpeq2 (unsigned int a,

unsigned int b)
Performs per-halfword (un)signed comparison.

Returns
Returns 0xffff if a = b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts result is ffff if they are equal, and 0000 otherwise. For example __vcmpeq2(0x1234aba5,
0x1234aba6) returns 0xffff0000.

device unsigned int __vcmpeq4 (unsigned int a,

unsigned int b)
Performs per-byte (un)signed comparison.

Returns
Returns 0xff if a = b, else returns 0.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts result is ff if they are equal, and 00 otherwise. For example __vcmpeq4(0x1234aba5,
0x1234aba6) returns 0xffffff00.

device unsigned int __vcmpges2 (unsigned int a,

unsigned int b)
Performs per-halfword signed comparison: a >= b ? 0xffff : 0.

Returns
Returns 0xffff if a >= b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For
corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise. For example
__vcmpges2(0x1234aba5, 0x1234aba6) returns 0xffff0000.

CUDA Math API vRelease Version | 347

Modules

device unsigned int __vcmpges4 (unsigned int a,

unsigned int b)
Performs per-byte signed comparison.

Returns
Returns 0xff if a >= b, else returns 0.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts result is ff if 'a' part >= 'b' part, and 00 otherwise. For example __vcmpges4(0x1234aba5,
0x1234aba6) returns 0xffffff00.

device unsigned int __vcmpgeu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned comparison: a >= b ? 0xffff : 0.

Returns
Returns 0xffff if a >= b, else returns 0.

device unsigned int __vcmpgeu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned comparison.

Returns
Returns 0xff if a >= b, else returns 0.

CUDA Math API vRelease Version | 348

Modules

device unsigned int __vcmpgts2 (unsigned int a,

unsigned int b)
Performs per-halfword signed comparison: a > b ? 0xffff : 0.

Returns
Returns 0xffff if a > b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For
corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise. For example
__vcmpgts2(0x1234aba5, 0x1234aba6) returns 0x00000000.

device unsigned int __vcmpgts4 (unsigned int a,

unsigned int b)
Performs per-byte signed comparison.

Returns
Returns 0xff if a > b, else returns 0.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts result is ff if 'a' part > 'b' part, and 00 otherwise. For example __vcmpgts4(0x1234aba5,
0x1234aba6) returns 0x00000000.

device unsigned int __vcmpgtu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned comparison: a > b ? 0xffff : 0.

Returns
Returns 0xffff if a > b, else returns 0.

CUDA Math API vRelease Version | 349

Modules

device unsigned int __vcmpgtu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned comparison.

Returns
Returns 0xff if a > b, else returns 0.

device unsigned int __vcmples2 (unsigned int a,

unsigned int b)
Performs per-halfword signed comparison: a <= b ? 0xffff : 0.

Returns
Returns 0xffff if a <= b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For
corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise. For example
__vcmples2(0x1234aba5, 0x1234aba6) returns 0xffffffff.

device unsigned int __vcmples4 (unsigned int a,

unsigned int b)
Performs per-byte signed comparison.

Returns
Returns 0xff if a <= b, else returns 0.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts result is ff if 'a' part <= 'b' part, and 00 otherwise. For example __vcmples4(0x1234aba5,
0x1234aba6) returns 0xffffffff.

CUDA Math API vRelease Version | 350

Modules

device unsigned int __vcmpleu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned comparison: a <= b ? 0xffff : 0.

Returns
Returns 0xffff if a <= b, else returns 0.

device unsigned int __vcmpleu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned comparison.

Returns
Returns 0xff if a <= b, else returns 0.

device unsigned int __vcmplts2 (unsigned int a,

unsigned int b)
Performs per-halfword signed comparison: a < b ? 0xffff : 0.

Returns
Returns 0xffff if a < b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For
corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise. For example
__vcmplts2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.

CUDA Math API vRelease Version | 351

Modules

device unsigned int __vcmplts4 (unsigned int a,

unsigned int b)
Performs per-byte signed comparison.

Returns
Returns 0xff if a < b, else returns 0.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts result is ff if 'a' part < 'b' part, and 00 otherwise. For example __vcmplts4(0x1234aba5,
0x1234aba6) returns 0x000000ff.

device unsigned int __vcmpltu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned comparison: a < b ? 0xffff : 0.

Returns
Returns 0xffff if a < b, else returns 0.

device unsigned int __vcmpltu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned comparison.

Returns
Returns 0xff if a < b, else returns 0.

CUDA Math API vRelease Version | 352

Modules

device unsigned int __vcmpne2 (unsigned int a,

unsigned int b)
Performs per-halfword (un)signed comparison: a != b ? 0xffff : 0.

Returns
Returns 0xffff if a != b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For
corresponding parts result is ffff if 'a' part != 'b' part, and 0000 otherwise. For example
__vcmpne2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.

device unsigned int __vcmpne4 (unsigned int a,

unsigned int b)
Performs per-byte (un)signed comparison.

Returns
Returns 0xff if a != b, else returns 0.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts result is ff if 'a' part != 'b' part, and 00 otherwise. For example __vcmpne4(0x1234aba5,
0x1234aba6) returns 0x000000ff.

device unsigned int __vhaddu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned average computation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, then computes
unsigned average of corresponding parts. Partial results are recombined and returned as
unsigned int.

CUDA Math API vRelease Version | 353

Modules

device unsigned int __vhaddu4 (unsigned int a,

unsigned int b)
Computes per-byte unsigned average.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte, then computes
unsigned average of corresponding parts. Partial results are recombined and returned as
unsigned int.

device unsigned int __vmaxs2 (unsigned int a,

unsigned int b)
Performs per-halfword signed maximum computation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function computes signed maximum. Partial results are recombined and returned as
unsigned int.

device unsigned int __vmaxs4 (unsigned int a,

unsigned int b)
Computes per-byte signed maximum.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function computes signed maximum. Partial results are recombined and returned as unsigned
int.

CUDA Math API vRelease Version | 354

Modules

device unsigned int __vmaxu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned maximum computation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function computes unsigned maximum. Partial results are recombined and returned as
unsigned int.

device unsigned int __vmaxu4 (unsigned int a,

unsigned int b)
Computes per-byte unsigned maximum.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts function computes unsigned maximum. Partial results are recombined and returned as
unsigned int.

device unsigned int __vmins2 (unsigned int a,

unsigned int b)
Performs per-halfword signed minimum computation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function computes signed minimum. Partial results are recombined and returned as
unsigned int.

CUDA Math API vRelease Version | 355

Modules

device unsigned int __vmins4 (unsigned int a,

unsigned int b)
Computes per-byte signed minimum.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function computes signed minimum. Partial results are recombined and returned as unsigned
int.

device unsigned int __vminu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned minimum computation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function computes unsigned minimum. Partial results are recombined and returned as
unsigned int.

device unsigned int __vminu4 (unsigned int a,

unsigned int b)
Computes per-byte unsigned minimum.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts function computes unsigned minimum. Partial results are recombined and returned as
unsigned int.

CUDA Math API vRelease Version | 356

Modules

device unsigned int __vneg2 (unsigned int a)

Computes per-halfword negation.

Returns
Returns computed value.

Description
Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes. For each part function
computes negation. Partial results are recombined and returned as unsigned int.

device unsigned int __vneg4 (unsigned int a)

Performs per-byte negation.

Returns
Returns computed value.

Description
Splits 4 bytes of argument into 4 parts, each consisting of 1 byte. For each part function
computes negation. Partial results are recombined and returned as unsigned int.

device unsigned int __vnegss2 (unsigned int a)

Computes per-halfword negation with signed saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes. For each part function
computes negation with signed saturation. Partial results are recombined and returned as unsigned int.

device unsigned int __vnegss4 (unsigned int a)

Performs per-byte negation with signed saturation.

Returns
Returns computed value.

CUDA Math API vRelease Version | 357

Modules

Description
Splits 4 bytes of argument into 4 parts, each consisting of 1 byte. For each part function
computes negation with signed saturation. Partial results are recombined and returned as unsigned int.

device unsigned int __vsads2 (unsigned int a,

unsigned int b)
Computes per-halfword sum of absolute differences of signed values.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function computes absolute difference and sums it up. Partial results are recombined and
returned as unsigned int.

device unsigned int __vsads4 (unsigned int a,

unsigned int b)
Computes per-byte sum of absolute differences of signed values.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function computes absolute difference and sums it up. Partial results are recombined and
returned as unsigned int.

device unsigned int __vsadu2 (unsigned int a,

unsigned int b)
Computes per-halfword sum of absolute differences of unsigned values.

Returns
Returns computed value.

CUDA Math API vRelease Version | 358

Modules

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function computes absolute differences and returns sum of those differences.

device unsigned int __vsadu4 (unsigned int a,

unsigned int b)
Computes per-byte sum of absolute differences of unsigned values.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function computes absolute differences and returns sum of those differences.

device unsigned int __vseteq2 (unsigned int a,

unsigned int b)
Performs per-halfword (un)signed comparison.

Returns
Returns 1 if a = b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function performs comparison 'a' part == 'b' part. If both equalities are satisfied, function
returns 1.

device unsigned int __vseteq4 (unsigned int a,

unsigned int b)
Performs per-byte (un)signed comparison.

Returns
Returns 1 if a = b, else returns 0.

CUDA Math API vRelease Version | 359

Modules

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts function performs comparison 'a' part == 'b' part. If both equalities are satisfied, function
returns 1.

device unsigned int __vsetges2 (unsigned int a,

unsigned int b)
Performs per-halfword signed comparison.

Returns
Returns 1 if a >= b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function performs comparison 'a' part >= 'b' part. If both inequalities are satisfied,
function returns 1.

device unsigned int __vsetges4 (unsigned int a,

unsigned int b)
Performs per-byte signed comparison.

Returns
Returns 1 if a >= b, else returns 0.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function performs comparison 'a' part >= 'b' part. If both inequalities are satisfied, function
returns 1.

device unsigned int __vsetgeu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned comparison.

Returns
Returns 1 if a >= b, else returns 0.

CUDA Math API vRelease Version | 360

Modules

device unsigned int __vsetgeu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned comparison.

Returns
Returns 1 if a >= b, else returns 0.

device unsigned int __vsetgts2 (unsigned int a,

unsigned int b)
Performs per-halfword signed comparison.

Returns
Returns 1 if a > b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function performs comparison 'a' part > 'b' part. If both inequalities are satisfied,
function returns 1.

device unsigned int __vsetgts4 (unsigned int a,

unsigned int b)
Performs per-byte signed comparison.

Returns
Returns 1 if a > b, else returns 0.

CUDA Math API vRelease Version | 361

Modules

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function performs comparison 'a' part > 'b' part. If both inequalities are satisfied, function
returns 1.

device unsigned int __vsetgtu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned comparison.

Returns
Returns 1 if a > b, else returns 0.

device unsigned int __vsetgtu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned comparison.

Returns
Returns 1 if a > b, else returns 0.

device unsigned int __vsetles2 (unsigned int a,

unsigned int b)
Performs per-halfword signed comparison.

Returns
Returns 1 if a <= b, else returns 0.

CUDA Math API vRelease Version | 362

Modules

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function performs comparison 'a' part <= 'b' part. If both inequalities are satisfied,
function returns 1.

device unsigned int __vsetles4 (unsigned int a,

unsigned int b)
Performs per-byte signed comparison.

Returns
Returns 1 if a <= b, else returns 0.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function performs comparison 'a' part <= 'b' part. If both inequalities are satisfied, function
returns 1.

device unsigned int __vsetleu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned comparison.

Returns
Returns 1 if a <= b, else returns 0.

device unsigned int __vsetleu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned comparison.

Returns
Returns 1 if a <= b, else returns 0.

CUDA Math API vRelease Version | 363

Modules

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function performs comparison 'a' part <= 'b' part. If both inequalities are satisfied, function
returns 1.

device unsigned int __vsetlts2 (unsigned int a,

unsigned int b)
Performs per-halfword signed comparison.

Returns
Returns 1 if a < b, else returns 0.

device unsigned int __vsetlts4 (unsigned int a,

unsigned int b)
Performs per-byte signed comparison.

Returns
Returns 1 if a < b, else returns 0.

device unsigned int __vsetltu2 (unsigned int a,

unsigned int b)
Performs per-halfword unsigned comparison.

Returns
Returns 1 if a < b, else returns 0.

CUDA Math API vRelease Version | 364

Modules

device unsigned int __vsetltu4 (unsigned int a,

unsigned int b)
Performs per-byte unsigned comparison.

Returns
Returns 1 if a < b, else returns 0.

device unsigned int __vsetne2 (unsigned int a,

unsigned int b)
Performs per-halfword (un)signed comparison.

Returns
Returns 1 if a != b, else returns 0.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function performs comparison 'a' part != 'b' part. If both conditions are satisfied, function
returns 1.

device unsigned int __vsetne4 (unsigned int a,

unsigned int b)
Performs per-byte (un)signed comparison.

Returns
Returns 1 if a != b, else returns 0.

CUDA Math API vRelease Version | 365

Modules

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding
parts function performs comparison 'a' part != 'b' part. If both conditions are satisfied, function
returns 1.

device unsigned int __vsub2 (unsigned int a,

unsigned int b)
Performs per-halfword (un)signed subtraction, with wrap-around.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function performs subtraction. Partial results are recombined and returned as unsigned
int.

device unsigned int __vsub4 (unsigned int a,

unsigned int b)
Performs per-byte subtraction.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function performs subtraction. Partial results are recombined and returned as unsigned int.

device unsigned int __vsubss2 (unsigned int a,

unsigned int b)
Performs per-halfword (un)signed subtraction, with signed saturation.

Returns
Returns computed value.

CUDA Math API vRelease Version | 366

Modules

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function performs subtraction with signed saturation. Partial results are recombined
and returned as unsigned int.

device unsigned int __vsubss4 (unsigned int a,

unsigned int b)
Performs per-byte subtraction with signed saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function performs subtraction with signed saturation. Partial results are recombined and
returned as unsigned int.

device unsigned int __vsubus2 (unsigned int a,

unsigned int b)
Performs per-halfword subtraction with unsigned saturation.

Returns
Returns computed value.

Description
Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes. For corresponding
parts function performs subtraction with unsigned saturation. Partial results are recombined
and returned as unsigned int.

device unsigned int __vsubus4 (unsigned int a,

unsigned int b)
Performs per-byte subtraction with unsigned saturation.

Returns
Returns computed value.

CUDA Math API vRelease Version | 367

Modules

Description
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts
function performs subtraction with unsigned saturation. Partial results are recombined and
returned as unsigned int.

CUDA Math API vRelease Version | 368

Notice
This document is provided for information purposes only and shall not be regarded as a warranty of a certain functionality, condition, or quality of a product. NVIDIA
Corporation (“NVIDIA”) makes no representations or warranties, expressed or implied, as to the accuracy or completeness of the information contained in this
document and assumes no responsibility for any errors contained herein. NVIDIA shall have no liability for the consequences or use of such information or for any
infringement of patents or other rights of third parties that may result from its use. This document is not a commitment to develop, release, or deliver any Material
(defined below), code, or functionality.

NVIDIA reserves the right to make corrections, modifications, enhancements, improvements, and any other changes to this document, at any time without notice.

Customer should obtain the latest relevant information before placing orders and should verify that such information is current and complete.

NVIDIA products are sold subject to the NVIDIA standard terms and conditions of sale supplied at the time of order acknowledgement, unless otherwise agreed
in an individual sales agreement signed by authorized representatives of NVIDIA and customer (“Terms of Sale”). NVIDIA hereby expressly objects to applying any
customer general terms and conditions with regards to the purchase of the NVIDIA product referenced in this document. No contractual obligations are formed
either directly or indirectly by this document.

OpenCL
OpenCL is a trademark of Apple Inc. used under license to the Khronos Group Inc.

Trademarks
NVIDIA and the NVIDIA logo are trademarks or registered trademarks of NVIDIA Corporation in the U.S. and other countries. Other company and product names may
be trademarks of the respective companies with which they are associated.

NVIDIA Corporation | 2788 San Tomas Expressway, Santa Clara, CA 95051

https://www.nvidia.com

Using C++ To Connect To Web Services - Steve Gates - CppCon 2014
No ratings yet
Using C++ To Connect To Web Services - Steve Gates - CppCon 2014
40 pages
Parallel Programming With CUDA - Architecture, Analysis
No ratings yet
Parallel Programming With CUDA - Architecture, Analysis
93 pages
CUDA C Programming Guide PDF
No ratings yet
CUDA C Programming Guide PDF
405 pages
DELTA IA-OSW AX Series Standard Instructions Manual-V1.5.0 ENG 20230928
No ratings yet
DELTA IA-OSW AX Series Standard Instructions Manual-V1.5.0 ENG 20230928
281 pages
PM5100 PM5300 ModbusRegisterList
No ratings yet
PM5100 PM5300 ModbusRegisterList
23 pages
S51413 - Developing Optimal CUDA Kernels On Hopper Tensor Cores - 1679452516682001bWRm
No ratings yet
S51413 - Developing Optimal CUDA Kernels On Hopper Tensor Cores - 1679452516682001bWRm
80 pages
Main SCM
No ratings yet
Main SCM
3,420 pages
C++ Metaprogramming - Fedor Pikus - CppCon 2015
100% (1)
C++ Metaprogramming - Fedor Pikus - CppCon 2015
76 pages
Gpucoder Ug
No ratings yet
Gpucoder Ug
560 pages
Algorithmic Differentiation - C++ and Extremum Estimation - Matt P. Dziubinski - CppCon 2015
No ratings yet
Algorithmic Differentiation - C++ and Extremum Estimation - Matt P. Dziubinski - CppCon 2015
283 pages
Compile-Time Tools For Generic Programming in C++ - Abel Sinkovics - CppCon 2015
No ratings yet
Compile-Time Tools For Generic Programming in C++ - Abel Sinkovics - CppCon 2015
241 pages
PF-CE Lab10 Pointers
No ratings yet
PF-CE Lab10 Pointers
9 pages
Cuda Reference Manual
No ratings yet
Cuda Reference Manual
256 pages
Functional Programming - Functors and Monads - Michał Dominiak - CppCon 2015
100% (1)
Functional Programming - Functors and Monads - Michał Dominiak - CppCon 2015
19 pages
CUDA Debugger API PDF
No ratings yet
CUDA Debugger API PDF
206 pages
RCPP - Seamless R and C++ Integration - Matt P. Dziubinski - CppCon 2015
No ratings yet
RCPP - Seamless R and C++ Integration - Matt P. Dziubinski - CppCon 2015
137 pages
The Canonical Class - Michael Caisse - CppCon 2014
No ratings yet
The Canonical Class - Michael Caisse - CppCon 2014
138 pages
Viewing The World Through Array-Shaped Glasses - Łukasz Mendakiewicz - CppCon 2014
No ratings yet
Viewing The World Through Array-Shaped Glasses - Łukasz Mendakiewicz - CppCon 2014
131 pages
MySQL Data Types Quick Reference Table
No ratings yet
MySQL Data Types Quick Reference Table
3 pages
Simple Extensible Pattern Matching With C++14 - John Bandela - CppCon 2015
No ratings yet
Simple Extensible Pattern Matching With C++14 - John Bandela - CppCon 2015
118 pages
CUDA Toolkit Reference Manual
No ratings yet
CUDA Toolkit Reference Manual
441 pages
CUDA Toolkit Reference Manual
No ratings yet
CUDA Toolkit Reference Manual
384 pages
STL Algorithms in Action - Michael VanLoon - CppCon 2015
No ratings yet
STL Algorithms in Action - Michael VanLoon - CppCon 2015
99 pages
Cud A Reference Manual
No ratings yet
Cud A Reference Manual
299 pages
Types Don't Know # - Howard Hinnant - CppCon 2014
No ratings yet
Types Don't Know # - Howard Hinnant - CppCon 2014
95 pages
CUDA For Tegra AppNote
No ratings yet
CUDA For Tegra AppNote
60 pages
Benchmarking C++ Code - Bryce Adelstein Lelbach - CppCon 2015
No ratings yet
Benchmarking C++ Code - Bryce Adelstein Lelbach - CppCon 2015
79 pages
9 Computer Architecture and Organization
No ratings yet
9 Computer Architecture and Organization
52 pages
C Questions: 1. Void Main (Int Const P 5 Printf ("%D",++ ( P) ) )
No ratings yet
C Questions: 1. Void Main (Int Const P 5 Printf ("%D",++ ( P) ) )
64 pages
CUDA C Programming Guide
No ratings yet
CUDA C Programming Guide
346 pages
Modernizing Legacy C++ Code - Gregory and McNellis - CppCon 2014
No ratings yet
Modernizing Legacy C++ Code - Gregory and McNellis - CppCon 2014
81 pages
C++11, 14, 17 Atomics - The Deep Dive - Michael Wong - CppCon 2015
No ratings yet
C++11, 14, 17 Atomics - The Deep Dive - Michael Wong - CppCon 2015
69 pages
Where Did My Performance Go - Fedor Pikus - CppCon 2014
No ratings yet
Where Did My Performance Go - Fedor Pikus - CppCon 2014
66 pages
From Functional To Parallel - Stochastic Modelling in C++ - Kevin Carpenter - CppCon 2015
No ratings yet
From Functional To Parallel - Stochastic Modelling in C++ - Kevin Carpenter - CppCon 2015
64 pages
The Implementation of Value Types - Lawrence Crowl - CppCon 2014
No ratings yet
The Implementation of Value Types - Lawrence Crowl - CppCon 2014
71 pages
Converting CUDA Programs To Run On AMD
No ratings yet
Converting CUDA Programs To Run On AMD
63 pages
HPCXX 2023 d4
No ratings yet
HPCXX 2023 d4
52 pages
Rebuilding Boost Date-Time For C++11 - Jeff Garland - CppCon 2014
No ratings yet
Rebuilding Boost Date-Time For C++11 - Jeff Garland - CppCon 2014
56 pages
Being Smart About Pointers - Michael VanLoon - CppCon 2015
No ratings yet
Being Smart About Pointers - Michael VanLoon - CppCon 2015
47 pages
Reactive Stream Processing Rx4DDS - Sumant Tambe - CppCon 2015
No ratings yet
Reactive Stream Processing Rx4DDS - Sumant Tambe - CppCon 2015
51 pages
The Birth of Study Group 14 - Nicolas Guillemot, Sean Middleditch, Michael Wong - CppCon 2015
No ratings yet
The Birth of Study Group 14 - Nicolas Guillemot, Sean Middleditch, Michael Wong - CppCon 2015
44 pages
MIPS Green Card
No ratings yet
MIPS Green Card
2 pages
C++ Multi-Dimensional Arrays For Computational Physics and Applied Mathematics - Pramod Gupta - CppCon 2015
No ratings yet
C++ Multi-Dimensional Arrays For Computational Physics and Applied Mathematics - Pramod Gupta - CppCon 2015
43 pages
QT - Modern User Interfaces For C++ - Milian Wolff - CppCon 2015
No ratings yet
QT - Modern User Interfaces For C++ - Milian Wolff - CppCon 2015
43 pages
STL Features and Implementation Techniques - Stephan T. Lavavej - CppCon 2014
No ratings yet
STL Features and Implementation Techniques - Stephan T. Lavavej - CppCon 2014
47 pages
Functional Design Explained - David Sankel - CppCon 2015
No ratings yet
Functional Design Explained - David Sankel - CppCon 2015
43 pages
Contracts For Dependable C++ - Gabriel Dos Reis - CppCon 2015
No ratings yet
Contracts For Dependable C++ - Gabriel Dos Reis - CppCon 2015
35 pages
Verilog Project Report
No ratings yet
Verilog Project Report
13 pages
Lecture 1 Number - Representation ECSE 343
No ratings yet
Lecture 1 Number - Representation ECSE 343
51 pages
Cuda Math API
No ratings yet
Cuda Math API
131 pages
C++ On The Web - JF Bastien - CppCon 2015
No ratings yet
C++ On The Web - JF Bastien - CppCon 2015
24 pages
Cuuda Nvidai Guide - Part1
No ratings yet
Cuuda Nvidai Guide - Part1
15 pages
Combinepdf
No ratings yet
Combinepdf
28 pages
Easy Compilation From TouchDevelop To ARM Cortex-M0 Using C++11 - Jonathan Protzenko - CppCon 2015
No ratings yet
Easy Compilation From TouchDevelop To ARM Cortex-M0 Using C++11 - Jonathan Protzenko - CppCon 2015
20 pages
Primjeri
No ratings yet
Primjeri
20 pages
Factory Io Assembler Structurat OB1
100% (1)
Factory Io Assembler Structurat OB1
3 pages
PDF Contrato Individual de Trabajo A Termino Fijo Odontologa y Auxiliar
No ratings yet
PDF Contrato Individual de Trabajo A Termino Fijo Odontologa y Auxiliar
10 pages
HPC Final 4-8
No ratings yet
HPC Final 4-8
25 pages
Lecture 3a - Pointers in C++ Spring 2025
No ratings yet
Lecture 3a - Pointers in C++ Spring 2025
20 pages
Chapter 2 Exercise and Answer Sign
No ratings yet
Chapter 2 Exercise and Answer Sign
10 pages
UR - 1 - T2O - State: T (Robot) - O (PLC) - Robot State and Safety Mode (32 Bytes)
No ratings yet
UR - 1 - T2O - State: T (Robot) - O (PLC) - Robot State and Safety Mode (32 Bytes)
11 pages
C++ in The Telecom Industry - Yani Miguel - CppCon 2015
No ratings yet
C++ in The Telecom Industry - Yani Miguel - CppCon 2015
13 pages
PDSCUDA
No ratings yet
PDSCUDA
11 pages
Introducing Brigand - Edouard Alligand and Joel Falcou - CppCon 2015
No ratings yet
Introducing Brigand - Edouard Alligand and Joel Falcou - CppCon 2015
9 pages
Ex No: 1 Substring Removal From A String Using String Buffer Class Date
No ratings yet
Ex No: 1 Substring Removal From A String Using String Buffer Class Date
47 pages
Optimus Developer Guide
No ratings yet
Optimus Developer Guide
11 pages
Ngôn Ngữ Lập Trình Trên Arduino - Hướng Dẫn Hàm - Cộng Đồng Arduino Việt Nam
No ratings yet
Ngôn Ngữ Lập Trình Trên Arduino - Hướng Dẫn Hàm - Cộng Đồng Arduino Việt Nam
1 page
Introduction To The Cuda Programming
No ratings yet
Introduction To The Cuda Programming
25 pages
01 II Datatype Summary
No ratings yet
01 II Datatype Summary
14 pages
GX20-1703-9 System360 Reference Data 2up PDF
No ratings yet
GX20-1703-9 System360 Reference Data 2up PDF
7 pages
Motuner A Compiler-Based Auto-Tuning Approach For Mixed-Precision Operators
No ratings yet
Motuner A Compiler-Based Auto-Tuning Approach For Mixed-Precision Operators
9 pages
Filipino
No ratings yet
Filipino
13 pages
Resolução de Exercícios EDA: 1-Data Types. Arithmetic Expressions. Input and Output
No ratings yet
Resolução de Exercícios EDA: 1-Data Types. Arithmetic Expressions. Input and Output
19 pages
MySQL Data Tape
No ratings yet
MySQL Data Tape
19 pages
Floating Point Alu
No ratings yet
Floating Point Alu
11 pages
XMC Math C
No ratings yet
XMC Math C
8 pages
3 Cuda
No ratings yet
3 Cuda
5 pages
Qualcomm Math (Qmath) Library
No ratings yet
Qualcomm Math (Qmath) Library
10 pages
CUDA Developer Guide For Optimus Platforms
No ratings yet
CUDA Developer Guide For Optimus Platforms
15 pages
Lab 11 21K-3172
No ratings yet
Lab 11 21K-3172
7 pages
HPC 4 B
No ratings yet
HPC 4 B
5 pages
NVIDIA Ampere GPU Architecture Tuning Guide - Ampere Tuning Guide 12.3 Documentation
No ratings yet
NVIDIA Ampere GPU Architecture Tuning Guide - Ampere Tuning Guide 12.3 Documentation
5 pages
CS614-Assignment 1 Solution Spring 2024
No ratings yet
CS614-Assignment 1 Solution Spring 2024
4 pages
Java Questions
No ratings yet
Java Questions
14 pages
SQL Server To PostgreSQL Migration Reference
No ratings yet
SQL Server To PostgreSQL Migration Reference
2 pages
What Are Bits
No ratings yet
What Are Bits
2 pages
Explain The Single Precision Floating Point Single IEEE 754 Representation
No ratings yet
Explain The Single Precision Floating Point Single IEEE 754 Representation
2 pages
How I Gained 11 Extra Hours a Week: Without Neglecting My Family, Team, or Sanity
From Everand
How I Gained 11 Extra Hours a Week: Without Neglecting My Family, Team, or Sanity
Dewayne Coleman
No ratings yet
The art of building great products: Combine your intuition with the best proven methodologies to build digital products everyone will love
From Everand
The art of building great products: Combine your intuition with the best proven methodologies to build digital products everyone will love
Mayank Mittal
No ratings yet
The Ultimate Career Toolkit : A Step-by-Step Guide to Landing Your Dream Job
From Everand
The Ultimate Career Toolkit : A Step-by-Step Guide to Landing Your Dream Job
Ebenezer Edem Zuh
No ratings yet
Business English Study - Advanced 1 - Fashion Brands
From Everand
Business English Study - Advanced 1 - Fashion Brands
Paul S Fletcher
No ratings yet
I Don't Need an Acting Class
From Everand
I Don't Need an Acting Class
Milton Justice
5/5 (3)
SolidWorks 2021 Step-By-Step Guide: 4, #4
From Everand
SolidWorks 2021 Step-By-Step Guide: 4, #4
Amit Bhatt
No ratings yet
Financial Management Made Easy 'Self-Tuition Approach' Concise Second Edition
From Everand
Financial Management Made Easy 'Self-Tuition Approach' Concise Second Edition
DR. BEN EBO ATTOM
No ratings yet
The Ultimate Business Blueprint Guide
From Everand
The Ultimate Business Blueprint Guide
Larry Navis
No ratings yet
The Future of Learning: Revolutionizing Education Through Generative AI: AI Books, #11
From Everand
The Future of Learning: Revolutionizing Education Through Generative AI: AI Books, #11
Mohammad
No ratings yet
The Business of Audio Engineering
From Everand
The Business of Audio Engineering
Dave Hampton
No ratings yet
Coming Back to the Present: A New ACT Self-Help Workbook to Manage Stress & Live a More Rich, Full, Meaningful Life
From Everand
Coming Back to the Present: A New ACT Self-Help Workbook to Manage Stress & Live a More Rich, Full, Meaningful Life
Audrey N. Hall
No ratings yet
Being Tchitaka
From Everand
Being Tchitaka
RODRIGUE TCHITAKA
No ratings yet
ChatGPT for Business: Strategies for Success
From Everand
ChatGPT for Business: Strategies for Success
Matthew C. Smith
1/5 (1)
The Linux Terminal for Advanced Users - The Command Line Made Easy: First Edition
From Everand
The Linux Terminal for Advanced Users - The Command Line Made Easy: First Edition
Michael Basler
No ratings yet
BlockChain for Beginners
From Everand
BlockChain for Beginners
Matthew Smith
No ratings yet
Gray Hat Hacking the Ethical Hacker's
From Everand
Gray Hat Hacking the Ethical Hacker's
Çağatay Şanlı
5/5 (1)
Options Trading for Income: Learn the strategies and techniques for maximizing returns and minimizing risk in the options market (2023 Guide for Beginners)
From Everand
Options Trading for Income: Learn the strategies and techniques for maximizing returns and minimizing risk in the options market (2023 Guide for Beginners)
Lane Conner
No ratings yet
Intrusion Detection Honeypots
From Everand
Intrusion Detection Honeypots
Chris Sanders
3/5 (2)
Human Nature Potential in Nurture
From Everand
Human Nature Potential in Nurture
David L. Hawk
No ratings yet
A Discourse Analysis of 1 Peter
From Everand
A Discourse Analysis of 1 Peter
Ervin Ray Starwalt
No ratings yet
How to Sell on Amazon Fba
From Everand
How to Sell on Amazon Fba
David L. Ross
No ratings yet
The IPhone 12 Pro Photography User Guide: Your Guide For Smartphone Photography For Taking Pictures Like A Pro Even As A Beginner
From Everand
The IPhone 12 Pro Photography User Guide: Your Guide For Smartphone Photography For Taking Pictures Like A Pro Even As A Beginner
Wendy Hills
No ratings yet
Aquaponics Construct and Operate: Instructions and Everything You Need to Know
From Everand
Aquaponics Construct and Operate: Instructions and Everything You Need to Know
PE David H. Dudley PMP
No ratings yet
10K Blueprint
From Everand
10K Blueprint
Cian O Farrell
5/5 (2)
Blog Smarter, Not Harder: SEO, Blogging, and AI Strategies to Skyrocket Your Traffic
From Everand
Blog Smarter, Not Harder: SEO, Blogging, and AI Strategies to Skyrocket Your Traffic
Jay Nans
No ratings yet
Aquaponics How to do Everything from Backyard to Profitable Business: from BACKYARD to PROFITABLE BUSINESS
From Everand
Aquaponics How to do Everything from Backyard to Profitable Business: from BACKYARD to PROFITABLE BUSINESS
David H Dudley
No ratings yet
Aquaponics for Profit
From Everand
Aquaponics for Profit
David H Dudley
No ratings yet
Aquaponic Design Plans Everything You Needs to Know: Everything You Need to Know from Backyard to Profitable Business
From Everand
Aquaponic Design Plans Everything You Needs to Know: Everything You Need to Know from Backyard to Profitable Business
David H Dudley
No ratings yet
Aquaponics Design Plans, Construction, Operation, and Income: Organic Food
From Everand
Aquaponics Design Plans, Construction, Operation, and Income: Organic Food
David H Dudley
No ratings yet
A To Z of Internet: Everything You Wanted to Know
From Everand
A To Z of Internet: Everything You Wanted to Know
Bittu Kumar
No ratings yet
Software Patterns Made Easy
From Everand
Software Patterns Made Easy
Justice Nanhou
No ratings yet
Web Video Business
From Everand
Web Video Business
MUHAMMAD NUR WAHID ANUAR
No ratings yet
Kellory the Warlock
From Everand
Kellory the Warlock
Lin Carter
No ratings yet

CUDA Math API

Uploaded by

CUDA Math API

Uploaded by

CUDA Math API

API Reference Manual

vRelease Version | January 2022

CUDA Math API vRelease Version | ii

CUDA Math API vRelease Version | iii

CUDA Math API vRelease Version | iv

CUDA Math API vRelease Version | v

CUDA Math API vRelease Version | vi

CUDA Math API vRelease Version | vii

CUDA Math API vRelease Version | viii

CUDA Math API vRelease Version | ix

CUDA Math API vRelease Version | x

CUDA Math API vRelease Version | xi

CUDA Math API vRelease Version | xii

CUDA Math API vRelease Version | xiii

CUDA Math API vRelease Version | xiv

CUDA Math API vRelease Version | xv

CUDA Math API vRelease Version | xvi

CUDA Math API vRelease Version | xvii

CUDA Math API vRelease Version | xviii

CUDA Math API vRelease Version | xix

CUDA Math API vRelease Version | xx

CUDA Math API vRelease Version | xxi

CUDA Math API vRelease Version | xxii

CUDA Math API vRelease Version | xxiii

CUDA Math API vRelease Version | xxiv

CUDA Math API vRelease Version | xxv

CUDA Math API vRelease Version | xxvi

Here is a list of all modules:

‣ Half Precision Intrinsics

CUDA Math API vRelease Version | 1

1.1. Half Precision Intrinsics

Half Arithmetic Functions

__device__ __half __habs (const __half a)

CUDA Math API vRelease Version | 2

‣ The absolute value of a.

__device__ __half __hadd (const __half a, const __half b)

__device__ __half __hadd_rn (const __half a, const __half b)

__device__ __half __hadd_sat (const __half a, const __half b)

‣ The sum of a and b, with respect to saturation.

CUDA Math API vRelease Version | 3

__device__ __half __hdiv (const __half a, const __half b)

__device__ __half __hfma (const __half a, const __half b, const __half c)

__device__ __half __hfma_relu (const __half a, const __half b, const __half c)

‣ The result of fused multiply-add operation on a, b, and c with relu saturation.

CUDA Math API vRelease Version | 4

__device__ __half __hfma_sat (const __half a, const __half b, const __half c)

‣ The result of fused multiply-add operation on a, b, and c, with respect to saturation.

__device__ __half __hmul (const __half a, const __half b)

__device__ __half __hmul_rn (const __half a, const __half b)

CUDA Math API vRelease Version | 5

__device__ __half __hmul_sat (const __half a, const __half b)

‣ The result of multiplying a and b, with respect to saturation.

__device__ __half __hneg (const __half a)

__device__ __half __hsub (const __half a, const __half b)

__device__ __half __hsub_rn (const __half a, const __half b)

CUDA Math API vRelease Version | 6

__device__ __half __hsub_sat (const __half a, const __half b)

‣ The result of subtraction of b from a, with respect to saturation.

__device__ __half atomicAdd (const __half *address, const __half val)

‣ The old value read from address.

CUDA Math API vRelease Version | 7

1.1.2. Half2 Arithmetic Functions

__device__ __half2 __h2div (const __half2 a, const __half2 b)

__device__ __half2 __habs2 (const __half2 a)

‣ Returns a with the absolute value of both halves.

CUDA Math API vRelease Version | 8

__device__ __half2 __hadd2 (const __half2 a, const __half2 b)

__device__ __half2 __hadd2_rn (const __half2 a, const __half2 b)

__device__ __half2 __hadd2_sat (const __half2 a, const __half2 b)

‣ The sum of a and b, with respect to saturation.

CUDA Math API vRelease Version | 9

__device__ __half2 __hcmadd (const __half2 a, const __half2 b, const __half2 c)

‣ The result of complex multiply-accumulate operation on complex numbers a, b, and c.

__device__ __half2 __hfma2 (const __half2 a, const __half2 b, const __half2 c)

__device__ __half2 __hfma2_relu (const __half2 a, const __half2 b, const __half2 c)

__device__ __half __habs (const __half a)

__device__ __half __hadd (const __half a, const __half b)

__device__ __half __hadd_rn (const __half a, const __half b)

__device__ __half __hadd_sat (const __half a, const __half b)

__device__ __half __hdiv (const __half a, const __half b)

__device__ __half __hfma (const __half a, const __half b, const __half c)

__device__ __half __hfma_relu (const __half a, const __half b, const __half c)

__device__ __half __hfma_sat (const __half a, const __half b, const __half c)

__device__ __half __hmul (const __half a, const __half b)

__device__ __half __hmul_rn (const __half a, const __half b)

__device__ __half __hmul_sat (const __half a, const __half b)

__device__ __half __hneg (const __half a)

__device__ __half __hsub (const __half a, const __half b)

__device__ __half __hsub_rn (const __half a, const __half b)

__device__ __half __hsub_sat (const __half a, const __half b)

__device__ __half atomicAdd (const __half *address, const __half val)

__device__ __half2 __h2div (const __half2 a, const __half2 b)

__device__ __half2 __habs2 (const __half2 a)

__device__ __half2 __hadd2 (const __half2 a, const __half2 b)

__device__ __half2 __hadd2_rn (const __half2 a, const __half2 b)

__device__ __half2 __hadd2_sat (const __half2 a, const __half2 b)

__device__ __half2 __hcmadd (const __half2 a, const __half2 b, const __half2 c)

__device__ __half2 __hfma2 (const __half2 a, const __half2 b, const __half2 c)

__device__ __half2 __hfma2_relu (const __half2 a, const __half2 b, const __half2 c)

__device__ __half2 __hfma2_sat (const __half2 a, const __half2 b, const __half2 c)

__device__ __half2 __hmul2 (const __half2 a, const __half2 b)

__device__ __half2 __hmul2_rn (const __half2 a, const __half2 b)

__device__ __half2 __hmul2_sat (const __half2 a, const __half2 b)

__device__ __half2 __hneg2 (const __half2 a)

__device__ __half2 __hsub2 (const __half2 a, const __half2 b)

__device__ __half2 __hsub2_rn (const __half2 a, const __half2 b)

__device__ __half2 __hsub2_sat (const __half2 a, const __half2 b)

__device__ __half2 atomicAdd (const __half2 *address, const __half2 val)

__device__ bool __heq (const __half a, const __half b)

__device__ bool __hequ (const __half a, const __half b)

__device__ bool __hge (const __half a, const __half b)

__device__ bool __hgeu (const __half a, const __half b)

__device__ bool __hgt (const __half a, const __half b)

__device__ bool __hgtu (const __half a, const __half b)

__device__ int __hisinf (const __half a)

__device__ bool __hisnan (const __half a)

__device__ bool __hle (const __half a, const __half b)

__device__ bool __hleu (const __half a, const __half b)

__device__ bool __hlt (const __half a, const __half b)

__device__ bool __hltu (const __half a, const __half b)

__device__ __half __hmax (const __half a, const __half b)

__device__ __half __hmax_nan (const __half a, const __half b)

__device__ __half __hmin (const __half a, const __half b)

__device__ __half __hmin_nan (const __half a, const __half b)

__device__ bool __hne (const __half a, const __half b)

__device__ bool __hneu (const __half a, const __half b)

__device__ bool __hbeq2 (const __half2 a, const __half2 b)

__device__ bool __hbequ2 (const __half2 a, const __half2 b)

__device__ bool __hbge2 (const __half2 a, const __half2 b)

__device__ bool __hbgeu2 (const __half2 a, const __half2 b)