0% found this document useful (0 votes)

99 views

Lab 3

The document discusses two lab exercises for a course on vectorization and parallelization. The first exercise implements a matrix multiplication algorithm using AVX2 transpose-and-multiply. Performance is evaluated for single-threaded and multi-threaded versions. The second exercise implements a vectorized parallel algorithm for normalizing arrays of 3D points in an AoS (array of structures) layout.

Uploaded by

Bburnae Brndlgr

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

99 views

Lab 3

Uploaded by

Bburnae Brndlgr

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 1

B190900804 Э.

Мөнх-Оргил Lab 3

ЛАБОРАТОРИЙН АЖИЛ 3 Вектор регистр ашигласан алгогритмууд

Даалгавар 1. Лекц 5-дээр үзсэн AVX2 ашигласан Transpose-and-Multiply.

In [4]: %%writefile transs.cpp
#include <random>
#include <cstdint>
#include <iostream>
#include <immintrin.h>

#include "/content/hpc_helpers.hpp"

void init(float * data, uint64_t length) {

std::mt19937 engine(42);
std::uniform_real_distribution<float> density(-1, 1);

for (uint64_t i = 0; i < length; i++)

data[i] = density(engine);
}

inline float hsum_sse3(__m128 v) {

__m128 shuf = _mm_movehdup_ps(v);
__m128 maxs = _mm_add_ps(v, shuf);
shuf = _mm_movehl_ps(shuf, maxs);
maxs = _mm_add_ss(maxs, shuf);
return _mm_cvtss_f32(maxs);
}

inline float hsum_avx(__m256 v) {

__m128 lo = _mm256_castps256_ps128(v);
__m128 hi = _mm256_extractf128_ps(v, 1);
lo = _mm_add_ps(lo, hi);
return hsum_sse3(lo);
}

void plain_dmm(float * A,
float * B,
float * C,
uint64_t M,
uint64_t L,
uint64_t N,
bool parallel) {

#pragma omp parallel for collapse(2) if(parallel)

for (uint64_t i = 0; i < M; i++)
for (uint64_t j = 0; j < N; j++) {
float accum = float(0);
for (uint64_t k = 0; k < L; k++)
accum += A[i*L+k]*B[j*L+k];
C[i*N+j] = accum;
}
}

void avx_dmm(float * A,
float * B,
float * C,
uint64_t M,
uint64_t L,
uint64_t N,
bool parallel) {

#pragma omp parallel for collapse(2) if(parallel)

for (uint64_t i = 0; i < M; i++)
for (uint64_t j = 0; j < N; j++) {

__m256 X = _mm256_setzero_ps();
for (uint64_t k = 0; k < L; k += 8) {
const __m256 AV = _mm256_load_ps(A+i*L+k);
const __m256 BV = _mm256_load_ps(B+j*L+k);
X = _mm256_add_ps(X, _mm256_mul_ps(AV, BV));
}

C[i*N+j] = hsum_avx(X);
}
}

void avx2_tmm(float * A,
float * B,
float * C,
uint64_t M,
uint64_t L,
uint64_t N,
bool parallel) {
#pragma omp parallel for collapse(2) if(parallel)
for (uint64_t i = 0; i<M; i++)
for (uint64_t j=0; j<N; j++) {

__m256 X = _mm256_setzero_ps();
for (uint64_t k = 0; k < L; k += 8) {
const __m256 AV = _mm256_load_ps(A+i*L+k);
const __m256 BV = _mm256_load_ps(B+j*L+k);
X = _mm256_fmadd_ps(X, _mm256_mul_ps(AV, BV));
}

C[i*N+j] = hsum_avx(X);

}
}

void avx_dmm_unroll_2(float * A,
float * B,
float * C,
uint64_t M,
uint64_t L,
uint64_t N,
bool parallel) {

#pragma omp parallel for collapse(2) if(parallel)

for (uint64_t i = 0; i < M; i++)
for (uint64_t j = 0; j < N; j++) {

__m256 X = _mm256_setzero_ps();
__m256 Y = _mm256_setzero_ps();
for (uint64_t k = 0; k < L; k += 16) {
const __m256 AVX = _mm256_load_ps(A+i*L+k+0);
const __m256 BVX = _mm256_load_ps(B+j*L+k+0);
const __m256 AVY = _mm256_load_ps(A+i*L+k+8);
const __m256 BVY = _mm256_load_ps(B+j*L+k+8);
X = _mm256_add_ps(X, _mm256_mul_ps(AVX, BVX));
Y = _mm256_add_ps(X, _mm256_mul_ps(AVY, BVY));
}

C[i*N+j] = hsum_avx(X)+hsum_avx(Y);
}
}

int main () {

const uint64_t M = 1UL << 10;

const uint64_t L = 1UL << 11;
const uint64_t N = 1UL << 12;

TIMERSTART(alloc_memory)
auto A = static_cast<float*>(_mm_malloc(M*L*sizeof(float) , 32));
auto B = static_cast<float*>(_mm_malloc(N*L*sizeof(float) , 32));
auto C = static_cast<float*>(_mm_malloc(M*N*sizeof(float) , 32));
TIMERSTOP(alloc_memory)

TIMERSTART(init)
init(A, M*L);
init(B, N*L);
TIMERSTOP(init)

TIMERSTART(plain_dmm_single)
plain_dmm(A, B, C, M, L, N, false);
TIMERSTOP(plain_dmm_single)

TIMERSTART(plain_dmm_multi)
plain_dmm(A, B, C, M, L, N, true);
TIMERSTOP(plain_dmm_multi)

TIMERSTART(avx_dmm_single)
avx_dmm(A, B, C, M, L, N, false);
TIMERSTOP(avx_dmm_single)

TIMERSTART(avx_dmm_multi)
avx_dmm(A, B, C, M, L, N, true);
TIMERSTOP(avx_dmm_multi)

TIMERSTART(avx2_tmm_single)
avx2_tmm(A, B, C, M, L, N, false);
TIMERSTOP(avx2_tmm_single)

TIMERSTART(avx2_tmm_multi)
avx2_tmm(A, B, C, M, L, N, true);
TIMERSTOP(avx2_tmm_multi)

TIMERSTART(avx_dmm_unroll_2_single)
avx_dmm_unroll_2(A, B, C, M, L, N, false);
TIMERSTOP(avx_dmm_unroll_2_single)

TIMERSTART(avx_dmm_unroll_2_multi)
avx_dmm_unroll_2(A, B, C, M, L, N, true);
TIMERSTOP(avx_dmm_unroll_2_multi)

TIMERSTART(free_memory)
_mm_free(A);
_mm_free(B);
_mm_free(C);
TIMERSTOP(free_memory)
}

Overwriting transs.cpp

In [5]: %%script bash

g++ -mavx2 -std=c++17 transs.cpp -o transs
./transs

# elapsed time (alloc_memory): 8.0455e-05s

# elapsed time (init): 0.573362s
# elapsed time (plain_dmm_single): 30.1376s
# elapsed time (plain_dmm_multi): 30.1157s
# elapsed time (avx_dmm_single): 8.4705s
# elapsed time (avx_dmm_multi): 8.55333s
# elapsed time (avx2_tmm_single): 8.6349s
# elapsed time (avx2_tmm_multi): 8.65955s
# elapsed time (avx_dmm_unroll_2_single): 8.29264s
# elapsed time (avx_dmm_unroll_2_multi): 8.23551s
# elapsed time (free_memory): 0.00238831s

Дүгнэлт
Энэхүү даалгавараар Лекц 5-дээр үзсэн AVX2 ийн Transpose-and-Multiply алгоритм ийг хэрэгжүүлж, хэр их хугацаа өнгөрсөнг хэвлэж гаргав.

Сурах бичиг дээр өгөгдсөн hpc_helpers.hpp ашиглаж өнгөрсөн хугацааг тодорхойлсон.

Зөвхөн AVX2 биш сурах бичиг дээр байгаа бусад алгоритмийн хамт ажиллуулж, харицуулсан болно.

Жирийн AVX transpose & multiply single алгоритм нь 8.4705s бол parallel нь 8.55333s зарцуулсан. AVX2 transpose & multiply алгоритм болох
avx2_tmm_single нь 8.6349s бол avx2_tmm_multi нь 8.65955s зарцуулсан болно.

Энэхүү хариу миний ашиглаж байгаа интернетээс хамаарч өөр гарсан байх боломжтой

Даалгавар 2. Лекц 6-дээр үзсэн AoS дээрх Vectorized нормалчлал гэсэн

параллел алгоритмыг хэрэгжүүлнэ.
In [7]: %%writefile aos.cpp
#include <random>
#include <cstdint>
#include <iostream>
#include <immintrin.h>
#include "/content/hpc_helpers.hpp"

void aos_init(float * xyz, uint64_t length) {

std::mt19937 engine(42);
std::uniform_real_distribution<float> density(-1, 1);

for (uint64_t i = 0; i < 3*length; i++)

xyz[i] = density(engine);
}

void avx_aos_norm(float * xyz, uint64_t length) {

for (uint64_t i = 0; i < 3length; i += 38) {

m128 M = (m128) (xyz+i);

__m256 M03;
__m256 M14;
__m256 M25;

M03 = _mm256_castps128_ps256(M[0]);
M14 = _mm256_castps128_ps256(M[1]);
M25 = _mm256_castps128_ps256(M[2]);

M03 = _mm256_insertf128_ps(M03 ,M[3],1);

M14 = _mm256_insertf128_ps(M14 ,M[4],1);
M25 = _mm256_insertf128_ps(M25 ,M[5],1);

__m256 XY = _mm256_shuffle_ps(M14, M25, _MM_SHUFFLE( 2,1,3,2));

__m256 YZ = _mm256_shuffle_ps(M03, M14, _MM_SHUFFLE( 1,0,2,1));
__m256 X = _mm256_shuffle_ps(M03, XY , _MM_SHUFFLE( 2,0,3,0));
__m256 Y = _mm256_shuffle_ps(YZ , XY , _MM_SHUFFLE( 3,1,2,0));
__m256 Z = _mm256_shuffle_ps(YZ , M25, _MM_SHUFFLE( 3,0,3,1));

__m256 R = _mm256_add_ps(_mm256_mul_ps(X, X),

_mm256_add_ps(_mm256_mul_ps(Y, Y),
_mm256_mul_ps(Z, Z)));

R = _mm256_rsqrt_ps(R);

X = _mm256_mul_ps(X, R);
Y = _mm256_mul_ps(Y, R);
Z = _mm256_mul_ps(Z, R);

__m256 RXY = _mm256_shuffle_ps(X,Y, _MM_SHUFFLE(2,0,2,0));

__m256 RYZ = _mm256_shuffle_ps(Y,Z, _MM_SHUFFLE(3,1,3,1));
__m256 RZX = _mm256_shuffle_ps(Z,X, _MM_SHUFFLE(3,1,2,0));
__m256 R03 = _mm256_shuffle_ps(RXY, RZX, _MM_SHUFFLE(2,0,2,0));
__m256 R14 = _mm256_shuffle_ps(RYZ, RXY, _MM_SHUFFLE(3,1,2,0));
__m256 R25 = _mm256_shuffle_ps(RZX, RYZ, _MM_SHUFFLE(3,1,3,1));

M[0] = _mm256_castps256_ps128(R03);
M[1] = _mm256_castps256_ps128(R14);
M[2] = _mm256_castps256_ps128(R25);
M[3] = _mm256_extractf128_ps(R03, 1);
M[4] = _mm256_extractf128_ps(R14, 1);
M[5] = _mm256_extractf128_ps(R25, 1);
}
}

void aos_check(float * xyz, uint64_t length) {

for (uint64_t i = 0; i < 3*length; i += 3) {

const float x = xyz[i+0];

const float y = xyz[i+1];
const float z = xyz[i+2];

float rho = xx+yy+z*z;

if ((rho-1)*(rho-1) > 1E-6)

std::cout << "error too big at position "
<< i << std::endl;
}
}
int main () {

const uint64_t num_vectors = 1UL << 28;

const uint64_t num_bytes = 3*num_vectors*sizeof(float);

TIMERSTART(alloc_memory)
auto xyz = static_cast<float*>(_mm_malloc(num_bytes , 32));
TIMERSTOP(alloc_memory)

TIMERSTART(init)
aos_init(xyz, num_vectors);
TIMERSTOP(init)

TIMERSTART(avx_aos_normalize)
avx_aos_norm(xyz, num_vectors);
TIMERSTOP(avx_aos_normalize)

TIMERSTART(check)
aos_check(xyz, num_vectors);
TIMERSTOP(check)

TIMERSTART(free_memory)
_mm_free(xyz);
TIMERSTOP(free_memory)
}

Overwriting aos.cpp

In [8]: %%script bash

g++ -mavx2 -std=c++17 aos.cpp -o aos
./aos

# elapsed time (alloc_memory): 0.00168678s

# elapsed time (init): 43.318s
# elapsed time (avx_aos_normalize): 1.48238s
# elapsed time (check): 1.90671s
# elapsed time (free_memory): 0.116367s

tcmalloc: large alloc 3221225472 bytes == 0x55c36aa4a000 @ 0x7f4768478b6b 0x7f4768498379 0x55c369825bfb 0x55c3698263
db 0x7f476796cc87 0x55c369825aba

Дүгнэлт
Лекц 6-дээр үзсэн AoS дээрх Vectorized нормалчлал гэсэн параллел алгоритмыг хэрэгжүүлэв.

Санах ой бэлдэх хугацаа, Initialization хийх хугацаа, avx_aos_normalize хугацаа, шалгах хугацаа, санах ойг суллах хугацааг хэвлэж
харуулав.

Сурах бичиг дээр өгөгдсөн hpc_helpers.hpp ашиглаж өнгөрсөн хугацааг тодорхойлсон.

AOS форматад 3D векторыг 256 бит регистр SoA формат руу шилжүүлнэ. SoA форматыг ашиглан Vectorized SIMD тооцооллоно. SoA ээс үр
дүнг AoS формат руу шилжүүлнэ. Векторыг холих үйлдэл ашиглана.

Ашигласан Материал
Сурах бичиг: An Introduction to Modern Parallel Programming

Лекц 5 болон 6 ийн хичээлийн ppt

Сурах бичиг дээр өгөгдсөн hpc_helpers.hpp файл

CLARA CLARANS Example
No ratings yet
CLARA CLARANS Example
3 pages
XL CelMap - FX
No ratings yet
XL CelMap - FX
35 pages
Experiment No-8: OBJECT: - To Design IIR Filter Using TMS320C6713 Digital Signal Processing Starter Kit
No ratings yet
Experiment No-8: OBJECT: - To Design IIR Filter Using TMS320C6713 Digital Signal Processing Starter Kit
7 pages
Log
No ratings yet
Log
4,039 pages
HPE and Intel Presentation
100% (1)
HPE and Intel Presentation
33 pages
Design A 4×1 Multiplexer Using Pass Transistor Logic in Schematic and Simulate For Transient Characteristics.
100% (1)
Design A 4×1 Multiplexer Using Pass Transistor Logic in Schematic and Simulate For Transient Characteristics.
6 pages
7.performance Analysis of Wallace Tree Multiplier With Kogge Stone Adder Using 15-4 Compressor
No ratings yet
7.performance Analysis of Wallace Tree Multiplier With Kogge Stone Adder Using 15-4 Compressor
38 pages
90nm Cmos Ekv v301.01
No ratings yet
90nm Cmos Ekv v301.01
4 pages
Simulation of Rotating Triangle
No ratings yet
Simulation of Rotating Triangle
18 pages
ECE 485 Cache Simulation Report
No ratings yet
ECE 485 Cache Simulation Report
30 pages
B052b8dfabafb4e5!1906 PDF
100% (2)
B052b8dfabafb4e5!1906 PDF
3 pages
Design and Implementation of FIR Filter Based On Dual Quality Compressor Based Multipliers With MFA
No ratings yet
Design and Implementation of FIR Filter Based On Dual Quality Compressor Based Multipliers With MFA
24 pages
Whatsapp Patch File
No ratings yet
Whatsapp Patch File
3 pages
Game Programming
No ratings yet
Game Programming
64 pages
Matrix Multiplication Using SIMD Technologies
No ratings yet
Matrix Multiplication Using SIMD Technologies
13 pages
Verilog HDL: Module
No ratings yet
Verilog HDL: Module
9 pages
Image Processing To Manipulate RGB Values Using Verilog.
No ratings yet
Image Processing To Manipulate RGB Values Using Verilog.
5 pages
Fir Filter Verilog Code
100% (1)
Fir Filter Verilog Code
3 pages
SMDP - Project - Final - Proposal - NIT - N - Draft PDF
No ratings yet
SMDP - Project - Final - Proposal - NIT - N - Draft PDF
17 pages
Allprob 08 N 2
No ratings yet
Allprob 08 N 2
11 pages
DFT/ FFT Using TMS320C5515 TM eZDSP USB Stick: Via CCS and Matlab
No ratings yet
DFT/ FFT Using TMS320C5515 TM eZDSP USB Stick: Via CCS and Matlab
12 pages
Homework2 Soln Dev
No ratings yet
Homework2 Soln Dev
18 pages
DSP Processor and Architecture
No ratings yet
DSP Processor and Architecture
45 pages
Computer Graphics Polygon Clipping: by Asmita Nag
No ratings yet
Computer Graphics Polygon Clipping: by Asmita Nag
26 pages
List of Experiments CPP
No ratings yet
List of Experiments CPP
5 pages
Synopsys ASIC Tutorial: Text in Red Through Out The Document Is Indicating A Requirement For The Final Report
No ratings yet
Synopsys ASIC Tutorial: Text in Red Through Out The Document Is Indicating A Requirement For The Final Report
94 pages
Bresenham'S Ellipse Drawing Algorithm: / REGION 1
No ratings yet
Bresenham'S Ellipse Drawing Algorithm: / REGION 1
3 pages
Ahb Slave Agent Classes - Code
100% (1)
Ahb Slave Agent Classes - Code
12 pages
Adder Kogge Stone 32bit With Test Bench
50% (2)
Adder Kogge Stone 32bit With Test Bench
9 pages
Computation and Programming in Physics - Euler & Runge-Kutta Methods
No ratings yet
Computation and Programming in Physics - Euler & Runge-Kutta Methods
34 pages
Exercise 01 As Chapter Assembly (2 Point) Ex1:. ADD: Will Store The Remainder To Destination SUB: Will Store The Remainder To Destination Mul
No ratings yet
Exercise 01 As Chapter Assembly (2 Point) Ex1:. ADD: Will Store The Remainder To Destination SUB: Will Store The Remainder To Destination Mul
4 pages
4 Bit Full Adder
No ratings yet
4 Bit Full Adder
5 pages
Floating Point Multiplier
No ratings yet
Floating Point Multiplier
15 pages
Borland Graphics Interface (BGI) For Windows
No ratings yet
Borland Graphics Interface (BGI) For Windows
3 pages
Data Converters
No ratings yet
Data Converters
37 pages
ADC0808
No ratings yet
ADC0808
15 pages
To Read and Write in To The Internal EEPROM Memory of PIC16F877A Using An Assembly Language
100% (1)
To Read and Write in To The Internal EEPROM Memory of PIC16F877A Using An Assembly Language
6 pages
Assignment - 1
0% (1)
Assignment - 1
4 pages
Asymptotic Notation - Asymptotic Notation in Data Structure FreeFeast PDF
No ratings yet
Asymptotic Notation - Asymptotic Notation in Data Structure FreeFeast PDF
5 pages
Design and Analysis of Approximate Compressors For Multiplication
No ratings yet
Design and Analysis of Approximate Compressors For Multiplication
11 pages
Chapter-1: Tower of Hanoi
No ratings yet
Chapter-1: Tower of Hanoi
28 pages
Caed Question Bank
No ratings yet
Caed Question Bank
28 pages
Exercises 02
No ratings yet
Exercises 02
6 pages
Low Power Digital VLSI Design - Circuits and Systems - Abdellatif Bellaouar, Mohamed I Elmasry
No ratings yet
Low Power Digital VLSI Design - Circuits and Systems - Abdellatif Bellaouar, Mohamed I Elmasry
15 pages
Microprocessor Experiment No.: 1 Use of Programming Tools Microprocessor Lab Experiment No.: 1 Use of Programming Tools
No ratings yet
Microprocessor Experiment No.: 1 Use of Programming Tools Microprocessor Lab Experiment No.: 1 Use of Programming Tools
8 pages
DAA E-Lab
No ratings yet
DAA E-Lab
25 pages
VLSI Design
No ratings yet
VLSI Design
8 pages
Array Multiplier 8x8 Verilog Code
100% (1)
Array Multiplier 8x8 Verilog Code
5 pages
Assignment 5 - OpenCL Optimizations
100% (1)
Assignment 5 - OpenCL Optimizations
2 pages
Index (C++)
No ratings yet
Index (C++)
25 pages
LPC2148 Microcontroller Architecture and
No ratings yet
LPC2148 Microcontroller Architecture and
50 pages
CS2312-c++ Lab
No ratings yet
CS2312-c++ Lab
35 pages
Question Bank CAT 1
100% (1)
Question Bank CAT 1
6 pages
Verilog Codes
No ratings yet
Verilog Codes
102 pages
Brent Kung Adder 16 Bit Full Code
100% (2)
Brent Kung Adder 16 Bit Full Code
8 pages
CG End-Sem
No ratings yet
CG End-Sem
61 pages
Practical
50% (4)
Practical
98 pages
joint_matrix_bfloat16_modified
No ratings yet
joint_matrix_bfloat16_modified
4 pages
Web GPU
0% (1)
Web GPU
40 pages
Correction-Exam-SP-Arch-parallele-21-22
No ratings yet
Correction-Exam-SP-Arch-parallele-21-22
3 pages
Global SLP - Review Meeting
No ratings yet
Global SLP - Review Meeting
29 pages
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet
Blackmagic RAW SDK
No ratings yet
Blackmagic RAW SDK
88 pages
Emu Log
No ratings yet
Emu Log
2 pages
MB Manual Intel800-Bios e
No ratings yet
MB Manual Intel800-Bios e
30 pages
Fundamentals of Computer Assignment Report
No ratings yet
Fundamentals of Computer Assignment Report
27 pages
Intel's Haswell CPU Microarchitecture
No ratings yet
Intel's Haswell CPU Microarchitecture
17 pages
Intel® Core™ I3 Desktop Processor Comparison Chart: Technical Documents Datashseets, Design Guides, Etc
No ratings yet
Intel® Core™ I3 Desktop Processor Comparison Chart: Technical Documents Datashseets, Design Guides, Etc
2 pages
Accelerating GNSS Software Receivers
No ratings yet
Accelerating GNSS Software Receivers
18 pages
SDM Change Document
No ratings yet
SDM Change Document
2,819 pages
2nd-gen-xeon-scalable-datasheet-vol-2
No ratings yet
2nd-gen-xeon-scalable-datasheet-vol-2
60 pages
Intel® Core™ Laptop Processors Comparison Chart: Updated: May 2021
No ratings yet
Intel® Core™ Laptop Processors Comparison Chart: Updated: May 2021
7 pages
614073-336062-Xeon Scalable Mem Datasheet Vol2 R4
No ratings yet
614073-336062-Xeon Scalable Mem Datasheet Vol2 R4
62 pages
Pexip Infinity Server Design Guide V33.a
No ratings yet
Pexip Infinity Server Design Guide V33.a
35 pages
Intel Optimization Reference Manual V1 050
No ratings yet
Intel Optimization Reference Manual V1 050
895 pages
64 Ia 32 Architectures Software Developer Instruction Set Reference Manual 325383 PDF
No ratings yet
64 Ia 32 Architectures Software Developer Instruction Set Reference Manual 325383 PDF
1,479 pages
Emu Log
No ratings yet
Emu Log
14 pages
Ipp - User Guide 8.0 U1
No ratings yet
Ipp - User Guide 8.0 U1
46 pages
x86 Intrinsics Cheat Sheet: Jan Finis Finis@in - Tum.de
100% (1)
x86 Intrinsics Cheat Sheet: Jan Finis Finis@in - Tum.de
1 page
[FREE PDF sample] Modern X86 Assembly Language Programming: Covers x86 64-bit, AVX, AVX2, and AVX-512 Daniel Kusswurm ebooks
100% (2)
[FREE PDF sample] Modern X86 Assembly Language Programming: Covers x86 64-bit, AVX, AVX2, and AVX-512 Daniel Kusswurm ebooks
65 pages
Intro To MPI
No ratings yet
Intro To MPI
44 pages
Hs Err Pid4356
No ratings yet
Hs Err Pid4356
16 pages
Introduction to 64 Bit Windows Assembly Programming Fourth Edition Ray Seyfarth - Quickly download the ebook to never miss any content
100% (2)
Introduction to 64 Bit Windows Assembly Programming Fourth Edition Ray Seyfarth - Quickly download the ebook to never miss any content
47 pages
The Significance of SIMD, SSE and AVX - Intel - Slides (3a - SIMD)
No ratings yet
The Significance of SIMD, SSE and AVX - Intel - Slides (3a - SIMD)
57 pages
Emu Log
No ratings yet
Emu Log
29 pages
AMD64 Technology AMD64 Architecture Programmer's Manual 128-Bit and 256-Bit XOP and FMA4 Instructions
No ratings yet
AMD64 Technology AMD64 Architecture Programmer's Manual 128-Bit and 256-Bit XOP and FMA4 Instructions
276 pages
Openstack Enhanced Platform Awareness
No ratings yet
Openstack Enhanced Platform Awareness
8 pages
Intel Core I7-7700 Mi Procesador
No ratings yet
Intel Core I7-7700 Mi Procesador
1 page
Message
No ratings yet
Message
7 pages
Andes RVV Webinar III
No ratings yet
Andes RVV Webinar III
49 pages

Lab 3

Uploaded by

Lab 3

Uploaded by

B190900804 Э.

ЛАБОРАТОРИЙН АЖИЛ 3 Вектор регистр ашигласан алгогритмууд

Даалгавар 1. Лекц 5-дээр үзсэн AVX2 ашигласан Transpose-and-Multiply.

void init(float * data, uint64_t length) {

for (uint64_t i = 0; i < length; i++)

inline float hsum_sse3(__m128 v) {

inline float hsum_avx(__m256 v) {

#pragma omp parallel for collapse(2) if(parallel)

#pragma omp parallel for collapse(2) if(parallel)

#pragma omp parallel for collapse(2) if(parallel)

const uint64_t M = 1UL << 10;

In [5]: %%script bash

# elapsed time (alloc_memory): 8.0455e-05s

Сурах бичиг дээр өгөгдсөн hpc_helpers.hpp ашиглаж өнгөрсөн хугацааг тодорхойлсон.

Даалгавар 2. Лекц 6-дээр үзсэн AoS дээрх Vectorized нормалчлал гэсэн

void aos_init(float * xyz, uint64_t length) {

for (uint64_t i = 0; i < 3*length; i++)

void avx_aos_norm(float * xyz, uint64_t length) {

for (uint64_t i = 0; i < 3*length; i += 3*8) {

__m128 *M = (__m128*) (xyz+i);

M03 = _mm256_insertf128_ps(M03 ,M[3],1);

__m256 XY = _mm256_shuffle_ps(M14, M25, _MM_SHUFFLE( 2,1,3,2));

__m256 R = _mm256_add_ps(_mm256_mul_ps(X, X),

__m256 RXY = _mm256_shuffle_ps(X,Y, _MM_SHUFFLE(2,0,2,0));

void aos_check(float * xyz, uint64_t length) {

for (uint64_t i = 0; i < 3*length; i += 3) {

const float x = xyz[i+0];

float rho = x*x+y*y+z*z;

if ((rho-1)*(rho-1) > 1E-6)

const uint64_t num_vectors = 1UL << 28;

In [8]: %%script bash

# elapsed time (alloc_memory): 0.00168678s

Сурах бичиг дээр өгөгдсөн hpc_helpers.hpp ашиглаж өнгөрсөн хугацааг тодорхойлсон.

Лекц 5 болон 6 ийн хичээлийн ppt

Сурах бичиг дээр өгөгдсөн hpc_helpers.hpp файл

You might also like

for (uint64_t i = 0; i < 3length; i += 38) {

m128 M = (m128) (xyz+i);

float rho = xx+yy+z*z;