0% found this document useful (0 votes)
15 views6 pages

FFT Full

The document contains assembly code for a Fast Fourier Transform (FFT) implementation, including functions for computing log2(N), bit-reversal reordering, and the vectorized butterfly stages. It defines data sections for real and imaginary parts, temporary storage, and precomputed twiddle factors and bit-reversal indices. The main execution starts at the _start label, which sets up the stack, runs the FFT, and prints the results.

Uploaded by

fawad.sidd17
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
15 views6 pages

FFT Full

The document contains assembly code for a Fast Fourier Transform (FFT) implementation, including functions for computing log2(N), bit-reversal reordering, and the vectorized butterfly stages. It defines data sections for real and imaginary parts, temporary storage, and precomputed twiddle factors and bit-reversal indices. The main execution starts at the _start label, which sets up the stack, runs the FFT, and prints the results.

Uploaded by

fawad.sidd17
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 6

.section .text
.global _start
#-------------------------------------------------------------------------------
# _start: program entry point.
#   1. sets sp and a fallback return address,
#   2. stores log2(size) into logsize via setlogN,
#   3. runs the forward transform (vFFT) over real[]/imag[],
#   4. calls print, then parks in _finish.
#-------------------------------------------------------------------------------
_start:
    # -- Stack & return setup --
    la      sp, STACK               # sp = address of the STACK symbol
    la      ra, _finish             # a stray return from here lands in _finish

    # -- Compute log2(N) for later loops --
    lw      a0, size                # a0 = N
    call    setlogN                 # logsize = log2(N); read by vTransform

    # -- Run FFT --
    la      a0, real                # a0 = &real[0]
    la      a1, imag                # a1 = &imag[0]
    lw      a2, size                # a2 = N
    call    vFFT

    # -- Print or inspect results --
    # print walks the arrays starting from a0/a1.  Argument registers are
    # caller-saved, so reload the bases instead of assuming vFFT kept them.
    la      a0, real
    la      a1, imag
    call    print

    # -- hang forever --
    j       _finish

#-------------------------------------------------------------------------------
# setlogN: logsize = log2(a0)
#-------------------------------------------------------------------------------
setlogN:
    # In:    a0 = value whose base-2 logarithm is wanted
    # Out:   logsize (memory word) = 31 - clz(a0)
    # Clobb: t0 (= &logsize on exit), t1 (= stored value on exit)
    # clz comes from the Zbb bit-manipulation extension.
    clz     t1, a0                  # t1 = number of leading zero bits in a0
    neg     t1, t1                  # t1 = -clz(a0)
    addi    t1, t1, 31              # t1 = 31 - clz(a0): index of the top set bit
    la      t0, logsize
    sw      t1, 0(t0)               # publish the result
    ret

#-------------------------------------------------------------------------------
# vOrdina: bit-reverse reorder using a lookup table
# a0 = &real, a1 = &imag, a2 = N
#-------------------------------------------------------------------------------
vOrdina:
    # Bit-reversal permutation of two parallel 32-bit arrays, in place:
    #   pass 1: real_temp[k] = real[bitrev[k]], imag_temp[k] = imag[bitrev[k]]
    #   pass 2: copy real_temp/imag_temp back over real/imag
    #
    # In:    a0 = base of the real array, a1 = base of the imaginary array,
    #        a2 = N (element count)
    # Out:   arrays at a0/a1 permuted; a0, a1, a2 and ra are never written.
    # Clobb: t0-t6, v0-v2, vl/vtype
    # Stack: none (leaf routine).
    #
    # bitrev is described in this file as a table of N .word element indices
    # (0..N-1).  Indexed vector loads take BYTE offsets, so each index is
    # scaled by 4 before the gather.
    # NOTE(review): bitrev.inc is not visible here - if it already stores byte
    # offsets, drop the vsll.vi below.
    blez    a2, .Lvo_done           # N <= 0: nothing to reorder

    # ---- pass 1: gather through the bit-reversal table into the temps ----
    la      t0, bitrev              # t0 = next table entry
    la      t1, real_temp           # t1 = next real_temp slot
    la      t2, imag_temp           # t2 = next imag_temp slot
    mv      t3, a2                  # t3 = elements still to process (> 0)

.Lvo_gather:
    vsetvli t4, t3, e32, m1, ta, ma # t4 = vl = elements handled this round
    vle32.v v0, (t0)                # v0 = vl reversed element indices
    vsll.vi v0, v0, 2               # element index -> byte offset
    vloxei32.v v1, (a0), v0         # v1 = real[bitrev[k]]
    vloxei32.v v2, (a1), v0         # v2 = imag[bitrev[k]]
    vse32.v v1, (t1)                # store in natural order
    vse32.v v2, (t2)

    slli    t5, t4, 2               # t5 = vl * 4 bytes
    add     t0, t0, t5
    add     t1, t1, t5
    add     t2, t2, t5
    sub     t3, t3, t4
    bnez    t3, .Lvo_gather

    # ---- pass 2: copy the temps back over the caller's arrays ----
    la      t1, real_temp
    la      t2, imag_temp
    mv      t0, a0                  # t0 = destination cursor (real)
    mv      t6, a1                  # t6 = destination cursor (imag)
    mv      t3, a2                  # t3 = elements still to copy (> 0)

.Lvo_copy:
    vsetvli t4, t3, e32, m1, ta, ma
    vle32.v v1, (t1)
    vle32.v v2, (t2)
    vse32.v v1, (t0)
    vse32.v v2, (t6)

    slli    t5, t4, 2               # t5 = vl * 4 bytes
    add     t1, t1, t5
    add     t2, t2, t5
    add     t0, t0, t5
    add     t6, t6, t5
    sub     t3, t3, t4
    bnez    t3, .Lvo_copy

.Lvo_done:
    ret

#-------------------------------------------------------------------------------
# vTransform: the vectorized butterfly stages (same as your original)
# a0=&real, a1=&imag, a2=N, a3=+1 for FFT / –1 for IFFT
#-------------------------------------------------------------------------------
vTransform:
    # In-place radix-2 butterfly passes over a bit-reversed input.
    #
    # In:    a0 = &real[0], a1 = &imag[0], a2 = N,
    #        a3 = +1 for FFT / -1 for IFFT (only its sign is used: it flips
    #             the sign of the W_imag values)
    # Reads: logsize (number of stages; must have been set, e.g. by setlogN),
    #        W_real / W_imag (indexed with k in 0..N/2-1)
    # Out:   real[]/imag[] transformed in place; a0-a3, s0, ra restored.
    # Clobb: t0-t4, t6, a4-a6, ft0, v0, v4-v31, vl/vtype, plus whatever
    #        vOrdina clobbers.
    # Stack: 32 bytes (keeps sp 16-byte aligned across the call).
    #
    # Vector register groups (LMUL=4) inside the butterfly:
    #   v20 = byte offset of x[i]        v24 = twiddle offset, then x[i+n] offset
    #   v4  = Wr, later xr[i]/bottom re  v28 = i, then Wi, later xi[i]/bottom im
    #   v8  = xr[i+n], later top re      v12 = xi[i+n], later t_im
    #   v16 = t_re, later top im         v0  = lane mask (bit n of i is clear)
    addi    sp, sp, -32
    sw      ra,  0(sp)
    sw      a0,  4(sp)
    sw      a1,  8(sp)
    sw      a2, 12(sp)
    sw      a3, 16(sp)
    sw      s0, 20(sp)

    # 1. bit-reversal
    call    vOrdina

    # Argument registers are caller-saved: reload rather than trust the callee.
    lw      a0,  4(sp)
    lw      a1,  8(sp)
    lw      a2, 12(sp)
    lw      a3, 16(sp)

    # 2. direction flag as a float; only its sign bit matters (vfsgnjx below)
    fcvt.s.w ft0, a3

    # 3. pointers to twiddles
    la      t1, W_real
    la      t2, W_imag

    # 4. stage state
    li      a5, 1                   # n = butterfly span, doubles every stage
    srai    a4, a2, 1               # a = twiddle stride, starts at N/2
    addi    s0, a4, -1              # s0 = N/2 - 1 (twiddle index mask)
    li      t3, 0                   # stage counter
    lw      a3, logsize             # number of stages = log2(N)

    # -- Outer loop over stages --
.Lvt_stage:
    bge     t3, a3, .Lvt_done
    slli    a6, a5, 2               # a6 = n*4: byte distance x[i] -> x[i+n]
    li      t4, 0                   # t4 = first element index of this chunk

    # -- Inner loop: one vector chunk of indices i = t4 .. t4+vl-1 --
.Lvt_chunk:
    bge     t4, a2, .Lvt_next_stage
    sub     t6, a2, t4              # elements left in this stage
    vsetvli t0, t6, e32, m4, ta, ma # t0 = vl for this chunk

    # i for every lane; a lane is active when bit n of i is clear, i.e. when
    # i is the upper input of a butterfly pair (i, i+n).
    vid.v   v28                     # 0 .. vl-1
    vadd.vx v28, v28, t4            # absolute index i
    vsll.vi v20, v28, 2             # byte offset of x[i]
    vand.vx v4, v20, a6             # (i*4) & (n*4)
    vmseq.vi v0, v4, 0              # mask = ((i & n) == 0)

    # k = ((i * a) & (N/2-1)) << 2  -> byte offset into the twiddle tables
    vmul.vx v24, v28, a4
    vand.vx v24, v24, s0, v0.t
    vsll.vi v24, v24, 2, v0.t

    # load W_real, W_imag; flip the sign of W_imag when a3 was negative
    vloxei32.v v4,  (t1), v24, v0.t
    vloxei32.v v28, (t2), v24, v0.t
    vfsgnjx.vf v28, v28, ft0, v0.t

    # load x[i+n]; v24 keeps the x[i+n] offsets until the final stores
    vadd.vx v24, v20, a6, v0.t
    vloxei32.v v8,  (a0), v24, v0.t # xr[i+n]
    vloxei32.v v12, (a1), v24, v0.t # xi[i+n]

    # t = w * x[i+n]
    vfmul.vv   v16, v4,  v8,  v0.t  # wr*xr
    vfnmsac.vv v16, v28, v12, v0.t  # t_re = wr*xr - wi*xi
    vfmul.vv   v12, v4,  v12, v0.t  # wr*xi
    vfmacc.vv  v12, v28, v8,  v0.t  # t_im = wr*xi + wi*xr

    # load x[i]
    vloxei32.v v4,  (a0), v20, v0.t # xr[i]
    vloxei32.v v28, (a1), v20, v0.t # xi[i]

    # butterfly: top = x[i] + t, bottom = x[i] - t
    vfadd.vv v8,  v4,  v16, v0.t    # top re
    vfsub.vv v4,  v4,  v16, v0.t    # bottom re
    vfadd.vv v16, v28, v12, v0.t    # top im
    vfsub.vv v28, v28, v12, v0.t    # bottom im

    # store back: top -> x[i], bottom -> x[i+n]
    vsoxei32.v v8,  (a0), v20, v0.t
    vsoxei32.v v16, (a1), v20, v0.t
    vsoxei32.v v4,  (a0), v24, v0.t
    vsoxei32.v v28, (a1), v24, v0.t

    add     t4, t4, t0              # advance by the elements just covered
    j       .Lvt_chunk

.Lvt_next_stage:
    slli    a5, a5, 1               # n <<= 1
    srai    a4, a4, 1               # a >>= 1
    addi    t3, t3, 1
    j       .Lvt_stage

.Lvt_done:
    # restore
    lw      ra,  0(sp)
    lw      a0,  4(sp)
    lw      a1,  8(sp)
    lw      a2, 12(sp)
    lw      a3, 16(sp)
    lw      s0, 20(sp)
    addi    sp, sp, 32
    ret

#-------------------------------------------------------------------------------
# vFFT / vIFFT wrappers
#-------------------------------------------------------------------------------
vFFT:
    # Forward transform wrapper: calls vTransform with a3 = +1.
    # In:    a0 = &real[0], a1 = &imag[0], a2 = N
    # Clobb: a3, plus whatever vTransform clobbers
    # Stack: 16 bytes, so sp stays 16-byte aligned across the call.
    addi    sp, sp, -16
    sw      ra, 0(sp)
    li      a3, 1                   # forward
    call    vTransform
    lw      ra, 0(sp)
    addi    sp, sp, 16
    ret

vIFFT:
    # Inverse transform wrapper: calls vTransform with a3 = -1.
    # In:    a0 = &real[0], a1 = &imag[0], a2 = N
    # Clobb: a3, plus whatever vTransform clobbers
    # Stack: 16 bytes, so sp stays 16-byte aligned across the call.
    # The result is NOT divided by N here; a caller that needs the scaled
    # inverse has to apply the 1/N factor itself.
    addi    sp, sp, -16
    sw      ra, 0(sp)
    li      a3, -1                  # inverse
    call    vTransform
    # (optional) divide by N in-place here...
    lw      ra, 0(sp)
    addi    sp, sp, 16
    ret

#-------------------------------------------------------------------------------
# print: simple vector dump (as in your original)
#-------------------------------------------------------------------------------
print:
    # Walks two parallel arrays of `size` 32-bit elements, loading each chunk
    # into vector registers (v0 from the a0 array, v8 from the a1 array).
    # Nothing is written to memory or to any output device here; the loaded
    # values are only observable in the register file (e.g. in a simulator
    # trace).
    #
    # In:    a0 = base of first array, a1 = base of second array
    # Out:   a0, a1 and ra are never written
    # Clobb: a2 (= size), t0-t4, v0, v8, vl/vtype
    # Stack: none (leaf routine)
    lw      a2, size                # a2 = element count
    blez    a2, .Lpr_done           # nothing to walk
    mv      t1, a0                  # t1 = cursor in first array
    mv      t2, a1                  # t2 = cursor in second array
    mv      t0, a2                  # t0 = elements remaining (> 0)

.Lpr_loop:
    # Re-issue vsetvli each round so a final partial chunk never reads past
    # the end of the arrays.
    vsetvli t3, t0, e32, m1, ta, ma # t3 = vl for this chunk
    vle32.v v0, (t1)
    vle32.v v8, (t2)
    slli    t4, t3, 2               # t4 = vl * 4 bytes
    add     t1, t1, t4
    add     t2, t2, t4
    sub     t0, t0, t3
    bnez    t0, .Lpr_loop

.Lpr_done:
    ret

_finish:
    # Terminal state: store the byte 0xff to address 0xd0580000, then repeat
    # forever.  Never returns.
    # NOTE(review): the address looks like a testbench/simulator mailbox whose
    # write ends the run - confirm against the target platform.
    # gp is x3 and t0 is x5; both are overwritten here.
    li      gp, 0xd0580000          # gp = target address
    li      t0, 0xff                # t0 = byte value to store
    sb      t0, 0(gp)
    j       _finish                 # unconditional: loop back and store again

#-------------------------------------------------------------------------------
# Data Section
#-------------------------------------------------------------------------------
.section .data
.equ N, 1024                        # transform length (elements per array)
.equ N2, N/2
.equ STACK_SIZE, 4096               # bytes reserved for the stack

.balign 4
size:    .word N                    # element count read by _start and print
logsize: .word 0                    # written by setlogN, read by vTransform

# -- Working arrays: N single-precision values each, initialised to 0.0 --
real:
.rept N
.float 0.0
.endr

imag:
.rept N
.float 0.0
.endr

# -- Scratch copies used by vOrdina during the bit-reversal reorder --
real_temp:
.rept N
.float 0.0
.endr

imag_temp:
.rept N
.float 0.0
.endr

# -- Precomputed twiddle tables (generated offline) --
# vTransform indexes each table with k in 0..N/2-1.
W_real:
.include "W_real.inc"

W_imag:
.include "W_imag.inc"

# -- Precomputed bit-reversal indices (0..N-1) --
bitrev:
.include "bitrev.inc"

# -- Stack --
# _start loads the address of STACK into sp and every routine pushes by
# subtracting from sp, so STACK must label the HIGH end of the reserved area.
# With the label at the low end, the first frames would overwrite the tail of
# the bitrev table.  The area is 16-byte aligned and STACK_SIZE is a multiple
# of 16, so the initial sp is 16-byte aligned as well.
.balign 16
.space STACK_SIZE
STACK:

You might also like