# FFT Full
.text
.global _start

#-------------------------------------------------------------------------------
# _start: program entry point
#   Sets up the stack pointer, runs the forward FFT over real[]/imag[]
#   (element count read from the word at `size`), then parks in _finish.
#-------------------------------------------------------------------------------
_start:
        # The routines in this file allocate frames with negative sp
        # adjustments, i.e. the stack grows downward. STACK labels the LOWEST
        # address of the 4096-byte reserved region, so sp has to start at its
        # high end; starting at STACK itself would let the first push overwrite
        # whatever data precedes the region.
        # 4096 must match the `.space` size reserved at STACK.
        la      sp, STACK + 4096
        la      ra, _finish             # fallback return address

        # Run the FFT
        la      a0, real                # a0 = &real[0]
        la      a1, imag                # a1 = &imag[0]
        lw      a2, size                # a2 = N
        call    vFFT

        # Done: hang forever
        j       _finish
#-------------------------------------------------------------------------------
# setlogN: store (31 - clz(a0)) into the word at `logsize`
#   In:      a0 = value to take the logarithm of
#   Out:     logsize = 31 - clz(a0), i.e. floor(log2(a0)) for a0 > 0 when
#            registers are 32 bits wide
#   Clobber: t0 (holds &logsize on return), t1 (holds the stored value)
#   Needs the `clz` instruction to be available on the target.
#-------------------------------------------------------------------------------
setlogN:
        li      t1, 31
        clz     t0, a0                  # t0 = number of leading zero bits
        sub     t1, t1, t0              # t1 = 31 - clz(a0)
        la      t0, logsize
        sw      t1, 0(t0)               # logsize = t1
        ret
#-------------------------------------------------------------------------------
# vOrdina: in-place bit-reversal reordering of real[] and imag[]
#   In:      a0 = &real[0], a1 = &imag[0]
#            a2 = N, the element count (must be a power of two)
#   Out:     for every index i, element i and element bitrev(i) are exchanged
#            in both arrays (bitrev reverses the low log2(N) bits of i)
#   Keeps:   a0, a1, a2, ra, sp  (leaf routine, no stack frame needed)
#   Clobber: t0-t5
#
#   j = bitrev(i) is maintained incrementally: adding 1 to i is mirrored by a
#   "reversed carry" in j that starts at the top index bit (N/2) and ripples
#   toward bit 0. Elements are 32-bit words and are moved as raw bit patterns.
#-------------------------------------------------------------------------------
vOrdina:
        li      t0, 0                   # t0 = i
        li      t1, 0                   # t1 = j = bitrev(i)

.LvOrdina_next:
        bgeu    t0, a2, .LvOrdina_done  # all N indices visited
        bgeu    t0, t1, .LvOrdina_step  # swap each pair once: only when i < j

        # exchange real[i] <-> real[j]
        slli    t2, t0, 2
        add     t2, t2, a0              # t2 = &real[i]
        slli    t3, t1, 2
        add     t3, t3, a0              # t3 = &real[j]
        lw      t4, 0(t2)
        lw      t5, 0(t3)
        sw      t5, 0(t2)
        sw      t4, 0(t3)

        # exchange imag[i] <-> imag[j] (same offsets, other base)
        sub     t2, t2, a0
        add     t2, t2, a1              # t2 = &imag[i]
        sub     t3, t3, a0
        add     t3, t3, a1              # t3 = &imag[j]
        lw      t4, 0(t2)
        lw      t5, 0(t3)
        sw      t5, 0(t2)
        sw      t4, 0(t3)

.LvOrdina_step:
        # bit-reversed increment of j
        srli    t2, a2, 1               # m = N/2, the top bit of the index range
.LvOrdina_carry:
        and     t3, t1, t2
        beqz    t3, .LvOrdina_set       # bit clear (or m exhausted): carry stops
        xor     t1, t1, t2              # bit set: clear it, carry to next lower bit
        srli    t2, t2, 1
        j       .LvOrdina_carry
.LvOrdina_set:
        or      t1, t1, t2              # set the bit where the carry landed
        addi    t0, t0, 1               # i++
        j       .LvOrdina_next

.LvOrdina_done:
        ret
#-------------------------------------------------------------------------------
# vTransform: bit-reversal followed by the vectorized butterfly stages
#   In:  a0 = &real[0], a1 = &imag[0], a2 = N, a3 = +1 for FFT / -1 for IFFT
#   Frame: 40 bytes; ra, a0-a3 at offsets 0..16, s0 at offset 32
#          (offsets 20..28 are not used by the visible code).
#
#   NOTE(review): as shown, this routine does not look complete:
#     - `j L1` targets a label that is not defined in the visible source, and
#       nothing visible branches to Lbody, L5 or Lend (so no loop exit and the
#       restore sequence at Lend is never reached);
#     - no vsetvli is executed in this routine before the vector instructions;
#     - the step comments jump from "1." to "3.".
#   Confirm against the complete original before relying on it.
#   NOTE(review): the 40-byte frame is not a multiple of 16 bytes.
#-------------------------------------------------------------------------------
vTransform:
addi sp, sp, -40
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
sw a2, 12(sp)
sw a3, 16(sp)
sw s0, 32(sp)
# 1. bit-reversal
# a0, a1 and a2 are used below without being reloaded, so vOrdina has to
# return with them intact.
call vOrdina
# 3. pointers to twiddles
# NOTE(review): W_real is not defined in the visible data section (only W_imag
# is); t1/t2 are also not read again by the visible code below.
la t1, W_real
la t2, W_imag
li a5, 1 # n = 1
srai a4, a2, 1 # a = N/2
li t3, 0 # stage counter
# NOTE(review): this load overwrites the direction flag passed in a3. logsize
# is initialised to 0 and only written by setlogN, which no visible code calls.
lw a3, logsize # a3 = logsize (intended: #stages = log2(N))
# NOTE(review): t0 is not written by this routine before this point;
# presumably it was meant to hold vl from a vsetvli - confirm.
slli t5, t0, 2 # bytes per vector chunk
addi s0, a4, -1 # mask = N/2 - 1
Lbody:
# mask = !(i & n): select lanes whose index i has the bit for n clear
slli a6, a5, 2 # a6 = n*4
vid.v v28 # v28 = element indices 0, 1, 2, ...
vsll.vi v20, v28, 2 # byte offsets i*4
vand.vx v0, v20, a6
vmseq.vx v0, v0, zero # v0 = lanes where (i*4 & n*4) == 0
# load x[i+n] (real part into v8, imaginary part into v12)
vadd.vx v16, v20, a6, v0.t
vloxei32.v v8, 0(a0), v16, v0.t
vloxei32.v v12, 0(a1), v16, v0.t
# compute t = w * x[i+n]:
#   v16 = v4*v8  - v28*v12   (real part)
#   v12 = v4*v12 + v28*v8    (imaginary part)
# NOTE(review): v4 is not written before this point in the visible code and
# v28 still holds the lane indices from vid.v, so no twiddle values are
# actually loaded here. The first vfmul also overwrites the i+n byte offsets
# that were held in v16.
vfmul.vv v16, v4, v8, v0.t
vfnmsac.vv v16, v28, v12, v0.t
vfmul.vv v12, v4, v12, v0.t
vfmacc.vv v12, v28, v8, v0.t
# load x[i] (real part into v4, imaginary part into v28)
vloxei32.v v4, 0(a0), v20, v0.t
vloxei32.v v28,0(a1), v20, v0.t
# store back
# NOTE(review): no add/subtract of t and x[i] is visible between the loads and
# these stores, and the last two stores use v16 as the index vector although it
# now holds the product computed above rather than byte offsets.
vsoxei32.v v8, 0(a0), v20, v0.t
vsoxei32.v v16, 0(a1), v20, v0.t
vsoxei32.v v4, 0(a0), v16, v0.t
vsoxei32.v v28, 0(a1), v16, v0.t
L5:
# advance to the next stage
slli a5, a5, 1 # n <<= 1
srai a4, a4, 1 # a >>= 1
addi t3, t3, 1
# NOTE(review): L1 is not defined in the visible source.
j L1
Lend:
# restore saved registers and release the frame
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
lw a2, 12(sp)
lw a3, 16(sp)
lw s0, 32(sp)
addi sp, sp, 40
jr ra
#-------------------------------------------------------------------------------
# vFFT / vIFFT wrappers
#-------------------------------------------------------------------------------
# vFFT: forward-transform wrapper
#   In:      a0 = &real[0], a1 = &imag[0], a2 = N
#   Action:  calls vTransform with a3 = +1
#   Clobber: a3, plus whatever vTransform clobbers
vFFT:
        addi    sp, sp, -16             # 16-byte frame keeps sp ABI-aligned; only ra is stored
        sw      ra, 0(sp)
        li      a3, 1                   # direction: forward
        call    vTransform
        lw      ra, 0(sp)
        addi    sp, sp, 16
        ret
# vIFFT: inverse-transform wrapper
#   In:      a0 = &real[0], a1 = &imag[0], a2 = N
#   Action:  calls vTransform with a3 = -1
#            No 1/N scaling is applied by this wrapper.
#   Clobber: a3, plus whatever vTransform clobbers
vIFFT:
        addi    sp, sp, -16             # 16-byte frame keeps sp ABI-aligned; only ra is stored
        sw      ra, 0(sp)
        li      a3, -1                  # direction: inverse
        call    vTransform
        # (optional) divide by N in-place here...
        lw      ra, 0(sp)
        addi    sp, sp, 16
        ret
#-------------------------------------------------------------------------------
# print: simple vector dump
#   Walks real[] and imag[] in vector-sized chunks, loading each chunk of
#   real[] into v0 and each chunk of imag[] into v8. Nothing else is done with
#   the loaded values (presumably they are observed through a simulator trace -
#   confirm).
#   In:      a0 = &real[0], a1 = &imag[0]; element count read from `size`
#   Out:     a0, a1 unchanged; a2 = size
#   Clobber: t0-t4, a2, v0, v8, vl/vtype
#
#   The vector length is re-requested every iteration from the number of
#   elements still remaining, so the final chunk never reads past the end of
#   the arrays even when the count is not a multiple of the hardware length.
#-------------------------------------------------------------------------------
print:
        mv      t1, a0                  # t1 = cursor into real[]
        mv      t2, a1                  # t2 = cursor into imag[]
        lw      a2, size                # a2 = N
        mv      t0, a2                  # t0 = elements remaining
PLoop:
        blez    t0, PEnd                # nothing (left) to dump
        vsetvli t3, t0, e32             # t3 = vl = elements handled this pass
        vle32.v v0, 0(t1)
        vle32.v v8, 0(t2)
        slli    t4, t3, 2               # bytes consumed = vl * 4
        add     t1, t1, t4
        add     t2, t2, t4
        sub     t0, t0, t3
        j       PLoop
PEnd:
        ret
# _finish: terminal loop; never returns.
#   Repeatedly stores the byte 0xff to address 0xd0580000.
#   NOTE(review): presumably a simulator/testbench "stop" location - confirm
#   against the target platform.
#   Clobber: gp (x3), t0 (x5)
_finish:
        li      gp, 0xd0580000          # gp is x3
        li      t0, 0xff                # t0 is x5
        sb      t0, 0(gp)
        j       _finish
#-------------------------------------------------------------------------------
# Data Section
#-------------------------------------------------------------------------------
.section .data
.equ N, 1024                            # number of complex samples
.equ N2, N/2

.p2align 2                              # word-align the data accessed with lw/sw
size:       .word N                     # element count handed to the FFT
logsize:    .word 0                     # written by setlogN

# Sample and scratch buffers: N 32-bit floats each, all initialised to 0.0
# (0.0 is the all-zero bit pattern, so zero-filled space is equivalent).
real:       .space 4*N
imag:       .space 4*N
real_temp:  .space 4*N
imag_temp:  .space 4*N

# NOTE(review): vTransform also takes the address of W_real, which is not
# defined in this section - confirm where it is supposed to come from.
W_imag:
.include "W_imag.inc"

# Stack region. STACK labels its lowest address; the stack grows downward, so
# the initial sp belongs at the high end (STACK + 4096).
.p2align 4                              # keep both ends of the region 16-byte aligned
STACK:
.space 4096