Skip to content
Snippets Groups Projects
Verified Commit 679969f2 authored by Dorian Stoll's avatar Dorian Stoll
Browse files

nas-ft: julia: Initial port

parent c6ec6edb
No related branches found
No related tags found
No related merge requests found
name = "ft"
uuid = "6c99acc8-2c4b-4fe5-977d-7192d3588e0c"
authors = ["Dorian Stoll <dorian.stoll@uni-potsdam.de>"]
version = "0.1.0"
[deps]
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
#!/usr/bin/env julia
using FLoops
using Printf
using StaticArrays
# Class C
const NX::Int = 512
const NY::Int = 512
const NZ::Int = 512
const MAXDIM::Int = 512
const NITER_DEFAULT::Int = 20
# Total number of grid points with padding
const NXP::Int = NX + 1
const NTOTALP::Int = NXP * NY * NZ
const NTOTALF::Float64 = Float64(NXP) * NY * NZ
# If processor array is 1x1 -> 0D grid decomposition
# Cache blocking params. These values are good for most RISC processors.
# FFT parameters:
# fftblock controls how many ffts are done at a time.
# The default is appropriate for most cache-based machines
# On vector machines, the FFT can be vectorized with vector
# length equal to the block size, so the block size should
# be as large as possible. This is the size of the smallest
# dimension of the problem: 128 for class A, 256 for class B and
# 512 for class C.
const FFTBLOCK_DEFAULT::Int = 32
const FFTBLOCKPAD_DEFAULT::Int = FFTBLOCK_DEFAULT + 2
# other stuff
const SEED::Int = 314159265
const ALPHA::Float64 = 1E-6
global debug::Bool
global niter::Int
global fftblock::Int
global fftblockpad::Int
global u::Array{ComplexF64,1}
global u0::Array{ComplexF64,3}
global u1::Array{ComplexF64,3}
global twiddle::Array{Float64,3}
function ilog2(n::Int)::Int
if n == 1
return 0
end
lg::Int = 1
nn::Int = 2
while nn < n
nn *= 2
lg += 1
end
return lg
end
function alloc_space()
global u = zeros(ComplexF64, NXP)
global u0 = zeros(ComplexF64, (NXP, NY, NZ))
global u1 = zeros(ComplexF64, (NXP, NY, NZ))
global twiddle = zeros(Float64, (NXP, NY, NZ))
end
function setup()
global debug = false
global niter = NITER_DEFAULT
@printf "\n"
@printf "\n"
@printf " NAS Parallel Benchmarks (NPB3.4-OMP) - FT Benchmark\n"
@printf "\n"
@printf " Size : %dx%dx%d\n" NX NY NZ
@printf " Iterations : %d\n" niter
@printf " Number of available threads : %d\n" Threads.nthreads()
@printf "\n"
# ---------------------------------------------------------------------
# Set up info for blocking of ffts and transposes. This improves
# performance on cache-based systems. Blocking involves
# working on a chunk of the problem at a time, taking chunks
# along the first, second, or third dimension.
#
# - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
# - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
#
# Since 1st dim is always in processor, we'll assume it's long enough
# (default blocking factor is 16 so min size for 1st dim is 16)
# The only case we have to worry about is cffts1 in a 2d decomposition.
# so the blocking factor should not be larger than the 2nd dimension.
# ---------------------------------------------------------------------
global fftblock = FFTBLOCK_DEFAULT
global fftblockpad = FFTBLOCKPAD_DEFAULT
if fftblock != FFTBLOCK_DEFAULT
fftblockpad = FFTBLOCK_DEFAULT + 3
end
end
# ---------------------------------------------------------------------
# compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
# for time evolution exponent.
# ---------------------------------------------------------------------
function compute_indexmap()
AP::Float64 = -4 * ALPHA * pi * pi
d1::Int = size(twiddle, 1) - 1
d2::Int = size(twiddle, 2)
d3::Int = size(twiddle, 3)
# ---------------------------------------------------------------------
# basically we want to convert the fortran indices
# 1 2 3 4 5 6 7 8
# to
# 0 1 2 3 -4 -3 -2 -1
# The following magic formula does the trick:
# mod(i-1+n/2, n) - n/2
# ---------------------------------------------------------------------
@floop for k = 1:d3, j = 1:d2
kk::Int = mod(k-1+d3/2, d3) - d3/2
kk2::Int = kk*kk
jj::Int = mod(j-1+d2/2, d2) - d2/2
kj2::Int = jj*jj+kk2
for i = 1:d1
ii::Int = mod(i-1+d1/2, d1) - d1/2
twiddle[i, j, k] = exp(AP * (ii*ii+kj2))
end
end
end
function compute_initial_conditions()
@ccall srand48(SEED::Clong)::Cvoid
for k in axes(u1, 3), j in axes(u1, 2), i in axes(u1, 1)
u1[i, j, k] = @ccall drand48()::Cdouble
end
end
function evolve()
# ---------------------------------------------------------------------
# evolve u0 -> u1 (t time steps) in fourier space
# ---------------------------------------------------------------------
d1::Int = size(u0, 1) - 1
d2::Int = size(u0, 2)
d3::Int = size(u0, 3)
@floop for k = 1:d3, j = 1:d2, i = 1:d1
u0[i, j, k] *= twiddle[i, j, k]
u1[i, j, k] = u0[i, j, k]
end
end
# ---------------------------------------------------------------------
# compute the roots-of-unity array that will be used for subsequent FFTs.
# ---------------------------------------------------------------------
function fft_init()
# ---------------------------------------------------------------------
# Initialize the U array with sines and cosines in a manner that permits
# stride one access at each FFT iteration.
# ---------------------------------------------------------------------
m::Int = ilog2(size(u0, 1) - 1)
u[1] = m
ku::Int = 2
ln::Int = 1
for j = 1:m
t::Float64 = pi / ln
for i = 0:ln-1
ti::Float64 = i * t
u[i + ku] = complex(cos(ti), sin(ti))
end
ku += ln
ln *= 2
end
end
function fft(dir::Int, x1::Array{ComplexF64, 3}, x2::Array{ComplexF64, 3})
# ---------------------------------------------------------------------
# note: args x1, x2 must be different arrays
# note: args for cfftsx are (direction, layout, xin, xout, scratch)
# xin/xout may be the same and it can be somewhat faster
# if they are
# ---------------------------------------------------------------------
if dir == 1
cffts1(1, x1, x1)
cffts2(1, x1, x1)
cffts3(1, x1, x2)
else
cffts3(-1, x1, x1)
cffts2(-1, x1, x1)
cffts1(-1, x1, x2)
end
end
function cffts1(is::Int, x::Array{ComplexF64, 3}, xout::Array{ComplexF64, 3})
d1::Int = size(x, 1) - 1
d2::Int = size(x, 2)
d3::Int = size(x, 3)
logd1::Int = ilog2(d1)
@floop for k = 1:d3, jn = 0:d2/fftblock - 1
@init y1 = zeros(ComplexF64, (fftblockpad, MAXDIM))
@init y2 = zeros(ComplexF64, (fftblockpad, MAXDIM))
jj::Int = jn * fftblock
for j = 1:fftblock, i = 1:d1
y1[j, i] = x[i, j+jj, k]
end
cfftz(is, logd1, d1, y1, y2)
for j = 1:fftblock, i = 1:d1
xout[i, j+jj, k] = y1[j, i]
end
end
end
function cffts2(is::Int, x::Array{ComplexF64, 3}, xout::Array{ComplexF64, 3})
d1::Int = size(x, 1) - 1
d2::Int = size(x, 2)
d3::Int = size(x, 3)
logd2::Int = ilog2(d2)
@floop for k = 1:d3, in = 0:d1/fftblock - 1
@init y1 = zeros(ComplexF64, (fftblockpad, MAXDIM))
@init y2 = zeros(ComplexF64, (fftblockpad, MAXDIM))
ii::Int = in * fftblock
for j = 1:d2, i = 1:fftblock
y1[i, j] = x[i+ii, j, k]
end
cfftz(is, logd2, d2, y1, y2)
for j = 1:d2, i = 1:fftblock
xout[i+ii, j, k] = y1[i, j]
end
end
end
function cffts3(is::Int, x::Array{ComplexF64, 3}, xout::Array{ComplexF64, 3})
d1::Int = size(x, 1) - 1
d2::Int = size(x, 2)
d3::Int = size(x, 3)
logd3::Int = ilog2(d3)
@floop for j = 1:d2, in = 0:d1/fftblock - 1
@init y1 = zeros(ComplexF64, (fftblockpad, MAXDIM))
@init y2 = zeros(ComplexF64, (fftblockpad, MAXDIM))
ii::Int = in * fftblock
for k = 1:d3, i = 1:fftblock
y1[i, k] = x[i+ii, j, k]
end
cfftz(is, logd3, d3, y1, y2)
for k = 1:d3, i = 1:fftblock
xout[i+ii, j, k] = y1[i, k]
end
end
end
function cfftz(is::Int, m::Int, n::Int, x::Array{ComplexF64,2}, y::Array{ComplexF64,2})
# ---------------------------------------------------------------------
# Computes NY N-point complex-to-complex FFTs of X using an algorithm due
# to Swarztrauber. X is both the input and the output array, while Y is a
# scratch array. It is assumed that N = 2^M. Before calling CFFTZ to
# perform FFTs, the array U must be initialized by calling CFFTZ with IS
# set to 0 and M set to MX, where MX is the maximum value of M for any
# subsequent call.
# ---------------------------------------------------------------------
# ---------------------------------------------------------------------
# Check if input parameters are invalid.
# ---------------------------------------------------------------------
mx::Int = real(u[1])
if (is != 1 && is != -1) || m < 1 || m > mx
error(
@sprintf "CFFTZ: Either U has not been initialized, or else one of the input parameters is invalid %d %d %d" is m mx
)
end
# ---------------------------------------------------------------------
# Perform one variant of the Stockham FFT.
# ---------------------------------------------------------------------
for l = 1:2:m
fftz2(is, l, m, n, fftblock, x, y)
if l == m
# ---------------------------------------------------------------------
# Copy Y to X.
# ---------------------------------------------------------------------
for j = 1:n, i = 1:fftblock
x[i, j] = y[i, j]
end
continue
end
fftz2(is, l + 1, m, n, fftblock, y, x)
end
end
function fftz2(is::Int, l::Int, m::Int, n::Int, ny::Int, x::Array{ComplexF64,2}, y::Array{ComplexF64,2})
n1::Int = n / 2
lk::Int = 2 ^ (l - 1)
li::Int = 2 ^ (m - l)
lj::Int = 2 * lk
ku::Int = li + 1
for i = 0:li-1
i11::Int = i * lk + 1
i12::Int = i11 + n1
i21::Int = i * lj + 1
i22::Int = i21 + lk
u1::ComplexF64 = 0
if is >= 1
u1 = u[ku+i]
else
u1 = conj(u[ku + i])
end
for k = 0:lk-1, j = 1:ny
x11::ComplexF64 = x[j,i11+k]
x21::ComplexF64 = x[j,i12+k]
y[j,i21+k] = x11 + x21
y[j,i22+k] = u1 * (x11 - x21)
end
end
end
function checksum(i::Int)
chk::ComplexF64 = 0
@floop for j = 1:1024
q::Int = mod(j, NX) + 1
r::Int = mod(3 * j, NY) + 1
s::Int = mod(5 * j, NZ) + 1
@reduce chk += u1[q, r, s]
end
chk /= NTOTALF
@printf " T = %d Checksum = %.10E %.10E\n" i real(chk) imag(chk)
end
function main()
alloc_space()
setup()
# ---------------------------------------------------------------------
# Run the entire problem once to make sure all data is touched.
# This reduces variable startup costs, which is important for such a
# short benchmark. The other NPB 2 implementations are similar.
# ---------------------------------------------------------------------
compute_indexmap()
compute_initial_conditions()
fft_init()
fft(1, u1, u0)
# ---------------------------------------------------------------------
# Start over from the beginning. Note that all operations must
# be timed, in contrast to other benchmarks.
# ---------------------------------------------------------------------
compute_indexmap()
compute_initial_conditions()
fft_init()
fft(1, u1, u0)
for iter = 1:niter
evolve()
fft(-1, u1, u1)
checksum(iter)
end
end
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment