nas-ft: julia: Initial port

679969f2 · Dorian Stoll · c6ec6edb · 679969f2 · 679969f2
Verified Commit 679969f2 authored 9 months ago by Dorian Stoll
--- a/src/benchmarks/nas-ft/julia/Project.toml
+++ b/src/benchmarks/nas-ft/julia/Project.toml
+name = "ft"
+uuid = "6c99acc8-2c4b-4fe5-977d-7192d3588e0c"
+authors = ["Dorian Stoll <dorian.stoll@uni-potsdam.de>"]
+version = "0.1.0"
+
+[deps]
+FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
--- a/src/benchmarks/nas-ft/julia/src/ft.jl
+++ b/src/benchmarks/nas-ft/julia/src/ft.jl
+#!/usr/bin/env julia
+
+using FLoops
+using Printf
+using StaticArrays
+
+# Class C
+const NX::Int = 512
+const NY::Int = 512
+const NZ::Int = 512
+const MAXDIM::Int = 512
+const NITER_DEFAULT::Int = 20
+
+# Total number of grid points with padding
+const NXP::Int = NX + 1
+const NTOTALP::Int = NXP * NY * NZ
+const NTOTALF::Float64 = Float64(NXP) * NY * NZ
+
+# If processor array is 1x1 -> 0D grid decomposition
+
+# Cache blocking params. These values are good for most RISC processors.
+# FFT parameters:
+#   fftblock controls how many ffts are done at a time.
+#   The default is appropriate for most cache-based machines
+#   On vector machines, the FFT can be vectorized with vector
+#   length equal to the block size, so the block size should
+#   be as large as possible. This is the size of the smallest
+#   dimension of the problem: 128 for class A, 256 for class B and
+#   512 for class C.
+
+const FFTBLOCK_DEFAULT::Int = 32
+const FFTBLOCKPAD_DEFAULT::Int = FFTBLOCK_DEFAULT + 2
+
+# other stuff
+const SEED::Int = 314159265
+const ALPHA::Float64 = 1E-6
+
+global debug::Bool
+
+global niter::Int
+global fftblock::Int
+global fftblockpad::Int
+
+global u::Array{ComplexF64,1}
+global u0::Array{ComplexF64,3}
+global u1::Array{ComplexF64,3}
+global twiddle::Array{Float64,3}
+
+function ilog2(n::Int)::Int
+    if n == 1
+        return 0
+    end
+
+    lg::Int = 1
+    nn::Int = 2
+
+    while nn < n
+        nn *= 2
+        lg += 1
+    end
+
+    return lg
+end
+
+function alloc_space()
+    global u = zeros(ComplexF64, NXP)
+    global u0 = zeros(ComplexF64, (NXP, NY, NZ))
+    global u1 = zeros(ComplexF64, (NXP, NY, NZ))
+    global twiddle = zeros(Float64, (NXP, NY, NZ))
+end
+
+function setup()
+    global debug = false
+    global niter = NITER_DEFAULT
+
+    @printf "\n"
+    @printf "\n"
+    @printf " NAS Parallel Benchmarks (NPB3.4-OMP) - FT Benchmark\n"
+    @printf "\n"
+    @printf " Size                        : %dx%dx%d\n" NX NY NZ
+    @printf " Iterations                  : %d\n" niter
+    @printf " Number of available threads : %d\n" Threads.nthreads()
+    @printf "\n"
+
+    # ---------------------------------------------------------------------
+    # Set up info for blocking of ffts and transposes.  This improves
+    # performance on cache-based systems. Blocking involves
+    # working on a chunk of the problem at a time, taking chunks
+    # along the first, second, or third dimension.
+    #
+    # - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+    # - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+    #
+    # Since 1st dim is always in processor, we'll assume it's long enough
+    # (default blocking factor is 16 so min size for 1st dim is 16)
+    # The only case we have to worry about is cffts1 in a 2d decomposition.
+    # so the blocking factor should not be larger than the 2nd dimension.
+    # ---------------------------------------------------------------------
+
+    global fftblock = FFTBLOCK_DEFAULT
+    global fftblockpad = FFTBLOCKPAD_DEFAULT
+
+    if fftblock != FFTBLOCK_DEFAULT
+        fftblockpad = FFTBLOCK_DEFAULT + 3
+    end
+end
+
+# ---------------------------------------------------------------------
+#  compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+#  for time evolution exponent.
+# ---------------------------------------------------------------------
+function compute_indexmap()
+    AP::Float64 = -4 * ALPHA * pi * pi
+
+    d1::Int = size(twiddle, 1) - 1
+    d2::Int = size(twiddle, 2)
+    d3::Int = size(twiddle, 3)
+
+    # ---------------------------------------------------------------------
+    # basically we want to convert the fortran indices
+    #   1 2 3 4 5 6 7 8
+    # to
+    #   0 1 2 3 -4 -3 -2 -1
+    # The following magic formula does the trick:
+    # mod(i-1+n/2, n) - n/2
+    # ---------------------------------------------------------------------
+
+    @floop for k = 1:d3, j = 1:d2
+        kk::Int = mod(k-1+d3/2, d3) - d3/2
+        kk2::Int = kk*kk
+        jj::Int = mod(j-1+d2/2, d2) - d2/2
+        kj2::Int = jj*jj+kk2
+
+        for i = 1:d1
+            ii::Int = mod(i-1+d1/2, d1) - d1/2
+            twiddle[i, j, k] = exp(AP * (ii*ii+kj2))
+        end
+    end
+end
+
+function compute_initial_conditions()
+    @ccall srand48(SEED::Clong)::Cvoid
+
+    for k in axes(u1, 3), j in axes(u1, 2), i in axes(u1, 1)
+        u1[i, j, k] = @ccall drand48()::Cdouble
+    end
+end
+
+function evolve()
+    # ---------------------------------------------------------------------
+    #  evolve u0 -> u1 (t time steps) in fourier space
+    # ---------------------------------------------------------------------
+
+    d1::Int = size(u0, 1) - 1
+    d2::Int = size(u0, 2)
+    d3::Int = size(u0, 3)
+
+    @floop for k = 1:d3, j = 1:d2, i = 1:d1
+        u0[i, j, k] *= twiddle[i, j, k]
+        u1[i, j, k] = u0[i, j, k]
+    end
+end
+
+# ---------------------------------------------------------------------
+#  compute the roots-of-unity array that will be used for subsequent FFTs.
+# ---------------------------------------------------------------------
+function fft_init()
+    # ---------------------------------------------------------------------
+    #   Initialize the U array with sines and cosines in a manner that permits
+    #   stride one access at each FFT iteration.
+    # ---------------------------------------------------------------------
+
+    m::Int = ilog2(size(u0, 1) - 1)
+    u[1] = m
+
+    ku::Int = 2
+    ln::Int = 1
+
+    for j = 1:m
+        t::Float64 = pi / ln
+
+        for i = 0:ln-1
+            ti::Float64 = i * t
+            u[i + ku] = complex(cos(ti), sin(ti))
+        end
+
+        ku += ln
+        ln *= 2
+    end
+end
+
+function fft(dir::Int, x1::Array{ComplexF64, 3}, x2::Array{ComplexF64, 3})
+    # ---------------------------------------------------------------------
+    #  note: args x1, x2 must be different arrays
+    #  note: args for cfftsx are (direction, layout, xin, xout, scratch)
+    #        xin/xout may be the same and it can be somewhat faster
+    #        if they are
+    # ---------------------------------------------------------------------
+
+    if dir == 1
+        cffts1(1, x1, x1)
+        cffts2(1, x1, x1)
+        cffts3(1, x1, x2)
+    else
+        cffts3(-1, x1, x1)
+        cffts2(-1, x1, x1)
+        cffts1(-1, x1, x2)
+    end
+end
+
+function cffts1(is::Int, x::Array{ComplexF64, 3}, xout::Array{ComplexF64, 3})
+    d1::Int = size(x, 1) - 1
+    d2::Int = size(x, 2)
+    d3::Int = size(x, 3)
+
+    logd1::Int = ilog2(d1)
+
+    @floop for k = 1:d3, jn = 0:d2/fftblock - 1
+        @init y1 = zeros(ComplexF64, (fftblockpad, MAXDIM))
+        @init y2 = zeros(ComplexF64, (fftblockpad, MAXDIM))
+
+        jj::Int = jn * fftblock
+
+        for j = 1:fftblock, i = 1:d1
+            y1[j, i] = x[i, j+jj, k]
+        end
+
+        cfftz(is, logd1, d1, y1, y2)
+
+        for j = 1:fftblock, i = 1:d1
+            xout[i, j+jj, k] = y1[j, i]
+        end
+    end
+end
+
+function cffts2(is::Int, x::Array{ComplexF64, 3}, xout::Array{ComplexF64, 3})
+    d1::Int = size(x, 1) - 1
+    d2::Int = size(x, 2)
+    d3::Int = size(x, 3)
+
+    logd2::Int = ilog2(d2)
+
+    @floop for k = 1:d3, in = 0:d1/fftblock - 1
+        @init y1 = zeros(ComplexF64, (fftblockpad, MAXDIM))
+        @init y2 = zeros(ComplexF64, (fftblockpad, MAXDIM))
+
+        ii::Int = in * fftblock
+
+        for j = 1:d2, i = 1:fftblock
+            y1[i, j] = x[i+ii, j, k]
+        end
+
+        cfftz(is, logd2, d2, y1, y2)
+
+        for j = 1:d2, i = 1:fftblock
+            xout[i+ii, j, k] = y1[i, j]
+        end
+    end
+end
+
+function cffts3(is::Int, x::Array{ComplexF64, 3}, xout::Array{ComplexF64, 3})
+    d1::Int = size(x, 1) - 1
+    d2::Int = size(x, 2)
+    d3::Int = size(x, 3)
+
+    logd3::Int = ilog2(d3)
+
+    @floop for j = 1:d2, in = 0:d1/fftblock - 1
+        @init y1 = zeros(ComplexF64, (fftblockpad, MAXDIM))
+        @init y2 = zeros(ComplexF64, (fftblockpad, MAXDIM))
+
+        ii::Int = in * fftblock
+
+        for k = 1:d3, i = 1:fftblock
+            y1[i, k] = x[i+ii, j, k]
+        end
+
+        cfftz(is, logd3, d3, y1, y2)
+
+        for k = 1:d3, i = 1:fftblock
+            xout[i+ii, j, k] = y1[i, k]
+        end
+    end
+end
+
+function cfftz(is::Int, m::Int, n::Int, x::Array{ComplexF64,2}, y::Array{ComplexF64,2})
+    # ---------------------------------------------------------------------
+    #    Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+    #    to Swarztrauber.  X is both the input and the output array, while Y is a
+    #    scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
+    #    perform FFTs, the array U must be initialized by calling CFFTZ with IS
+    #    set to 0 and M set to MX, where MX is the maximum value of M for any
+    #    subsequent call.
+    # ---------------------------------------------------------------------
+
+    # ---------------------------------------------------------------------
+    #    Check if input parameters are invalid.
+    # ---------------------------------------------------------------------
+
+    mx::Int = real(u[1])
+
+    if (is != 1 && is != -1) || m < 1 || m > mx
+        error(
+            @sprintf "CFFTZ: Either U has not been initialized, or else one of the input parameters is invalid %d %d %d" is m mx
+        )
+    end
+
+    # ---------------------------------------------------------------------
+    #    Perform one variant of the Stockham FFT.
+    # ---------------------------------------------------------------------
+
+    for l = 1:2:m
+        fftz2(is, l, m, n, fftblock, x, y)
+
+        if l == m
+            # ---------------------------------------------------------------------
+            #   Copy Y to X.
+            # ---------------------------------------------------------------------
+
+            for j = 1:n, i = 1:fftblock
+                x[i, j] = y[i, j]
+            end
+
+            continue
+        end
+
+        fftz2(is, l + 1, m, n, fftblock, y, x)
+    end
+end
+
+function fftz2(is::Int, l::Int, m::Int, n::Int, ny::Int, x::Array{ComplexF64,2}, y::Array{ComplexF64,2})
+    n1::Int = n / 2
+    lk::Int = 2 ^ (l - 1)
+    li::Int = 2 ^ (m - l)
+    lj::Int = 2 * lk
+    ku::Int = li + 1
+
+    for i = 0:li-1
+        i11::Int = i * lk + 1
+        i12::Int = i11 + n1
+        i21::Int = i * lj + 1
+        i22::Int = i21 + lk
+
+        u1::ComplexF64 = 0
+
+        if is >= 1
+            u1 = u[ku+i]
+        else
+            u1 = conj(u[ku + i])
+        end
+
+        for k = 0:lk-1, j = 1:ny
+            x11::ComplexF64 = x[j,i11+k]
+            x21::ComplexF64 = x[j,i12+k]
+
+            y[j,i21+k] = x11 + x21
+            y[j,i22+k] = u1 * (x11 - x21)
+        end
+    end
+end
+
+function checksum(i::Int)
+    chk::ComplexF64 = 0
+
+    @floop for j = 1:1024
+        q::Int = mod(j, NX) + 1
+        r::Int = mod(3 * j, NY) + 1
+        s::Int = mod(5 * j, NZ) + 1
+
+        @reduce chk += u1[q, r, s]
+    end
+
+    chk /= NTOTALF
+
+    @printf " T = %d     Checksum = %.10E %.10E\n" i real(chk) imag(chk)
+end
+
+function main()
+    alloc_space()
+    setup()
+
+    # ---------------------------------------------------------------------
+    # Run the entire problem once to make sure all data is touched.
+    # This reduces variable startup costs, which is important for such a
+    # short benchmark. The other NPB 2 implementations are similar.
+    # ---------------------------------------------------------------------
+    compute_indexmap()
+    compute_initial_conditions()
+    fft_init()
+    fft(1, u1, u0)
+
+    # ---------------------------------------------------------------------
+    # Start over from the beginning. Note that all operations must
+    # be timed, in contrast to other benchmarks.
+    # ---------------------------------------------------------------------
+    compute_indexmap()
+    compute_initial_conditions()
+    fft_init()
+    fft(1, u1, u0)
+
+    for iter = 1:niter
+        evolve()
+        fft(-1, u1, u1)
+        checksum(iter)
+    end
+end
+
+main()