!-------------------------------------------------------------------------! ! ! ! N A S P A R A L L E L B E N C H M A R K S 3.4 ! ! ! ! O p e n M P V E R S I O N ! ! ! ! F T ! ! ! !-------------------------------------------------------------------------! ! ! ! This benchmark is an OpenMP version of the NPB FT code. ! ! It is described in NAS Technical Report 99-011. ! ! ! ! Permission to use, copy, distribute and modify this software ! ! for any purpose with or without fee is hereby granted. We ! ! request, however, that all derived work reference the NAS ! ! Parallel Benchmarks 3.4. This software is provided "as is" ! ! without express or implied warranty. ! ! ! ! Information on NPB 3.4, including the technical report, the ! ! original specifications, source code, results and information ! ! on how to submit new results, is available at: ! ! ! ! http://www.nas.nasa.gov/Software/NPB/ ! ! ! ! Send comments or suggestions to npb@nas.nasa.gov ! ! ! ! NAS Parallel Benchmarks Group ! ! NASA Ames Research Center ! ! Mail Stop: T27A-1 ! ! Moffett Field, CA 94035-1000 ! ! ! ! E-mail: npb@nas.nasa.gov ! ! Fax: (650) 604-3957 ! ! ! !-------------------------------------------------------------------------! !--------------------------------------------------------------------- ! ! Authors: D. Bailey ! W. Saphir ! H. Jin ! !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! FT benchmark !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- program ft !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! Module ft_fields defines main arrays (u0, u1, u2) in the problem !--------------------------------------------------------------------- use ft_data use ft_fields implicit none integer iter !--------------------------------------------------------------------- ! Run the entire problem once to make sure all data is touched. ! This reduces variable startup costs, which is important for such a ! short benchmark. The other NPB 2 implementations are similar. !--------------------------------------------------------------------- call setup() call alloc_space call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3)) call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) call fft_init (dims(1)) call fft(1, u1, u0) !--------------------------------------------------------------------- ! Start over from the beginning. Note that all operations must ! be timed, in contrast to other benchmarks. !--------------------------------------------------------------------- call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) call fft_init (dims(1)) call fft(1, u1, u0) do iter = 1, niter call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3)) ! call fft(-1, u1, u2) call fft(-1, u1, u1) ! call checksum(iter, u2, dims(1), dims(2), dims(3)) call checksum(iter, u1, dims(1), dims(2), dims(3)) end do end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine init_ui(u0, u1, twiddle, d1, d2, d3) !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! touch all the big data !--------------------------------------------------------------------- implicit none integer d1, d2, d3 double complex u0(d1+1,d2,d3) double complex u1(d1+1,d2,d3) double precision twiddle(d1+1,d2,d3) integer i, j, k !$omp parallel do default(shared) private(i,j,k) collapse(2) do k = 1, d3 do j = 1, d2 do i = 1, d1 u0(i,j,k) = 0.d0 u1(i,j,k) = 0.d0 twiddle(i,j,k) = 0.d0 end do end do end do return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine evolve(u0, u1, twiddle, d1, d2, d3) !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! evolve u0 -> u1 (t time steps) in fourier space !--------------------------------------------------------------------- use ft_data implicit none integer d1, d2, d3 double complex u0(d1+1,d2,d3) double complex u1(d1+1,d2,d3) double precision twiddle(d1+1,d2,d3) integer i, j, k !$omp parallel do default(shared) private(i,j,k) collapse(2) do k = 1, d3 do j = 1, d2 do i = 1, d1 u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k) u1(i,j,k) = u0(i,j,k) end do end do end do return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine compute_initial_conditions(u0, d1, d2, d3) !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! Fill in array u0 with initial conditions from ! random number generator !--------------------------------------------------------------------- use ft_data use libc_rand48 implicit none integer d1, d2, d3 double complex u0(d1+1, d2, d3) integer k, j, i call libc_srand48(seed) do k = 1, dims(3) do j = 1, dims(2) do i = 1, dims(1)+1 u0(i, j, k) = libc_drand48() end do end do end do return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine setup !--------------------------------------------------------------------- !--------------------------------------------------------------------- use ft_data use omp_lib implicit none character(len=32) :: arg if (command_argument_count() .ne. 4) then error stop "Invalid number of arguments" end if call get_command_argument(1, arg) read(arg, *) dims(1) call get_command_argument(2, arg) read(arg, *) dims(2) call get_command_argument(3, arg) read(arg, *) dims(3) call get_command_argument(4, arg) read(arg, *) niter nxp = dims(1) + 1 ntotalp = nxp * dims(2) * dims(3) maxdim = max(dims(1), max(dims(2), dims(3))) debug = .FALSE. write(*, 1000) write(*, 1001) dims(1), dims(2), dims(3) write(*, 1002) niter !$ write(*, 1003) omp_get_max_threads() write(*, *) 1000 format(//, ' NAS Parallel Benchmarks (NPB3.4-OMP) - FT Benchmark', /) 1001 format(' Size : ', i0, 'x', i0, 'x', i0) 1002 format(' Iterations : ', i0) 1003 format(' Number of available threads : ', i0) !--------------------------------------------------------------------- ! Set up info for blocking of ffts and transposes. This improves ! performance on cache-based systems. Blocking involves ! working on a chunk of the problem at a time, taking chunks ! along the first, second, or third dimension. ! ! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) ! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) ! Since 1st dim is always in processor, we'll assume it's long enough ! (default blocking factor is 16 so min size for 1st dim is 16) ! The only case we have to worry about is cffts1 in a 2d decomposition. ! so the blocking factor should not be larger than the 2nd dimension. !--------------------------------------------------------------------- fftblock = fftblock_default fftblockpad = fftblockpad_default return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine compute_indexmap(twiddle, d1, d2, d3) !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 ! for time evolution exponent. !--------------------------------------------------------------------- use ft_data implicit none integer d1, d2, d3 double precision twiddle(d1+1, d2, d3) integer i, j, k, kk, kk2, jj, kj2, ii double precision ap !--------------------------------------------------------------------- ! basically we want to convert the fortran indices ! 1 2 3 4 5 6 7 8 ! to ! 0 1 2 3 -4 -3 -2 -1 ! The following magic formula does the trick: ! mod(i-1+n/2, n) - n/2 !--------------------------------------------------------------------- ap = - 4.d0 * alpha * pi *pi !$omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii) & !$omp& collapse(2) do k = 1, dims(3) do j = 1, dims(2) kk = mod(k-1+dims(3)/2, dims(3)) - dims(3)/2 kk2 = kk*kk jj = mod(j-1+dims(2)/2, dims(2)) - dims(2)/2 kj2 = jj*jj+kk2 do i = 1, dims(1) ii = mod(i-1+dims(1)/2, dims(1)) - dims(1)/2 twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2)) end do end do end do return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine fft(dir, x1, x2) !--------------------------------------------------------------------- !--------------------------------------------------------------------- use ft_data implicit none integer dir double complex x1(ntotalp), x2(ntotalp) double complex y1(fftblockpad*maxdim), & & y2(fftblockpad*maxdim) !--------------------------------------------------------------------- ! note: args x1, x2 must be different arrays ! note: args for cfftsx are (direction, layout, xin, xout, scratch) ! xin/xout may be the same and it can be somewhat faster ! if they are !--------------------------------------------------------------------- if (dir .eq. 1) then call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2) else call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2) endif return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2) !--------------------------------------------------------------------- !--------------------------------------------------------------------- use ft_data implicit none integer is, d1, d2, d3, logd1 double complex x(d1+1,d2,d3) double complex xout(d1+1,d2,d3) double complex y1(fftblockpad, d1), y2(fftblockpad, d1) integer i, j, k, jj, jn logd1 = ilog2(d1) !$omp parallel do default(shared) private(i,j,k,jj,y1,y2,jn) & !$omp& shared(is,logd1,d1) collapse(2) do k = 1, d3 do jn = 0, d2/fftblock - 1 ! do jj = 0, d2 - fftblock, fftblock jj = jn*fftblock do j = 1, fftblock do i = 1, d1 y1(j,i) = x(i,j+jj,k) enddo enddo call cfftz (is, logd1, d1, y1, y2) do j = 1, fftblock do i = 1, d1 xout(i,j+jj,k) = y1(j,i) enddo enddo enddo enddo return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2) !--------------------------------------------------------------------- !--------------------------------------------------------------------- use ft_data implicit none integer is, d1, d2, d3, logd2 double complex x(d1+1,d2,d3) double complex xout(d1+1,d2,d3) double complex y1(fftblockpad, d2), y2(fftblockpad, d2) integer i, j, k, ii, in logd2 = ilog2(d2) !$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in) & !$omp& shared(is,logd2,d2) collapse(2) do k = 1, d3 do in = 0, d1/fftblock - 1 ! do ii = 0, d1 - fftblock, fftblock ii = in*fftblock do j = 1, d2 do i = 1, fftblock y1(i,j) = x(i+ii,j,k) enddo enddo call cfftz (is, logd2, d2, y1, y2) do j = 1, d2 do i = 1, fftblock xout(i+ii,j,k) = y1(i,j) enddo enddo enddo enddo return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2) !--------------------------------------------------------------------- !--------------------------------------------------------------------- use ft_data implicit none integer is, d1, d2, d3, logd3 double complex x(d1+1,d2,d3) double complex xout(d1+1,d2,d3) double complex y1(fftblockpad, d3), y2(fftblockpad, d3) integer i, j, k, ii, in logd3 = ilog2(d3) !$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in) & !$omp& shared(is) collapse(2) do j = 1, d2 do in = 0, d1/fftblock - 1 ! do ii = 0, d1 - fftblock, fftblock ii = in*fftblock do k = 1, d3 do i = 1, fftblock y1(i,k) = x(i+ii,j,k) enddo enddo call cfftz (is, logd3, d3, y1, y2) do k = 1, d3 do i = 1, fftblock xout(i+ii,j,k) = y1(i,k) enddo enddo enddo enddo return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine fft_init (n) !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! compute the roots-of-unity array that will be used for subsequent FFTs. !--------------------------------------------------------------------- use ft_data implicit none integer m,n,nu,ku,i,j,ln double precision t, ti !--------------------------------------------------------------------- ! Initialize the U array with sines and cosines in a manner that permits ! stride one access at each FFT iteration. !--------------------------------------------------------------------- nu = n m = ilog2(n) u(1) = m ku = 2 ln = 1 do j = 1, m t = pi / ln do i = 0, ln - 1 ti = i * t u(i+ku) = dcmplx (cos (ti), sin(ti)) enddo ku = ku + ln ln = 2 * ln enddo return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine cfftz (is, m, n, x, y) !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! Computes NY N-point complex-to-complex FFTs of X using an algorithm due ! to Swarztrauber. X is both the input and the output array, while Y is a ! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to ! perform FFTs, the array U must be initialized by calling CFFTZ with IS ! set to 0 and M set to MX, where MX is the maximum value of M for any ! subsequent call. !--------------------------------------------------------------------- use ft_data implicit none integer is,m,n,i,j,l,mx double complex x, y dimension x(fftblockpad,n), y(fftblockpad,n) !--------------------------------------------------------------------- ! Check if input parameters are invalid. !--------------------------------------------------------------------- mx = u(1) if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx) & & then write (*, 1) is, m, mx 1 format ('CFFTZ: Either U has not been initialized, or else'/ & & 'one of the input parameters is invalid', 3I5) stop endif !--------------------------------------------------------------------- ! Perform one variant of the Stockham FFT. !--------------------------------------------------------------------- do l = 1, m, 2 call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) if (l .eq. m) goto 160 call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) enddo goto 180 !--------------------------------------------------------------------- ! Copy Y to X. !--------------------------------------------------------------------- 160 do j = 1, n do i = 1, fftblock x(i,j) = y(i,j) enddo enddo 180 continue return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- ! Performs the L-th iteration of the second variant of the Stockham FFT. !--------------------------------------------------------------------- implicit none integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 double complex u,x,y,u1,x11,x21 dimension u(n), x(ny1,n), y(ny1,n) !--------------------------------------------------------------------- ! Set initial parameters. !--------------------------------------------------------------------- n1 = n / 2 lk = 2 ** (l - 1) li = 2 ** (m - l) lj = 2 * lk ku = li + 1 do i = 0, li - 1 i11 = i * lk + 1 i12 = i11 + n1 i21 = i * lj + 1 i22 = i21 + lk if (is .ge. 1) then u1 = u(ku+i) else u1 = dconjg (u(ku+i)) endif !--------------------------------------------------------------------- ! This loop is vectorizable. !--------------------------------------------------------------------- do k = 0, lk - 1 do j = 1, ny x11 = x(j,i11+k) x21 = x(j,i12+k) y(j,i21+k) = x11 + x21 y(j,i22+k) = u1 * (x11 - x21) enddo enddo enddo return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- !--------------------------------------------------------------------- integer function ilog2(n) !--------------------------------------------------------------------- !--------------------------------------------------------------------- implicit none integer n, nn, lg if (n .eq. 1) then ilog2=0 return endif lg = 1 nn = 2 do while (nn .lt. n) nn = nn*2 lg = lg+1 end do ilog2 = lg return end !--------------------------------------------------------------------- !--------------------------------------------------------------------- subroutine checksum(i, u1, d1, d2, d3) !--------------------------------------------------------------------- !--------------------------------------------------------------------- use ft_data implicit none integer i, d1, d2, d3 double complex u1(d1+1,d2,d3) integer j, q,r,s double complex chk, local chk = (0.0,0.0) !$omp parallel default(shared) private(i,q,r,s,local) local = (0.0,0.0) !$omp do do j=1,1024 q = mod(j, dims(1))+1 r = mod(3*j,dims(2))+1 s = mod(5*j,dims(3))+1 local=local+u1(q,r,s) end do !$omp critical chk=chk+local !$omp end critical !$omp end parallel chk = chk/dble(dims(1) * dims(2) * dims(3)) write (*, 30) i, REAL(chk), IMAG(chk) 30 format (' T = ', i0, ' Checksum = ', es0.10, " ", es0.10) return end