From 37310e1572e7b83e7401814aa2e1b4ccb41f77f3 Mon Sep 17 00:00:00 2001 From: Dorian Stoll <dorian.stoll@uni-potsdam.de> Date: Thu, 13 Jun 2024 09:26:20 +0200 Subject: [PATCH] nas-ft: fortran: Import from provided reference Using class C parameters for now --- src/benchmarks/nas-ft/fortran/Makefile | 40 + src/benchmarks/nas-ft/fortran/README | 5 + src/benchmarks/nas-ft/fortran/blk_par.h | 4 + src/benchmarks/nas-ft/fortran/ft.f90 | 1133 +++++++++++++++++ src/benchmarks/nas-ft/fortran/ft_data.f90 | 189 +++ .../nas-ft/fortran/inputft.data.sample | 3 + src/benchmarks/nas-ft/fortran/npbparams.h | 32 + .../nas-ft/fortran/print_results.f90 | 136 ++ src/benchmarks/nas-ft/fortran/randi8.f90 | 67 + src/benchmarks/nas-ft/fortran/timers.f90 | 171 +++ 10 files changed, 1780 insertions(+) create mode 100644 src/benchmarks/nas-ft/fortran/Makefile create mode 100644 src/benchmarks/nas-ft/fortran/README create mode 100644 src/benchmarks/nas-ft/fortran/blk_par.h create mode 100644 src/benchmarks/nas-ft/fortran/ft.f90 create mode 100644 src/benchmarks/nas-ft/fortran/ft_data.f90 create mode 100644 src/benchmarks/nas-ft/fortran/inputft.data.sample create mode 100644 src/benchmarks/nas-ft/fortran/npbparams.h create mode 100644 src/benchmarks/nas-ft/fortran/print_results.f90 create mode 100644 src/benchmarks/nas-ft/fortran/randi8.f90 create mode 100644 src/benchmarks/nas-ft/fortran/timers.f90 diff --git a/src/benchmarks/nas-ft/fortran/Makefile b/src/benchmarks/nas-ft/fortran/Makefile new file mode 100644 index 00000000..0e4b0b42 --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/Makefile @@ -0,0 +1,40 @@ +SHELL=/bin/sh +BENCHMARK=ft +BENCHMARKU=FT +BLKFAC=32 + +include ../config/make.def + +include ../sys/make.common + +OBJS = ft.o ft_data.o ${COMMON}/${RAND}.o ${COMMON}/print_results.o \ + ${COMMON}/timers.o ${COMMON}/wtime.o + +${PROGRAM}: config + @ver=$(VERSION); bfac=`echo $$ver|sed -e 's/^blk//' -e 's/^BLK//'`; \ + if [ x$$ver != x$$bfac ] ; then \ + ${MAKE} BLKFAC=$${bfac:-32} exec; \ + else \ + ${MAKE} exec; \ + fi + +exec: $(OBJS) + ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${F_LIB} + + +.f90.o: + ${FCOMPILE} $< + +blk_par.h: FORCE + sed -e 's/=0/=$(BLKFAC)/' blk_par0.h > blk_par.h_wk + @ if ! `diff blk_par.h_wk blk_par.h > /dev/null 2>&1`; then \ + mv -f blk_par.h_wk blk_par.h; else rm -f blk_par.h_wk; fi +FORCE: + +ft.o: ft.f90 ft_data.o +ft_data.o: ft_data.f90 npbparams.h blk_par.h + +clean: + - rm -f *.o *~ mputil* *.mod + - rm -f ft npbparams.h core blk_par.h + - if [ -d rii_files ]; then rm -r rii_files; fi diff --git a/src/benchmarks/nas-ft/fortran/README b/src/benchmarks/nas-ft/fortran/README new file mode 100644 index 00000000..fd6d3b3c --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/README @@ -0,0 +1,5 @@ +This code implements the time integration of a three-dimensional +partial differential equation using the Fast Fourier Transform. + +The code uses Fortran 90 module to specify data fields. So, a compiler +supports Fortran 90 is required. diff --git a/src/benchmarks/nas-ft/fortran/blk_par.h b/src/benchmarks/nas-ft/fortran/blk_par.h new file mode 100644 index 00000000..4251e495 --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/blk_par.h @@ -0,0 +1,4 @@ + integer fftblock_default, fftblockpad_default + parameter (fftblock_default=32, & + & fftblockpad_default=fftblock_default+2) + diff --git a/src/benchmarks/nas-ft/fortran/ft.f90 b/src/benchmarks/nas-ft/fortran/ft.f90 new file mode 100644 index 00000000..cc8ba00c --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/ft.f90 @@ -0,0 +1,1133 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.4 ! +! ! +! O p e n M P V E R S I O N ! +! ! +! F T ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is an OpenMP version of the NPB FT code. ! +! It is described in NAS Technical Report 99-011. ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.4. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.4, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + +!--------------------------------------------------------------------- +! +! Authors: D. Bailey +! W. Saphir +! H. Jin +! +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! FT benchmark +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + program ft + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Module ft_fields defines main arrays (u0, u1, u2) in the problem +!--------------------------------------------------------------------- + + use ft_data + use ft_fields + + implicit none + + integer i + + integer iter + double precision total_time, mflops + logical verified + character class + + +!--------------------------------------------------------------------- +! Run the entire problem once to make sure all data is touched. +! This reduces variable startup costs, which is important for such a +! short benchmark. The other NPB 2 implementations are similar. +!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + + call alloc_space + + call setup() + call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3)) + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + call fft_init (dims(1)) + call fft(1, u1, u0) + +!--------------------------------------------------------------------- +! Start over from the beginning. Note that all operations must +! be timed, in contrast to other benchmarks. +!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + + call timer_start(T_total) + if (timers_enabled) call timer_start(T_setup) + + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + + call fft_init (dims(1)) + + if (timers_enabled) call timer_stop(T_setup) + if (timers_enabled) call timer_start(T_fft) + call fft(1, u1, u0) + if (timers_enabled) call timer_stop(T_fft) + + do iter = 1, niter + if (timers_enabled) call timer_start(T_evolve) + call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_evolve) + if (timers_enabled) call timer_start(T_fft) +! call fft(-1, u1, u2) + call fft(-1, u1, u1) + if (timers_enabled) call timer_stop(T_fft) + if (timers_enabled) call timer_start(T_checksum) +! call checksum(iter, u2, dims(1), dims(2), dims(3)) + call checksum(iter, u1, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_checksum) + end do + + call verify(nx, ny, nz, niter, verified, class) + + call timer_stop(t_total) + total_time = timer_read(t_total) + + if( total_time .ne. 0. ) then + mflops = 1.0d-6*ntotal_f * & + & (14.8157+7.19641*log(ntotal_f) & + & + (5.23518+7.21113*log(ntotal_f))*niter) & + & /total_time + else + mflops = 0.0 + endif + call print_results('FT', class, nx, ny, nz, niter, & + & total_time, mflops, ' floating point', verified, & + & npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + if (timers_enabled) call print_timers() + + end + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine init_ui(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! touch all the big data +!--------------------------------------------------------------------- + + implicit none + integer d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision twiddle(d1+1,d2,d3) + integer i, j, k + +!$omp parallel do default(shared) private(i,j,k) collapse(2) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = 0.d0 + u1(i,j,k) = 0.d0 + twiddle(i,j,k) = 0.d0 + end do + end do + end do + + return + end + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine evolve(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! evolve u0 -> u1 (t time steps) in fourier space +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision twiddle(d1+1,d2,d3) + integer i, j, k + +!$omp parallel do default(shared) private(i,j,k) collapse(2) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k) + u1(i,j,k) = u0(i,j,k) + end do + end do + end do + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_initial_conditions(u0, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Fill in array u0 with initial conditions from +! random number generator +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer d1, d2, d3 + double complex u0(d1+1, d2, d3) + integer k, j + double precision x0, start, an, dummy, starts(nz) + + + start = seed +!--------------------------------------------------------------------- +! Jump to the starting element for our first plane. +!--------------------------------------------------------------------- + call ipow46(a, 0, an) + dummy = randlc(start, an) + call ipow46(a, 2*nx*ny, an) + + starts(1) = start + do k = 2, dims(3) + dummy = randlc(start, an) + starts(k) = start + end do + +!--------------------------------------------------------------------- +! Go through by z planes filling in one square at a time. +!--------------------------------------------------------------------- +!$omp parallel do default(shared) private(k,j,x0) + do k = 1, dims(3) + x0 = starts(k) + do j = 1, dims(2) + call vranlc(2*nx, x0, a, u0(1, j, k)) + end do + end do + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine ipow46(a, exponent, result) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute a^exponent mod 2^46 +!--------------------------------------------------------------------- + + implicit none + double precision a, result, dummy, q, r + integer exponent, n, n2 + external randlc + double precision randlc +!--------------------------------------------------------------------- +! Use +! a^n = a^(n/2)*a^(n/2) if n even else +! a^n = a*a^(n-1) if n odd +!--------------------------------------------------------------------- + result = 1 + if (exponent .eq. 0) return + q = a + r = 1 + n = exponent + + + do while (n .gt. 1) + n2 = n/2 + if (n2 * 2 .eq. n) then + dummy = randlc(q, q) + n = n2 + else + dummy = randlc(r, q) + n = n-1 + endif + end do + dummy = randlc(r, q) + result = r + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine setup + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use ft_data + implicit none + +!$ integer omp_get_max_threads +!$ external omp_get_max_threads + debug = .FALSE. + + call check_timer_flag( timers_enabled ) + + write(*, 1000) + + niter = niter_default + + write(*, 1001) nx, ny, nz + write(*, 1002) niter +!$ write(*, 1003) omp_get_max_threads() + write(*, *) + + + 1000 format(//,' NAS Parallel Benchmarks (NPB3.4-OMP)', & + & ' - FT Benchmark', /) + 1001 format(' Size : ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations :', i7) + 1003 format(' Number of available threads :', i7) + + dims(1) = nx + dims(2) = ny + dims(3) = nz + + +!--------------------------------------------------------------------- +! Set up info for blocking of ffts and transposes. This improves +! performance on cache-based systems. Blocking involves +! working on a chunk of the problem at a time, taking chunks +! along the first, second, or third dimension. +! +! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) +! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) + +! Since 1st dim is always in processor, we'll assume it's long enough +! (default blocking factor is 16 so min size for 1st dim is 16) +! The only case we have to worry about is cffts1 in a 2d decomposition. +! so the blocking factor should not be larger than the 2nd dimension. +!--------------------------------------------------------------------- + + fftblock = fftblock_default + fftblockpad = fftblockpad_default + + if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3 + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_indexmap(twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 +! for time evolution exponent. +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer d1, d2, d3 + double precision twiddle(d1+1, d2, d3) + integer i, j, k, kk, kk2, jj, kj2, ii + double precision ap + +!--------------------------------------------------------------------- +! basically we want to convert the fortran indices +! 1 2 3 4 5 6 7 8 +! to +! 0 1 2 3 -4 -3 -2 -1 +! The following magic formula does the trick: +! mod(i-1+n/2, n) - n/2 +!--------------------------------------------------------------------- + + ap = - 4.d0 * alpha * pi *pi + +!$omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii) & +!$omp& collapse(2) + do k = 1, dims(3) + do j = 1, dims(2) + kk = mod(k-1+nz/2, nz) - nz/2 + kk2 = kk*kk + jj = mod(j-1+ny/2, ny) - ny/2 + kj2 = jj*jj+kk2 + do i = 1, dims(1) + ii = mod(i-1+nx/2, nx) - nx/2 + twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2)) + end do + end do + end do + + return + end + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine print_timers() + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer i + double precision t, t_m + character*25 tstrings(T_max) + data tstrings / ' total ', & + & ' setup ', & + & ' fft ', & + & ' evolve ', & + & ' checksum ', & + & ' fftx ', & + & ' ffty ', & + & ' fftz ' / + + t_m = timer_read(T_total) + if (t_m .le. 0.0d0) t_m = 1.0d0 + do i = 1, t_max + t = timer_read(i) + write(*, 100) i, tstrings(i), t, t*100.0/t_m + end do + 100 format(' timer ', i2, '(', A16, ') :', F9.4, ' (',F6.2,'%)') + return + end + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft(dir, x1, x2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer dir + double complex x1(ntotalp), x2(ntotalp) + + double complex y1(fftblockpad_default*maxdim), & + & y2(fftblockpad_default*maxdim) + +!--------------------------------------------------------------------- +! note: args x1, x2 must be different arrays +! note: args for cfftsx are (direction, layout, xin, xout, scratch) +! xin/xout may be the same and it can be somewhat faster +! if they are +!--------------------------------------------------------------------- + + if (dir .eq. 1) then + call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + else + call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + endif + return + end + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer is, d1, d2, d3, logd1 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d1), y2(fftblockpad, d1) + integer i, j, k, jj, jn + + logd1 = ilog2(d1) + + if (timers_enabled) call timer_start(T_fftx) +!$omp parallel do default(shared) private(i,j,k,jj,y1,y2,jn) & +!$omp& shared(is,logd1,d1) collapse(2) + do k = 1, d3 + do jn = 0, d2/fftblock - 1 +! do jj = 0, d2 - fftblock, fftblock + jj = jn*fftblock + do j = 1, fftblock + do i = 1, d1 + y1(j,i) = x(i,j+jj,k) + enddo + enddo + + call cfftz (is, logd1, d1, y1, y2) + + + do j = 1, fftblock + do i = 1, d1 + xout(i,j+jj,k) = y1(j,i) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftx) + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer is, d1, d2, d3, logd2 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d2), y2(fftblockpad, d2) + integer i, j, k, ii, in + + logd2 = ilog2(d2) + + if (timers_enabled) call timer_start(T_ffty) +!$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in) & +!$omp& shared(is,logd2,d2) collapse(2) + do k = 1, d3 + do in = 0, d1/fftblock - 1 +! do ii = 0, d1 - fftblock, fftblock + ii = in*fftblock + do j = 1, d2 + do i = 1, fftblock + y1(i,j) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd2, d2, y1, y2) + + do j = 1, d2 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,j) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_ffty) + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer is, d1, d2, d3, logd3 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d3), y2(fftblockpad, d3) + integer i, j, k, ii, in + + logd3 = ilog2(d3) + + if (timers_enabled) call timer_start(T_fftz) +!$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in) & +!$omp& shared(is) collapse(2) + do j = 1, d2 + do in = 0, d1/fftblock - 1 +! do ii = 0, d1 - fftblock, fftblock + ii = in*fftblock + do k = 1, d3 + do i = 1, fftblock + y1(i,k) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd3, d3, y1, y2) + + do k = 1, d3 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,k) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftz) + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft_init (n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute the roots-of-unity array that will be used for subsequent FFTs. +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer m,n,nu,ku,i,j,ln + double precision t, ti + + +!--------------------------------------------------------------------- +! Initialize the U array with sines and cosines in a manner that permits +! stride one access at each FFT iteration. +!--------------------------------------------------------------------- + nu = n + m = ilog2(n) + u(1) = m + ku = 2 + ln = 1 + + do j = 1, m + t = pi / ln + + do i = 0, ln - 1 + ti = i * t + u(i+ku) = dcmplx (cos (ti), sin(ti)) + enddo + + ku = ku + ln + ln = 2 * ln + enddo + + return + end + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cfftz (is, m, n, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Computes NY N-point complex-to-complex FFTs of X using an algorithm due +! to Swarztrauber. X is both the input and the output array, while Y is a +! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to +! perform FFTs, the array U must be initialized by calling CFFTZ with IS +! set to 0 and M set to MX, where MX is the maximum value of M for any +! subsequent call. +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer is,m,n,i,j,l,mx + double complex x, y + + dimension x(fftblockpad,n), y(fftblockpad,n) + +!--------------------------------------------------------------------- +! Check if input parameters are invalid. +!--------------------------------------------------------------------- + mx = u(1) + if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx) & + & then + write (*, 1) is, m, mx + 1 format ('CFFTZ: Either U has not been initialized, or else'/ & + & 'one of the input parameters is invalid', 3I5) + stop + endif + +!--------------------------------------------------------------------- +! Perform one variant of the Stockham FFT. +!--------------------------------------------------------------------- + do l = 1, m, 2 + call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) + if (l .eq. m) goto 160 + call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) + enddo + + goto 180 + +!--------------------------------------------------------------------- +! Copy Y to X. +!--------------------------------------------------------------------- + 160 do j = 1, n + do i = 1, fftblock + x(i,j) = y(i,j) + enddo + enddo + + 180 continue + + return + end + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Performs the L-th iteration of the second variant of the Stockham FFT. +!--------------------------------------------------------------------- + + implicit none + + integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 + double complex u,x,y,u1,x11,x21 + dimension u(n), x(ny1,n), y(ny1,n) + + +!--------------------------------------------------------------------- +! Set initial parameters. +!--------------------------------------------------------------------- + + n1 = n / 2 + lk = 2 ** (l - 1) + li = 2 ** (m - l) + lj = 2 * lk + ku = li + 1 + + do i = 0, li - 1 + i11 = i * lk + 1 + i12 = i11 + n1 + i21 = i * lj + 1 + i22 = i21 + lk + if (is .ge. 1) then + u1 = u(ku+i) + else + u1 = dconjg (u(ku+i)) + endif + +!--------------------------------------------------------------------- +! This loop is vectorizable. +!--------------------------------------------------------------------- + do k = 0, lk - 1 + do j = 1, ny + x11 = x(j,i11+k) + x21 = x(j,i12+k) + y(j,i21+k) = x11 + x21 + y(j,i22+k) = u1 * (x11 - x21) + enddo + enddo + enddo + + return + end + +!--------------------------------------------------------------------- + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + integer function ilog2(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer n, nn, lg + if (n .eq. 1) then + ilog2=0 + return + endif + lg = 1 + nn = 2 + do while (nn .lt. n) + nn = nn*2 + lg = lg+1 + end do + ilog2 = lg + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine checksum(i, u1, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use ft_data + implicit none + + integer i, d1, d2, d3 + double complex u1(d1+1,d2,d3) + integer j, q,r,s + double complex chk + chk = (0.0,0.0) + +!$omp parallel do default(shared) private(i,q,r,s) reduction(+:chk) + do j=1,1024 + q = mod(j, nx)+1 + r = mod(3*j,ny)+1 + s = mod(5*j,nz)+1 + chk=chk+u1(q,r,s) + end do + + chk = chk/ntotal_f + + write (*, 30) i, chk + 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) + sums(i) = chk + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine verify (d1, d2, d3, nt, verified, class) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use, intrinsic :: ieee_arithmetic, only : ieee_is_nan + + use ft_data + + implicit none + + integer d1, d2, d3, nt + character class + logical verified + integer i + double precision err, epsilon + +!--------------------------------------------------------------------- +! Reference checksums +!--------------------------------------------------------------------- + double complex csum_ref(25) + + + class = 'U' + + epsilon = 1.0d-12 + verified = .FALSE. + + if (d1 .eq. 64 .and. & + & d2 .eq. 64 .and. & + & d3 .eq. 64 .and. & + & nt .eq. 6) then +!--------------------------------------------------------------------- +! Sample size reference checksums +!--------------------------------------------------------------------- + class = 'S' + csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02) + csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02) + csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02) + csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02) + csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02) + csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02) + + else if (d1 .eq. 128 .and. & + & d2 .eq. 128 .and. & + & d3 .eq. 32 .and. & + & nt .eq. 6) then +!--------------------------------------------------------------------- +! Class W size reference checksums +!--------------------------------------------------------------------- + class = 'W' + csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02) + csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02) + csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02) + csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02) + csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02) + csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02) + + else if (d1 .eq. 256 .and. & + & d2 .eq. 256 .and. & + & d3 .eq. 128 .and. & + & nt .eq. 6) then +!--------------------------------------------------------------------- +! Class A size reference checksums +!--------------------------------------------------------------------- + class = 'A' + csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02) + csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02) + csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02) + csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) + csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) + csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) + + else if (d1 .eq. 512 .and. & + & d2 .eq. 256 .and. & + & d3 .eq. 256 .and. & + & nt .eq. 20) then +!--------------------------------------------------------------------- +! Class B size reference checksums +!--------------------------------------------------------------------- + class = 'B' + csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02) + csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02) + csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02) + csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02) + csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02) + csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02) + csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02) + csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02) + csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02) + csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02) + csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02) + csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02) + csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02) + csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02) + csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02) + csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02) + csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02) + csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02) + csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02) + csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02) + + else if (d1 .eq. 512 .and. & + & d2 .eq. 512 .and. & + & d3 .eq. 512 .and. & + & nt .eq. 20) then +!--------------------------------------------------------------------- +! Class C size reference checksums +!--------------------------------------------------------------------- + class = 'C' + csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02) + csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02) + csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02) + csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02) + csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02) + csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02) + csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02) + csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02) + csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02) + csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02) + csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02) + csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02) + csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02) + csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02) + csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02) + csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02) + csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02) + csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02) + csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02) + csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02) + + else if (d1 .eq. 2048 .and. & + & d2 .eq. 1024 .and. & + & d3 .eq. 1024 .and. & + & nt .eq. 25) then +!--------------------------------------------------------------------- +! Class D size reference checksums +!--------------------------------------------------------------------- + class = 'D' + csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02) + csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02) + csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02) + csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02) + csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02) + csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02) + csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02) + csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02) + csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02) + csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02) + csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02) + csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02) + csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02) + csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02) + csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02) + csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02) + csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02) + csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02) + csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02) + csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02) + csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02) + csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02) + csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02) + csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02) + csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02) + + else if (d1 .eq. 4096 .and. & + & d2 .eq. 2048 .and. & + & d3 .eq. 2048 .and. & + & nt .eq. 25) then +!--------------------------------------------------------------------- +! Class E size reference checksums +!--------------------------------------------------------------------- + class = 'E' + csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02) + csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02) + csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02) + csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02) + csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02) + csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02) + csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02) + csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02) + csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02) + csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02) + csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02) + csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02) + csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02) + csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02) + csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02) + csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02) + csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02) + csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02) + csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02) + csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02) + csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02) + csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02) + csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02) + csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02) + csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02) + + else if (d1 .eq. 8192 .and. & + & d2 .eq. 4096 .and. & + & d3 .eq. 4096 .and. & + & nt .eq. 25) then +!--------------------------------------------------------------------- +! Class F size reference checksums +!--------------------------------------------------------------------- + class = 'F' + csum_ref( 1) = dcmplx(5.119892866928D+02, 5.121457822747D+02) + csum_ref( 2) = dcmplx(5.119560157487D+02, 5.121009044434D+02) + csum_ref( 3) = dcmplx(5.119437960123D+02, 5.120761074285D+02) + csum_ref( 4) = dcmplx(5.119395628845D+02, 5.120614320496D+02) + csum_ref( 5) = dcmplx(5.119390371879D+02, 5.120514085624D+02) + csum_ref( 6) = dcmplx(5.119405091840D+02, 5.120438117102D+02) + csum_ref( 7) = dcmplx(5.119430444528D+02, 5.120376348915D+02) + csum_ref( 8) = dcmplx(5.119460702242D+02, 5.120323831062D+02) + csum_ref( 9) = dcmplx(5.119492377036D+02, 5.120277980818D+02) + csum_ref(10) = dcmplx(5.119523446268D+02, 5.120237368268D+02) + csum_ref(11) = dcmplx(5.119552825361D+02, 5.120201137845D+02) + csum_ref(12) = dcmplx(5.119580008777D+02, 5.120168723492D+02) + csum_ref(13) = dcmplx(5.119604834177D+02, 5.120139707209D+02) + csum_ref(14) = dcmplx(5.119627332821D+02, 5.120113749334D+02) + csum_ref(15) = dcmplx(5.119647637538D+02, 5.120090554887D+02) + csum_ref(16) = dcmplx(5.119665927740D+02, 5.120069857863D+02) + csum_ref(17) = dcmplx(5.119682397643D+02, 5.120051414260D+02) + csum_ref(18) = dcmplx(5.119697238718D+02, 5.120034999132D+02) + csum_ref(19) = dcmplx(5.119710630664D+02, 5.120020405355D+02) + csum_ref(20) = dcmplx(5.119722737384D+02, 5.120007442976D+02) + csum_ref(21) = dcmplx(5.119733705802D+02, 5.119995938652D+02) + csum_ref(22) = dcmplx(5.119743666226D+02, 5.119985735001D+02) + csum_ref(23) = dcmplx(5.119752733481D+02, 5.119976689792D+02) + csum_ref(24) = dcmplx(5.119761008382D+02, 5.119968675026D+02) + csum_ref(25) = dcmplx(5.119768579280D+02, 5.119961575929D+02) + + endif + + + if (class .ne. 'U') then + + do i = 1, nt + err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) ) + if (ieee_is_nan(err) .or. (err .gt. epsilon)) goto 100 + end do + verified = .TRUE. + 100 continue + + endif + + + if (class .ne. 'U') then + if (verified) then + write(*,2000) + 2000 format(' Result verification successful') + else + write(*,2001) + 2001 format(' Result verification failed') + endif + endif + print *, 'class = ', class + + return + end + + diff --git a/src/benchmarks/nas-ft/fortran/ft_data.f90 b/src/benchmarks/nas-ft/fortran/ft_data.f90 new file mode 100644 index 00000000..9f17d49d --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/ft_data.f90 @@ -0,0 +1,189 @@ +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- +! +! ft_data module +! +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + module ft_data + + include 'npbparams.h' + +! total number of grid points with padding + integer(kind2) nxp, ntotalp + parameter (nxp=nx+1) + parameter (ntotalp=nxp*ny*nz) + double precision ntotal_f + parameter (ntotal_f=dble(nx)*ny*nz) + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + include 'blk_par.h' +! integer fftblock_default, fftblockpad_default +! parameter (fftblock_default=32, fftblockpad_default=34) + + integer fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx +! +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer dims(3) + + integer T_total, T_setup, T_fft, T_evolve, T_checksum, & + & T_fftx, T_ffty, & + & T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + & T_evolve = 4, T_checksum = 5, & + & T_fftx = 6, & + & T_ffty = 7, & + & T_fftz = 8, T_max = 8) + + + + logical timers_enabled + + + external timer_read + double precision timer_read + external ilog2 + integer ilog2 + + external randlc + double precision randlc + + +! other stuff + logical debug, debugsynch + + double precision seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + & pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + + +! for checksum data + double complex sums(0:niter_default) + +! number of iterations + integer niter + + + end module ft_data + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- +! +! ft_fields module +! +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + module ft_fields + +!--------------------------------------------------------------------- +! u0, u1, u2 are the main arrays in the problem. +! Depending on the decomposition, these arrays will have different +! dimensions. To accomodate all possibilities, we allocate them as +! one-dimensional arrays and pass them to subroutines for different +! views +! - u0 contains the initial (transformed) initial condition +! - u1 and u2 are working arrays +! - twiddle contains exponents for the time evolution operator. +!--------------------------------------------------------------------- + + double complex, allocatable :: & + & u0(:), pad1(:), & + & u1(:), pad2(:) +! > u2(:) + double precision, allocatable :: twiddle(:) +!--------------------------------------------------------------------- +! Large arrays are in module so that they are allocated on the +! heap rather than the stack. This module is not +! referenced directly anywhere else. Padding is to avoid accidental +! cache problems, since all array sizes are powers of two. +!--------------------------------------------------------------------- + + + end module ft_fields + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine alloc_space + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! allocate space dynamically for data arrays +!--------------------------------------------------------------------- + + use ft_data, only : ntotalp + use ft_fields + + implicit none + + integer ios + + + allocate ( & + & u0(ntotalp), pad1(3), & + & u1(ntotalp), pad2(3), & +! > u2(ntotalp), + & twiddle(ntotalp), & + & stat = ios) + + if (ios .ne. 0) then + write(*,*) 'Error encountered in allocating space' + stop + endif + + return + end + + diff --git a/src/benchmarks/nas-ft/fortran/inputft.data.sample b/src/benchmarks/nas-ft/fortran/inputft.data.sample new file mode 100644 index 00000000..448ac42b --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/inputft.data.sample @@ -0,0 +1,3 @@ +6 ! number of iterations +2 ! layout type. 0 = 0d, 1 = 1d, 2 = 2d +2 4 ! processor layout. 0d must be "1 1"; 1d must be "1 N" diff --git a/src/benchmarks/nas-ft/fortran/npbparams.h b/src/benchmarks/nas-ft/fortran/npbparams.h new file mode 100644 index 00000000..254a5b28 --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/npbparams.h @@ -0,0 +1,32 @@ +! CLASS = C +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + parameter (nx=512, ny=512, nz=512, maxdim=512) + parameter (niter_default=20) + integer kind2 + parameter (kind2=4) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='13 Jun 2024') + character npbversion*5 + parameter (npbversion='3.4.2') + character cs1*8 + parameter (cs1='gfortran') + character cs2*5 + parameter (cs2='$(FC)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*12 + parameter (cs5='-O3 -fopenmp') + character cs6*9 + parameter (cs6='$(FFLAGS)') + character cs7*6 + parameter (cs7='randi8') diff --git a/src/benchmarks/nas-ft/fortran/print_results.f90 b/src/benchmarks/nas-ft/fortran/print_results.f90 new file mode 100644 index 00000000..f6be545c --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/print_results.f90 @@ -0,0 +1,136 @@ + + subroutine print_results(name, class, n1, n2, n3, niter, & + & t, mops, optype, verified, npbversion, & + & compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + + implicit none + character(len=*) name + character class + integer n1, n2, n3, niter, j + double precision t, mops + character optype*24, size*15 + logical verified + character(len=*) npbversion, compiletime, & + & cs1, cs2, cs3, cs4, cs5, cs6, cs7 + integer num_threads, max_threads +!$ integer omp_get_num_threads, omp_get_max_threads +!$ external omp_get_num_threads, omp_get_max_threads + + + max_threads = 0 +!$ max_threads = omp_get_max_threads() + +! figure out number of threads used + num_threads = 0 +!$omp parallel shared(num_threads) +!$omp master +!$ num_threads = omp_get_num_threads() +!$omp end master +!$omp end parallel + + + write (*, 2) name + 2 format(//, ' ', A, ' Benchmark Completed.') + + write (*, 3) Class + 3 format(' Class = ', 12x, a12) + +! If this is not a grid-based problem (EP, FT, CG), then +! we only print n1, which contains some measure of the +! problem size. In that case, n2 and n3 are both zero. +! Otherwise, we print the grid size n1xn2xn3 + + if ((n2 .eq. 0) .and. (n3 .eq. 0)) then + if (name(1:2) .eq. 'EP') then + write(size, '(f15.0)' ) 2.d0**n1 + j = 15 + if (size(j:j) .eq. '.') j = j - 1 + write (*,42) size(1:j) + 42 format(' Size = ',9x, a15) + else + write (*,44) n1 + 44 format(' Size = ',12x, i12) + endif + else + write (*, 4) n1,n2,n3 + 4 format(' Size = ',9x, i4,'x',i4,'x',i4) + endif + + write (*, 5) niter + 5 format(' Iterations = ', 12x, i12) + + write (*, 6) t + 6 format(' Time in seconds = ',12x, f12.2) + + if (num_threads .gt. 0) write (*,7) num_threads + 7 format(' Total threads = ', 12x, i12) + + if (max_threads .gt. 0) write (*,8) max_threads + 8 format(' Avail threads = ', 12x, i12) + + if (num_threads .ne. max_threads) write (*,88) + 88 format(' Warning: Threads used differ from threads available') + + write (*,9) mops + 9 format(' Mop/s total = ',12x, f12.2) + + if (num_threads .gt. 0) write (*,10) mops/float( num_threads ) + 10 format(' Mop/s/thread = ', 12x, f12.2) + + write(*, 11) optype + 11 format(' Operation type = ', a24) + + if (verified) then + write(*,12) ' SUCCESSFUL' + else + write(*,12) 'UNSUCCESSFUL' + endif + 12 format(' Verification = ', 12x, a) + + write(*,13) npbversion + 13 format(' Version = ', 12x, a12) + + write(*,14) compiletime + 14 format(' Compile date = ', 12x, a12) + + + write (*,121) cs1 + 121 format(/, ' Compile options:', /, & + & ' FC = ', A) + + write (*,122) cs2 + 122 format(' FLINK = ', A) + + write (*,123) cs3 + 123 format(' F_LIB = ', A) + + write (*,124) cs4 + 124 format(' F_INC = ', A) + + write (*,125) cs5 + 125 format(' FFLAGS = ', A) + + write (*,126) cs6 + 126 format(' FLINKFLAGS = ', A) + + write(*, 127) cs7 + 127 format(' RAND = ', A) + + write (*,130) + 130 format(//' Please send all errors/feedbacks to:'// & + & ' NPB Development Team'/ & + & ' npb@nas.nasa.gov'//) +! 130 format(//' Please send the results of this run to:'// +! > ' NPB Development Team '/ +! > ' Internet: npb@nas.nasa.gov'/ +! > ' '/ +! > ' If email is not available, send this to:'// +! > ' MS T27A-1'/ +! > ' NASA Ames Research Center'/ +! > ' Moffett Field, CA 94035-1000'// +! > ' Fax: 650-604-3957'//) + + + return + end + diff --git a/src/benchmarks/nas-ft/fortran/randi8.f90 b/src/benchmarks/nas-ft/fortran/randi8.f90 new file mode 100644 index 00000000..f8932eda --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/randi8.f90 @@ -0,0 +1,67 @@ + double precision function randlc(x, a) + +!--------------------------------------------------------------------- +! +! This routine returns a uniform pseudorandom double precision number in the +! range (0, 1) by using the linear congruential generator +! +! x_{k+1} = a x_k (mod 2^46) +! +! where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers +! before repeating. The argument A is the same as 'a' in the above formula, +! and X is the same as x_0. A and X must be odd double precision integers +! in the range (1, 2^46). The returned value RANDLC is normalized to be +! between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain +! the new seed x_1, so that subsequent calls to RANDLC using the same +! arguments will generate a continuous sequence. + + implicit none + double precision x, a + integer(kind=8) i246m1, Lx, La + double precision d2m46 + + parameter(d2m46=0.5d0**46) + + parameter(i246m1=INT(Z'00003FFFFFFFFFFF',8)) + + Lx = X + La = A + + Lx = iand(Lx*La,i246m1) + randlc = d2m46*dble(Lx) + x = dble(Lx) + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + + SUBROUTINE VRANLC (N, X, A, Y) + implicit none + integer n, i + double precision x, a, y(*) + integer(kind=8) i246m1, Lx, La + double precision d2m46 + +! This doesn't work, because the compiler does the calculation in 32 +! bits and overflows. No standard way (without f90 stuff) to specify +! that the rhs should be done in 64 bit arithmetic. +! parameter(i246m1=2**46-1) + + parameter(d2m46=0.5d0**46) + + parameter(i246m1=INT(Z'00003FFFFFFFFFFF',8)) + + Lx = X + La = A + do i = 1, N + Lx = iand(Lx*La,i246m1) + y(i) = d2m46*dble(Lx) + end do + x = dble(Lx) + + return + end + diff --git a/src/benchmarks/nas-ft/fortran/timers.f90 b/src/benchmarks/nas-ft/fortran/timers.f90 new file mode 100644 index 00000000..3a50de94 --- /dev/null +++ b/src/benchmarks/nas-ft/fortran/timers.f90 @@ -0,0 +1,171 @@ +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + module timers + + double precision start(64), elapsed(64) +!$omp threadprivate(start, elapsed) + + double precision, external :: elapsed_time + + end module timers + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine timer_clear(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use timers + implicit none + + integer n + + elapsed(n) = 0.0 + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine timer_start(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use timers + implicit none + + integer n + + start(n) = elapsed_time() + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine timer_stop(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use timers + implicit none + + integer n + + double precision t, now + + now = elapsed_time() + t = now - start(n) + elapsed(n) = elapsed(n) + t + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + double precision function timer_read(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + use timers + implicit none + + integer n + + timer_read = elapsed(n) + + return + end + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + double precision function elapsed_time() + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +!$ external omp_get_wtime +!$ double precision omp_get_wtime + + double precision t + logical mp + +! ... Use the OpenMP timer if we can (via C$ conditional compilation) + mp = .false. +!$ mp = .true. +!$ t = omp_get_wtime() + + if (.not.mp) then +! This function must measure wall clock time, not CPU time. +! Since there is no portable timer in Fortran (77) +! we call a routine compiled in C (though the C source may have +! to be tweaked). + call wtime(t) +! The following is not ok for "official" results because it reports +! CPU time not wall clock time. It may be useful for developing/testing +! on timeshared Crays, though. +! call second(t) + endif + + elapsed_time = t + + return + end + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine check_timer_flag( timeron ) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + logical timeron + + integer nc, ios + character(len=20) val + + timeron = .false. + +! ... Check environment variable "NPB_TIMER_FLAG" + call get_environment_variable('NPB_TIMER_FLAG', val, nc, ios) + if (ios .eq. 0) then + if (nc .le. 0) then + timeron = .true. + else if (val(1:1) .ge. '1' .and. val(1:1) .le. '9') then + timeron = .true. + else if (val .eq. 'on' .or. val .eq. 'ON' .or. & + & val .eq. 'yes' .or. val .eq. 'YES' .or. & + & val .eq. 'true' .or. val .eq. 'TRUE') then + timeron = .true. + endif + + else + +! ... Check if the "timer.flag" file exists + open (unit=2, file='timer.flag', status='old', iostat=ios) + if (ios .eq. 0) then + close(2) + timeron = .true. + endif + + endif + + return + end -- GitLab