From b82192d01982d4213c38e9c398536f300d02b92c Mon Sep 17 00:00:00 2001 From: Dorian Stoll <dorian.stoll@tmsp.io> Date: Thu, 13 Jun 2024 15:45:16 +0200 Subject: [PATCH] nas-ft: fortran: Remove built-in timekeeping We are only interested in the runtime of the entire program and that can be measured more easily. --- src/benchmarks/nas-ft/fortran/ft.f90 | 148 ++++----------- src/benchmarks/nas-ft/fortran/ft_data.f90 | 36 ++-- src/benchmarks/nas-ft/fortran/meson.build | 1 - .../nas-ft/fortran/print_results.f90 | 22 +-- src/benchmarks/nas-ft/fortran/randi8.f90 | 2 +- src/benchmarks/nas-ft/fortran/timers.f90 | 171 ------------------ 6 files changed, 53 insertions(+), 327 deletions(-) delete mode 100644 src/benchmarks/nas-ft/fortran/timers.f90 diff --git a/src/benchmarks/nas-ft/fortran/ft.f90 b/src/benchmarks/nas-ft/fortran/ft.f90 index cc8ba00c..86452da4 100644 --- a/src/benchmarks/nas-ft/fortran/ft.f90 +++ b/src/benchmarks/nas-ft/fortran/ft.f90 @@ -66,23 +66,16 @@ implicit none - integer i - integer iter - double precision total_time, mflops logical verified character class !--------------------------------------------------------------------- -! Run the entire problem once to make sure all data is touched. -! This reduces variable startup costs, which is important for such a -! short benchmark. The other NPB 2 implementations are similar. +! Run the entire problem once to make sure all data is touched. +! This reduces variable startup costs, which is important for such a +! short benchmark. The other NPB 2 implementations are similar. !--------------------------------------------------------------------- - do i = 1, t_max - call timer_clear(i) - end do - call alloc_space call setup() @@ -94,57 +87,29 @@ !--------------------------------------------------------------------- ! Start over from the beginning. Note that all operations must -! be timed, in contrast to other benchmarks. +! be timed, in contrast to other benchmarks. !--------------------------------------------------------------------- - do i = 1, t_max - call timer_clear(i) - end do - - call timer_start(T_total) - if (timers_enabled) call timer_start(T_setup) - call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) call fft_init (dims(1)) - if (timers_enabled) call timer_stop(T_setup) - if (timers_enabled) call timer_start(T_fft) call fft(1, u1, u0) - if (timers_enabled) call timer_stop(T_fft) do iter = 1, niter - if (timers_enabled) call timer_start(T_evolve) call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3)) - if (timers_enabled) call timer_stop(T_evolve) - if (timers_enabled) call timer_start(T_fft) ! call fft(-1, u1, u2) call fft(-1, u1, u1) - if (timers_enabled) call timer_stop(T_fft) - if (timers_enabled) call timer_start(T_checksum) ! call checksum(iter, u2, dims(1), dims(2), dims(3)) call checksum(iter, u1, dims(1), dims(2), dims(3)) - if (timers_enabled) call timer_stop(T_checksum) end do call verify(nx, ny, nz, niter, verified, class) - call timer_stop(t_total) - total_time = timer_read(t_total) - - if( total_time .ne. 0. ) then - mflops = 1.0d-6*ntotal_f * & - & (14.8157+7.19641*log(ntotal_f) & - & + (5.23518+7.21113*log(ntotal_f))*niter) & - & /total_time - else - mflops = 0.0 - endif call print_results('FT', class, nx, ny, nz, niter, & - & total_time, mflops, ' floating point', verified, & + & ' floating point', verified, & & npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) - if (timers_enabled) call print_timers() end @@ -225,8 +190,8 @@ !--------------------------------------------------------------------- !--------------------------------------------------------------------- -! Fill in array u0 with initial conditions from -! random number generator +! Fill in array u0 with initial conditions from +! random number generator !--------------------------------------------------------------------- use ft_data @@ -236,7 +201,7 @@ double complex u0(d1+1, d2, d3) integer k, j double precision x0, start, an, dummy, starts(nz) - + start = seed !--------------------------------------------------------------------- @@ -251,14 +216,14 @@ dummy = randlc(start, an) starts(k) = start end do - + !--------------------------------------------------------------------- ! Go through by z planes filling in one square at a time. !--------------------------------------------------------------------- !$omp parallel do default(shared) private(k,j,x0) - do k = 1, dims(3) + do k = 1, dims(3) x0 = starts(k) - do j = 1, dims(2) + do j = 1, dims(2) call vranlc(2*nx, x0, a, u0(1, j, k)) end do end do @@ -299,7 +264,7 @@ do while (n .gt. 1) n2 = n/2 if (n2 * 2 .eq. n) then - dummy = randlc(q, q) + dummy = randlc(q, q) n = n2 else dummy = randlc(r, q) @@ -327,8 +292,6 @@ !$ external omp_get_max_threads debug = .FALSE. - call check_timer_flag( timers_enabled ) - write(*, 1000) niter = niter_default @@ -354,15 +317,15 @@ ! Set up info for blocking of ffts and transposes. This improves ! performance on cache-based systems. Blocking involves ! working on a chunk of the problem at a time, taking chunks -! along the first, second, or third dimension. +! along the first, second, or third dimension. ! ! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) ! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) -! Since 1st dim is always in processor, we'll assume it's long enough +! Since 1st dim is always in processor, we'll assume it's long enough ! (default blocking factor is 16 so min size for 1st dim is 16) -! The only case we have to worry about is cffts1 in a 2d decomposition. -! so the blocking factor should not be larger than the 2nd dimension. +! The only case we have to worry about is cffts1 in a 2d decomposition. +! so the blocking factor should not be larger than the 2nd dimension. !--------------------------------------------------------------------- fftblock = fftblock_default @@ -373,7 +336,7 @@ return end - + !--------------------------------------------------------------------- !--------------------------------------------------------------------- @@ -383,8 +346,8 @@ !--------------------------------------------------------------------- !--------------------------------------------------------------------- -! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 -! for time evolution exponent. +! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 +! for time evolution exponent. !--------------------------------------------------------------------- use ft_data @@ -396,9 +359,9 @@ double precision ap !--------------------------------------------------------------------- -! basically we want to convert the fortran indices -! 1 2 3 4 5 6 7 8 -! to +! basically we want to convert the fortran indices +! 1 2 3 4 5 6 7 8 +! to ! 0 1 2 3 -4 -3 -2 -1 ! The following magic formula does the trick: ! mod(i-1+n/2, n) - n/2 @@ -426,41 +389,6 @@ -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - subroutine print_timers() - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - use ft_data - implicit none - - integer i - double precision t, t_m - character*25 tstrings(T_max) - data tstrings / ' total ', & - & ' setup ', & - & ' fft ', & - & ' evolve ', & - & ' checksum ', & - & ' fftx ', & - & ' ffty ', & - & ' fftz ' / - - t_m = timer_read(T_total) - if (t_m .le. 0.0d0) t_m = 1.0d0 - do i = 1, t_max - t = timer_read(i) - write(*, 100) i, tstrings(i), t, t*100.0/t_m - end do - 100 format(' timer ', i2, '(', A16, ') :', F9.4, ' (',F6.2,'%)') - return - end - - - !--------------------------------------------------------------------- !--------------------------------------------------------------------- @@ -518,7 +446,6 @@ logd1 = ilog2(d1) - if (timers_enabled) call timer_start(T_fftx) !$omp parallel do default(shared) private(i,j,k,jj,y1,y2,jn) & !$omp& shared(is,logd1,d1) collapse(2) do k = 1, d3 @@ -530,7 +457,7 @@ y1(j,i) = x(i,j+jj,k) enddo enddo - + call cfftz (is, logd1, d1, y1, y2) @@ -541,7 +468,6 @@ enddo enddo enddo - if (timers_enabled) call timer_stop(T_fftx) return end @@ -566,7 +492,6 @@ logd2 = ilog2(d2) - if (timers_enabled) call timer_start(T_ffty) !$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in) & !$omp& shared(is,logd2,d2) collapse(2) do k = 1, d3 @@ -580,7 +505,7 @@ enddo call cfftz (is, logd2, d2, y1, y2) - + do j = 1, d2 do i = 1, fftblock xout(i+ii,j,k) = y1(i,j) @@ -588,7 +513,6 @@ enddo enddo enddo - if (timers_enabled) call timer_stop(T_ffty) return end @@ -613,7 +537,6 @@ logd3 = ilog2(d3) - if (timers_enabled) call timer_start(T_fftz) !$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in) & !$omp& shared(is) collapse(2) do j = 1, d2 @@ -635,7 +558,6 @@ enddo enddo enddo - if (timers_enabled) call timer_stop(T_fftz) return end @@ -650,7 +572,7 @@ !--------------------------------------------------------------------- !--------------------------------------------------------------------- -! compute the roots-of-unity array that will be used for subsequent FFTs. +! compute the roots-of-unity array that will be used for subsequent FFTs. !--------------------------------------------------------------------- use ft_data @@ -672,16 +594,16 @@ do j = 1, m t = pi / ln - + do i = 0, ln - 1 ti = i * t u(i+ku) = dcmplx (cos (ti), sin(ti)) enddo - + ku = ku + ln ln = 2 * ln enddo - + return end @@ -695,10 +617,10 @@ !--------------------------------------------------------------------- ! Computes NY N-point complex-to-complex FFTs of X using an algorithm due -! to Swarztrauber. X is both the input and the output array, while Y is a -! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to -! perform FFTs, the array U must be initialized by calling CFFTZ with IS -! set to 0 and M set to MX, where MX is the maximum value of M for any +! to Swarztrauber. X is both the input and the output array, while Y is a +! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to +! perform FFTs, the array U must be initialized by calling CFFTZ with IS +! set to 0 and M set to MX, where MX is the maximum value of M for any ! subsequent call. !--------------------------------------------------------------------- @@ -857,7 +779,7 @@ end do chk = chk/ntotal_f - + write (*, 30) i, chk 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) sums(i) = chk @@ -940,7 +862,7 @@ csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) - + else if (d1 .eq. 512 .and. & & d2 .eq. 256 .and. & & d3 .eq. 256 .and. & @@ -1115,7 +1037,7 @@ endif - + if (class .ne. 'U') then if (verified) then write(*,2000) diff --git a/src/benchmarks/nas-ft/fortran/ft_data.f90 b/src/benchmarks/nas-ft/fortran/ft_data.f90 index 9f17d49d..a94b1e34 100644 --- a/src/benchmarks/nas-ft/fortran/ft_data.f90 +++ b/src/benchmarks/nas-ft/fortran/ft_data.f90 @@ -22,9 +22,9 @@ ! Cache blocking params. These values are good for most -! RISC processors. +! RISC processors. ! FFT parameters: -! fftblock controls how many ffts are done at a time. +! fftblock controls how many ffts are done at a time. ! The default is appropriate for most cache-based machines ! On vector machines, the FFT can be vectorized with vector ! length equal to the block size, so the block size should @@ -35,11 +35,11 @@ include 'blk_par.h' ! integer fftblock_default, fftblockpad_default ! parameter (fftblock_default=32, fftblockpad_default=34) - + integer fftblock, fftblockpad ! we need a bunch of logic to keep track of how -! arrays are laid out. +! arrays are laid out. ! Note: this serial version is the derived from the parallel 0D case @@ -61,29 +61,15 @@ ! compute residual(1) ! for the 0D, 1D, 2D strategies, the layouts look like xxx -! +! ! 0D 1D 2D ! 1: xyz xyz xyz ! the array dimensions are stored in dims(coord, phase) integer dims(3) - integer T_total, T_setup, T_fft, T_evolve, T_checksum, & - & T_fftx, T_ffty, & - & T_fftz, T_max - parameter (T_total = 1, T_setup = 2, T_fft = 3, & - & T_evolve = 4, T_checksum = 5, & - & T_fftx = 6, & - & T_ffty = 7, & - & T_fftz = 8, T_max = 8) - - - - logical timers_enabled - external timer_read - double precision timer_read external ilog2 integer ilog2 @@ -125,14 +111,14 @@ module ft_fields !--------------------------------------------------------------------- -! u0, u1, u2 are the main arrays in the problem. -! Depending on the decomposition, these arrays will have different -! dimensions. To accomodate all possibilities, we allocate them as -! one-dimensional arrays and pass them to subroutines for different +! u0, u1, u2 are the main arrays in the problem. +! Depending on the decomposition, these arrays will have different +! dimensions. To accomodate all possibilities, we allocate them as +! one-dimensional arrays and pass them to subroutines for different ! views ! - u0 contains the initial (transformed) initial condition ! - u1 and u2 are working arrays -! - twiddle contains exponents for the time evolution operator. +! - twiddle contains exponents for the time evolution operator. !--------------------------------------------------------------------- double complex, allocatable :: & @@ -143,7 +129,7 @@ !--------------------------------------------------------------------- ! Large arrays are in module so that they are allocated on the ! heap rather than the stack. This module is not -! referenced directly anywhere else. Padding is to avoid accidental +! referenced directly anywhere else. Padding is to avoid accidental ! cache problems, since all array sizes are powers of two. !--------------------------------------------------------------------- diff --git a/src/benchmarks/nas-ft/fortran/meson.build b/src/benchmarks/nas-ft/fortran/meson.build index e3e613c4..add5abc1 100644 --- a/src/benchmarks/nas-ft/fortran/meson.build +++ b/src/benchmarks/nas-ft/fortran/meson.build @@ -2,7 +2,6 @@ sources = [ 'ft.f90', 'ft_data.f90', 'print_results.f90', - 'timers.f90', 'randi8.f90', ] diff --git a/src/benchmarks/nas-ft/fortran/print_results.f90 b/src/benchmarks/nas-ft/fortran/print_results.f90 index f6be545c..dc168bcd 100644 --- a/src/benchmarks/nas-ft/fortran/print_results.f90 +++ b/src/benchmarks/nas-ft/fortran/print_results.f90 @@ -1,13 +1,12 @@ subroutine print_results(name, class, n1, n2, n3, niter, & - & t, mops, optype, verified, npbversion, & + & optype, verified, npbversion, & & compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) - + implicit none character(len=*) name character class integer n1, n2, n3, niter, j - double precision t, mops character optype*24, size*15 logical verified character(len=*) npbversion, compiletime, & @@ -58,29 +57,20 @@ write (*, 5) niter 5 format(' Iterations = ', 12x, i12) - - write (*, 6) t - 6 format(' Time in seconds = ',12x, f12.2) if (num_threads .gt. 0) write (*,7) num_threads 7 format(' Total threads = ', 12x, i12) - + if (max_threads .gt. 0) write (*,8) max_threads 8 format(' Avail threads = ', 12x, i12) - if (num_threads .ne. max_threads) write (*,88) + if (num_threads .ne. max_threads) write (*,88) 88 format(' Warning: Threads used differ from threads available') - write (*,9) mops - 9 format(' Mop/s total = ',12x, f12.2) - - if (num_threads .gt. 0) write (*,10) mops/float( num_threads ) - 10 format(' Mop/s/thread = ', 12x, f12.2) - write(*, 11) optype 11 format(' Operation type = ', a24) - if (verified) then + if (verified) then write(*,12) ' SUCCESSFUL' else write(*,12) 'UNSUCCESSFUL' @@ -115,7 +105,7 @@ write(*, 127) cs7 127 format(' RAND = ', A) - + write (*,130) 130 format(//' Please send all errors/feedbacks to:'// & & ' NPB Development Team'/ & diff --git a/src/benchmarks/nas-ft/fortran/randi8.f90 b/src/benchmarks/nas-ft/fortran/randi8.f90 index f8932eda..102bcaa3 100644 --- a/src/benchmarks/nas-ft/fortran/randi8.f90 +++ b/src/benchmarks/nas-ft/fortran/randi8.f90 @@ -47,7 +47,7 @@ ! This doesn't work, because the compiler does the calculation in 32 ! bits and overflows. No standard way (without f90 stuff) to specify -! that the rhs should be done in 64 bit arithmetic. +! that the rhs should be done in 64 bit arithmetic. ! parameter(i246m1=2**46-1) parameter(d2m46=0.5d0**46) diff --git a/src/benchmarks/nas-ft/fortran/timers.f90 b/src/benchmarks/nas-ft/fortran/timers.f90 deleted file mode 100644 index 3a50de94..00000000 --- a/src/benchmarks/nas-ft/fortran/timers.f90 +++ /dev/null @@ -1,171 +0,0 @@ -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - module timers - - double precision start(64), elapsed(64) -!$omp threadprivate(start, elapsed) - - double precision, external :: elapsed_time - - end module timers - - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - subroutine timer_clear(n) - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - use timers - implicit none - - integer n - - elapsed(n) = 0.0 - return - end - - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - subroutine timer_start(n) - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - use timers - implicit none - - integer n - - start(n) = elapsed_time() - - return - end - - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - subroutine timer_stop(n) - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - use timers - implicit none - - integer n - - double precision t, now - - now = elapsed_time() - t = now - start(n) - elapsed(n) = elapsed(n) + t - - return - end - - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - double precision function timer_read(n) - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - use timers - implicit none - - integer n - - timer_read = elapsed(n) - - return - end - - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - double precision function elapsed_time() - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - implicit none -!$ external omp_get_wtime -!$ double precision omp_get_wtime - - double precision t - logical mp - -! ... Use the OpenMP timer if we can (via C$ conditional compilation) - mp = .false. -!$ mp = .true. -!$ t = omp_get_wtime() - - if (.not.mp) then -! This function must measure wall clock time, not CPU time. -! Since there is no portable timer in Fortran (77) -! we call a routine compiled in C (though the C source may have -! to be tweaked). - call wtime(t) -! The following is not ok for "official" results because it reports -! CPU time not wall clock time. It may be useful for developing/testing -! on timeshared Crays, though. -! call second(t) - endif - - elapsed_time = t - - return - end - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - subroutine check_timer_flag( timeron ) - -!--------------------------------------------------------------------- -!--------------------------------------------------------------------- - - implicit none - logical timeron - - integer nc, ios - character(len=20) val - - timeron = .false. - -! ... Check environment variable "NPB_TIMER_FLAG" - call get_environment_variable('NPB_TIMER_FLAG', val, nc, ios) - if (ios .eq. 0) then - if (nc .le. 0) then - timeron = .true. - else if (val(1:1) .ge. '1' .and. val(1:1) .le. '9') then - timeron = .true. - else if (val .eq. 'on' .or. val .eq. 'ON' .or. & - & val .eq. 'yes' .or. val .eq. 'YES' .or. & - & val .eq. 'true' .or. val .eq. 'TRUE') then - timeron = .true. - endif - - else - -! ... Check if the "timer.flag" file exists - open (unit=2, file='timer.flag', status='old', iostat=ios) - if (ios .eq. 0) then - close(2) - timeron = .true. - endif - - endif - - return - end -- GitLab