From b82192d01982d4213c38e9c398536f300d02b92c Mon Sep 17 00:00:00 2001
From: Dorian Stoll <dorian.stoll@tmsp.io>
Date: Thu, 13 Jun 2024 15:45:16 +0200
Subject: [PATCH] nas-ft: fortran: Remove built-in timekeeping

We are only interested in the runtime of the entire program, and that
can be measured more easily from outside the benchmark.
---
 src/benchmarks/nas-ft/fortran/ft.f90          | 148 ++++-----------
 src/benchmarks/nas-ft/fortran/ft_data.f90     |  36 ++--
 src/benchmarks/nas-ft/fortran/meson.build     |   1 -
 .../nas-ft/fortran/print_results.f90          |  22 +--
 src/benchmarks/nas-ft/fortran/randi8.f90      |   2 +-
 src/benchmarks/nas-ft/fortran/timers.f90      | 171 ------------------
 6 files changed, 53 insertions(+), 327 deletions(-)
 delete mode 100644 src/benchmarks/nas-ft/fortran/timers.f90

diff --git a/src/benchmarks/nas-ft/fortran/ft.f90 b/src/benchmarks/nas-ft/fortran/ft.f90
index cc8ba00c..86452da4 100644
--- a/src/benchmarks/nas-ft/fortran/ft.f90
+++ b/src/benchmarks/nas-ft/fortran/ft.f90
@@ -66,23 +66,16 @@
 
       implicit none
 
-      integer i
-
       integer iter
-      double precision total_time, mflops
       logical verified
       character class
 
 
 !---------------------------------------------------------------------
-! Run the entire problem once to make sure all data is touched. 
-! This reduces variable startup costs, which is important for such a 
-! short benchmark. The other NPB 2 implementations are similar. 
+! Run the entire problem once to make sure all data is touched.
+! This reduces variable startup costs, which is important for such a
+! short benchmark. The other NPB 2 implementations are similar.
 !---------------------------------------------------------------------
-      do i = 1, t_max
-         call timer_clear(i)
-      end do
-
       call alloc_space
 
       call setup()
@@ -94,57 +87,29 @@
 
 !---------------------------------------------------------------------
 ! Start over from the beginning. Note that all operations must
-! be timed, in contrast to other benchmarks. 
+! be timed, in contrast to other benchmarks.
 !---------------------------------------------------------------------
-      do i = 1, t_max
-         call timer_clear(i)
-      end do
-
-      call timer_start(T_total)
-      if (timers_enabled) call timer_start(T_setup)
-
       call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
 
       call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
 
       call fft_init (dims(1))
 
-      if (timers_enabled) call timer_stop(T_setup)
-      if (timers_enabled) call timer_start(T_fft)
       call fft(1, u1, u0)
-      if (timers_enabled) call timer_stop(T_fft)
 
       do iter = 1, niter
-         if (timers_enabled) call timer_start(T_evolve)
          call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3))
-         if (timers_enabled) call timer_stop(T_evolve)
-         if (timers_enabled) call timer_start(T_fft)
 !         call fft(-1, u1, u2)
          call fft(-1, u1, u1)
-         if (timers_enabled) call timer_stop(T_fft)
-         if (timers_enabled) call timer_start(T_checksum)
 !         call checksum(iter, u2, dims(1), dims(2), dims(3))
          call checksum(iter, u1, dims(1), dims(2), dims(3))
-         if (timers_enabled) call timer_stop(T_checksum)
       end do
 
       call verify(nx, ny, nz, niter, verified, class)
 
-      call timer_stop(t_total)
-      total_time = timer_read(t_total)
-
-      if( total_time .ne. 0. ) then
-         mflops = 1.0d-6*ntotal_f *  &
-     &             (14.8157+7.19641*log(ntotal_f)  &
-     &          +  (5.23518+7.21113*log(ntotal_f))*niter)  &
-     &                 /total_time
-      else
-         mflops = 0.0
-      endif
       call print_results('FT', class, nx, ny, nz, niter,  &
-     &  total_time, mflops, '          floating point', verified,  &
+     &  '          floating point', verified,  &
      &  npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
-      if (timers_enabled) call print_timers()
 
       end
 
@@ -225,8 +190,8 @@
 !---------------------------------------------------------------------
 
 !---------------------------------------------------------------------
-! Fill in array u0 with initial conditions from 
-! random number generator 
+! Fill in array u0 with initial conditions from
+! random number generator
 !---------------------------------------------------------------------
 
       use ft_data
@@ -236,7 +201,7 @@
       double complex u0(d1+1, d2, d3)
       integer k, j
       double precision x0, start, an, dummy, starts(nz)
-      
+
 
       start = seed
 !---------------------------------------------------------------------
@@ -251,14 +216,14 @@
          dummy = randlc(start, an)
          starts(k) = start
       end do
-      
+
 !---------------------------------------------------------------------
 ! Go through by z planes filling in one square at a time.
 !---------------------------------------------------------------------
 !$omp parallel do default(shared) private(k,j,x0)
-      do k = 1, dims(3) 
+      do k = 1, dims(3)
          x0 = starts(k)
-         do j = 1, dims(2) 
+         do j = 1, dims(2)
             call vranlc(2*nx, x0, a, u0(1, j, k))
          end do
       end do
@@ -299,7 +264,7 @@
       do while (n .gt. 1)
          n2 = n/2
          if (n2 * 2 .eq. n) then
-            dummy = randlc(q, q) 
+            dummy = randlc(q, q)
             n = n2
          else
             dummy = randlc(r, q)
@@ -327,8 +292,6 @@
 !$    external omp_get_max_threads
       debug = .FALSE.
 
-      call check_timer_flag( timers_enabled )
-
       write(*, 1000)
 
       niter = niter_default
@@ -354,15 +317,15 @@
 ! Set up info for blocking of ffts and transposes.  This improves
 ! performance on cache-based systems. Blocking involves
 ! working on a chunk of the problem at a time, taking chunks
-! along the first, second, or third dimension. 
+! along the first, second, or third dimension.
 !
 ! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
 ! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
 
-! Since 1st dim is always in processor, we'll assume it's long enough 
+! Since 1st dim is always in processor, we'll assume it's long enough
 ! (default blocking factor is 16 so min size for 1st dim is 16)
-! The only case we have to worry about is cffts1 in a 2d decomposition. 
-! so the blocking factor should not be larger than the 2nd dimension. 
+! The only case we have to worry about is cffts1 in a 2d decomposition.
+! so the blocking factor should not be larger than the 2nd dimension.
 !---------------------------------------------------------------------
 
       fftblock = fftblock_default
@@ -373,7 +336,7 @@
       return
       end
 
-      
+
 !---------------------------------------------------------------------
 !---------------------------------------------------------------------
 
@@ -383,8 +346,8 @@
 !---------------------------------------------------------------------
 
 !---------------------------------------------------------------------
-! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 
-! for time evolution exponent. 
+! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+! for time evolution exponent.
 !---------------------------------------------------------------------
 
       use ft_data
@@ -396,9 +359,9 @@
       double precision ap
 
 !---------------------------------------------------------------------
-! basically we want to convert the fortran indices 
-!   1 2 3 4 5 6 7 8 
-! to 
+! basically we want to convert the fortran indices
+!   1 2 3 4 5 6 7 8
+! to
 !   0 1 2 3 -4 -3 -2 -1
 ! The following magic formula does the trick:
 ! mod(i-1+n/2, n) - n/2
@@ -426,41 +389,6 @@
 
 
 
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      subroutine print_timers()
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      use ft_data
-      implicit none
-
-      integer i
-      double precision t, t_m
-      character*25 tstrings(T_max)
-      data tstrings / '          total ',  &
-     &                '          setup ',  &
-     &                '            fft ',  &
-     &                '         evolve ',  &
-     &                '       checksum ',  &
-     &                '           fftx ',  &
-     &                '           ffty ',  &
-     &                '           fftz ' /
-
-      t_m = timer_read(T_total)
-      if (t_m .le. 0.0d0) t_m = 1.0d0
-      do i = 1, t_max
-         t = timer_read(i)
-         write(*, 100) i, tstrings(i), t, t*100.0/t_m
-      end do
- 100  format(' timer ', i2, '(', A16,  ') :', F9.4, ' (',F6.2,'%)')
-      return
-      end
-
-
-
 !---------------------------------------------------------------------
 !---------------------------------------------------------------------
 
@@ -518,7 +446,6 @@
 
       logd1 = ilog2(d1)
 
-      if (timers_enabled) call timer_start(T_fftx)
 !$omp parallel do default(shared) private(i,j,k,jj,y1,y2,jn)  &
 !$omp&  shared(is,logd1,d1) collapse(2)
       do k = 1, d3
@@ -530,7 +457,7 @@
                   y1(j,i) = x(i,j+jj,k)
                enddo
             enddo
-            
+
             call cfftz (is, logd1, d1, y1, y2)
 
 
@@ -541,7 +468,6 @@
             enddo
          enddo
       enddo
-      if (timers_enabled) call timer_stop(T_fftx)
 
       return
       end
@@ -566,7 +492,6 @@
 
       logd2 = ilog2(d2)
 
-      if (timers_enabled) call timer_start(T_ffty)
 !$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in)  &
 !$omp&  shared(is,logd2,d2) collapse(2)
       do k = 1, d3
@@ -580,7 +505,7 @@
            enddo
 
            call cfftz (is, logd2, d2, y1, y2)
-           
+
            do j = 1, d2
               do i = 1, fftblock
                  xout(i+ii,j,k) = y1(i,j)
@@ -588,7 +513,6 @@
            enddo
         enddo
       enddo
-      if (timers_enabled) call timer_stop(T_ffty)
 
       return
       end
@@ -613,7 +537,6 @@
 
       logd3 = ilog2(d3)
 
-      if (timers_enabled) call timer_start(T_fftz)
 !$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in)  &
 !$omp&  shared(is) collapse(2)
       do j = 1, d2
@@ -635,7 +558,6 @@
            enddo
         enddo
       enddo
-      if (timers_enabled) call timer_stop(T_fftz)
 
       return
       end
@@ -650,7 +572,7 @@
 !---------------------------------------------------------------------
 
 !---------------------------------------------------------------------
-! compute the roots-of-unity array that will be used for subsequent FFTs. 
+! compute the roots-of-unity array that will be used for subsequent FFTs.
 !---------------------------------------------------------------------
 
       use ft_data
@@ -672,16 +594,16 @@
 
       do j = 1, m
          t = pi / ln
-         
+
          do i = 0, ln - 1
             ti = i * t
             u(i+ku) = dcmplx (cos (ti), sin(ti))
          enddo
-         
+
          ku = ku + ln
          ln = 2 * ln
       enddo
-      
+
       return
       end
 
@@ -695,10 +617,10 @@
 
 !---------------------------------------------------------------------
 !   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
-!   to Swarztrauber.  X is both the input and the output array, while Y is a 
-!   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to 
-!   perform FFTs, the array U must be initialized by calling CFFTZ with IS 
-!   set to 0 and M set to MX, where MX is the maximum value of M for any 
+!   to Swarztrauber.  X is both the input and the output array, while Y is a
+!   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
+!   perform FFTs, the array U must be initialized by calling CFFTZ with IS
+!   set to 0 and M set to MX, where MX is the maximum value of M for any
 !   subsequent call.
 !---------------------------------------------------------------------
 
@@ -857,7 +779,7 @@
       end do
 
       chk = chk/ntotal_f
-      
+
       write (*, 30) i, chk
  30   format (' T =',I5,5X,'Checksum =',1P2D22.12)
       sums(i) = chk
@@ -940,7 +862,7 @@
          csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
          csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
          csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
-      
+
       else if (d1 .eq. 512 .and.  &
      &    d2 .eq. 256 .and.  &
      &    d3 .eq. 256 .and.  &
@@ -1115,7 +1037,7 @@
 
       endif
 
-         
+
       if (class .ne. 'U') then
          if (verified) then
             write(*,2000)
diff --git a/src/benchmarks/nas-ft/fortran/ft_data.f90 b/src/benchmarks/nas-ft/fortran/ft_data.f90
index 9f17d49d..a94b1e34 100644
--- a/src/benchmarks/nas-ft/fortran/ft_data.f90
+++ b/src/benchmarks/nas-ft/fortran/ft_data.f90
@@ -22,9 +22,9 @@
 
 
 ! Cache blocking params. These values are good for most
-! RISC processors.  
+! RISC processors.
 ! FFT parameters:
-!  fftblock controls how many ffts are done at a time. 
+!  fftblock controls how many ffts are done at a time.
 !  The default is appropriate for most cache-based machines
 !  On vector machines, the FFT can be vectorized with vector
 !  length equal to the block size, so the block size should
@@ -35,11 +35,11 @@
       include 'blk_par.h'
 !      integer fftblock_default, fftblockpad_default
 !      parameter (fftblock_default=32, fftblockpad_default=34)
-      
+
       integer fftblock, fftblockpad
 
 ! we need a bunch of logic to keep track of how
-! arrays are laid out. 
+! arrays are laid out.
 
 
 ! Note: this serial version is the derived from the parallel 0D case
@@ -61,29 +61,15 @@
 ! compute residual(1)
 
 ! for the 0D, 1D, 2D strategies, the layouts look like xxx
-!        
+!
 !            0D        1D        2D
 ! 1:        xyz       xyz       xyz
 
 ! the array dimensions are stored in dims(coord, phase)
       integer dims(3)
 
-      integer T_total, T_setup, T_fft, T_evolve, T_checksum,  &
-     &        T_fftx, T_ffty,  &
-     &        T_fftz, T_max
-      parameter (T_total = 1, T_setup = 2, T_fft = 3,  &
-     &           T_evolve = 4, T_checksum = 5,  &
-     &           T_fftx = 6,  &
-     &           T_ffty = 7,  &
-     &           T_fftz = 8, T_max = 8)
-
-
-
-      logical timers_enabled
 
 
-      external timer_read
-      double precision timer_read
       external ilog2
       integer ilog2
 
@@ -125,14 +111,14 @@
       module ft_fields
 
 !---------------------------------------------------------------------
-! u0, u1, u2 are the main arrays in the problem. 
-! Depending on the decomposition, these arrays will have different 
-! dimensions. To accomodate all possibilities, we allocate them as 
-! one-dimensional arrays and pass them to subroutines for different 
+! u0, u1, u2 are the main arrays in the problem.
+! Depending on the decomposition, these arrays will have different
+! dimensions. To accommodate all possibilities, we allocate them as
+! one-dimensional arrays and pass them to subroutines for different
 ! views
 !  - u0 contains the initial (transformed) initial condition
 !  - u1 and u2 are working arrays
-!  - twiddle contains exponents for the time evolution operator. 
+!  - twiddle contains exponents for the time evolution operator.
 !---------------------------------------------------------------------
 
       double complex, allocatable ::  &
@@ -143,7 +129,7 @@
 !---------------------------------------------------------------------
 ! Large arrays are in module so that they are allocated on the
 ! heap rather than the stack. This module is not
-! referenced directly anywhere else. Padding is to avoid accidental 
+! referenced directly anywhere else. Padding is to avoid accidental
 ! cache problems, since all array sizes are powers of two.
 !---------------------------------------------------------------------
 
diff --git a/src/benchmarks/nas-ft/fortran/meson.build b/src/benchmarks/nas-ft/fortran/meson.build
index e3e613c4..add5abc1 100644
--- a/src/benchmarks/nas-ft/fortran/meson.build
+++ b/src/benchmarks/nas-ft/fortran/meson.build
@@ -2,7 +2,6 @@ sources = [
 	'ft.f90',
 	'ft_data.f90',
 	'print_results.f90',
-	'timers.f90',
 	'randi8.f90',
 ]
 
diff --git a/src/benchmarks/nas-ft/fortran/print_results.f90 b/src/benchmarks/nas-ft/fortran/print_results.f90
index f6be545c..dc168bcd 100644
--- a/src/benchmarks/nas-ft/fortran/print_results.f90
+++ b/src/benchmarks/nas-ft/fortran/print_results.f90
@@ -1,13 +1,12 @@
 
       subroutine print_results(name, class, n1, n2, n3, niter,  &
-     &               t, mops, optype, verified, npbversion,  &
+     &               optype, verified, npbversion,  &
      &               compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
-      
+
       implicit none
       character(len=*) name
       character class
       integer   n1, n2, n3, niter, j
-      double precision t, mops
       character optype*24, size*15
       logical   verified
       character(len=*) npbversion, compiletime,  &
@@ -58,29 +57,20 @@
 
          write (*, 5) niter
  5       format(' Iterations      = ', 12x, i12)
-         
-         write (*, 6) t
- 6       format(' Time in seconds = ',12x, f12.2)
 
          if (num_threads .gt. 0) write (*,7) num_threads
  7       format(' Total threads   = ', 12x, i12)
-         
+
          if (max_threads .gt. 0) write (*,8) max_threads
  8       format(' Avail threads   = ', 12x, i12)
 
-         if (num_threads .ne. max_threads) write (*,88) 
+         if (num_threads .ne. max_threads) write (*,88)
  88      format(' Warning: Threads used differ from threads available')
 
-         write (*,9) mops
- 9       format(' Mop/s total     = ',12x, f12.2)
-
-         if (num_threads .gt. 0) write (*,10) mops/float( num_threads )
- 10      format(' Mop/s/thread    = ', 12x, f12.2)        
-
          write(*, 11) optype
  11      format(' Operation type  = ', a24)
 
-         if (verified) then 
+         if (verified) then
             write(*,12) '  SUCCESSFUL'
          else
             write(*,12) 'UNSUCCESSFUL'
@@ -115,7 +105,7 @@
 
          write(*, 127) cs7
  127     format('    RAND         = ', A)
-        
+
          write (*,130)
  130     format(//' Please send all errors/feedbacks to:'//  &
      &            ' NPB Development Team'/  &
diff --git a/src/benchmarks/nas-ft/fortran/randi8.f90 b/src/benchmarks/nas-ft/fortran/randi8.f90
index f8932eda..102bcaa3 100644
--- a/src/benchmarks/nas-ft/fortran/randi8.f90
+++ b/src/benchmarks/nas-ft/fortran/randi8.f90
@@ -47,7 +47,7 @@
 
 ! This doesn't work, because the compiler does the calculation in 32
 ! bits and overflows. No standard way (without f90 stuff) to specify
-! that the rhs should be done in 64 bit arithmetic. 
+! that the rhs should be done in 64 bit arithmetic.
 !      parameter(i246m1=2**46-1)
 
       parameter(d2m46=0.5d0**46)
diff --git a/src/benchmarks/nas-ft/fortran/timers.f90 b/src/benchmarks/nas-ft/fortran/timers.f90
deleted file mode 100644
index 3a50de94..00000000
--- a/src/benchmarks/nas-ft/fortran/timers.f90
+++ /dev/null
@@ -1,171 +0,0 @@
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-      
-      module timers
-
-      double precision start(64), elapsed(64)
-!$omp threadprivate(start, elapsed)
-
-      double precision, external :: elapsed_time
-
-      end module timers
-
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-      
-      subroutine timer_clear(n)
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      use timers
-      implicit none
-
-      integer n
-
-      elapsed(n) = 0.0
-      return
-      end
-
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      subroutine timer_start(n)
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      use timers
-      implicit none
-
-      integer n
-
-      start(n) = elapsed_time()
-
-      return
-      end
-      
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      subroutine timer_stop(n)
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      use timers
-      implicit none
-
-      integer n
-
-      double precision t, now
-
-      now = elapsed_time()
-      t = now - start(n)
-      elapsed(n) = elapsed(n) + t
-
-      return
-      end
-
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      double precision function timer_read(n)
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      use timers
-      implicit none
-
-      integer n
-      
-      timer_read = elapsed(n)
-
-      return
-      end
-
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      double precision function elapsed_time()
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      implicit none
-!$    external         omp_get_wtime
-!$    double precision omp_get_wtime
-
-      double precision t
-      logical          mp
-
-! ... Use the OpenMP timer if we can (via C$ conditional compilation)
-      mp = .false.
-!$    mp = .true.
-!$    t = omp_get_wtime()
-
-      if (.not.mp) then
-! This function must measure wall clock time, not CPU time. 
-! Since there is no portable timer in Fortran (77)
-! we call a routine compiled in C (though the C source may have
-! to be tweaked). 
-         call wtime(t)
-! The following is not ok for "official" results because it reports
-! CPU time not wall clock time. It may be useful for developing/testing
-! on timeshared Crays, though. 
-!        call second(t)
-      endif
-
-      elapsed_time = t
-
-      return
-      end
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      subroutine check_timer_flag( timeron )
-
-!---------------------------------------------------------------------
-!---------------------------------------------------------------------
-
-      implicit none
-      logical timeron
-
-      integer nc, ios
-      character(len=20) val
-
-      timeron = .false.
-
-! ... Check environment variable "NPB_TIMER_FLAG"
-      call get_environment_variable('NPB_TIMER_FLAG', val, nc, ios)
-      if (ios .eq. 0) then
-         if (nc .le. 0) then
-            timeron = .true.
-         else if (val(1:1) .ge. '1' .and. val(1:1) .le. '9') then
-            timeron = .true.
-         else if (val .eq. 'on' .or. val .eq. 'ON' .or.  &
-     &            val .eq. 'yes' .or. val .eq. 'YES' .or.  &
-     &            val .eq. 'true' .or. val .eq. 'TRUE') then
-            timeron = .true.
-         endif
-
-      else
-
-! ... Check if the "timer.flag" file exists
-         open (unit=2, file='timer.flag', status='old', iostat=ios)
-         if (ios .eq. 0) then
-            close(2)
-            timeron = .true.
-         endif
-
-      endif
-
-      return
-      end
-- 
GitLab