Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
!-------------------------------------------------------------------------!
! !
! N A S P A R A L L E L B E N C H M A R K S 3.4 !
! !
! O p e n M P V E R S I O N !
! !
! F T !
! !
!-------------------------------------------------------------------------!
! !
! This benchmark is an OpenMP version of the NPB FT code. !
! It is described in NAS Technical Report 99-011. !
! !
! Permission to use, copy, distribute and modify this software !
! for any purpose with or without fee is hereby granted. We !
! request, however, that all derived work reference the NAS !
! Parallel Benchmarks 3.4. This software is provided "as is" !
! without express or implied warranty. !
! !
! Information on NPB 3.4, including the technical report, the !
! original specifications, source code, results and information !
! on how to submit new results, is available at: !
! !
! http://www.nas.nasa.gov/Software/NPB/ !
! !
! Send comments or suggestions to npb@nas.nasa.gov !
! !
! NAS Parallel Benchmarks Group !
! NASA Ames Research Center !
! Mail Stop: T27A-1 !
! Moffett Field, CA 94035-1000 !
! !
! E-mail: npb@nas.nasa.gov !
! Fax: (650) 604-3957 !
! !
!-------------------------------------------------------------------------!
!---------------------------------------------------------------------
!
! Authors: D. Bailey
! W. Saphir
! H. Jin
!
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! FT benchmark
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
program ft
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! Module ft_fields defines main arrays (u0, u1, u2) in the problem
!---------------------------------------------------------------------
use ft_data
use ft_fields
implicit none
integer iter
!---------------------------------------------------------------------
! Run the entire problem once to make sure all data is touched.
! This reduces variable startup costs, which is important for such a
! short benchmark. The other NPB 2 implementations are similar.
!---------------------------------------------------------------------
call alloc_space
call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3))
call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
call fft_init (dims(1))
call fft(1, u1, u0)
!---------------------------------------------------------------------
! Start over from the beginning. Note that all operations must
! be timed, in contrast to other benchmarks.
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
!---------------------------------------------------------------------
call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
call fft_init (dims(1))
call fft(1, u1, u0)
do iter = 1, niter
call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3))
! call fft(-1, u1, u2)
call fft(-1, u1, u1)
! call checksum(iter, u2, dims(1), dims(2), dims(3))
call checksum(iter, u1, dims(1), dims(2), dims(3))
end do
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine init_ui(u0, u1, twiddle, d1, d2, d3)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! touch all the big data
!---------------------------------------------------------------------
implicit none
integer d1, d2, d3
double complex u0(d1+1,d2,d3)
double complex u1(d1+1,d2,d3)
double precision twiddle(d1+1,d2,d3)
integer i, j, k
!$omp parallel do default(shared) private(i,j,k) collapse(2)
do k = 1, d3
do j = 1, d2
do i = 1, d1
u0(i,j,k) = 0.d0
u1(i,j,k) = 0.d0
twiddle(i,j,k) = 0.d0
end do
end do
end do
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine evolve(u0, u1, twiddle, d1, d2, d3)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! evolve u0 -> u1 (t time steps) in fourier space
!---------------------------------------------------------------------
use ft_data
implicit none
integer d1, d2, d3
double complex u0(d1+1,d2,d3)
double complex u1(d1+1,d2,d3)
double precision twiddle(d1+1,d2,d3)
integer i, j, k
!$omp parallel do default(shared) private(i,j,k) collapse(2)
do k = 1, d3
do j = 1, d2
do i = 1, d1
u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k)
u1(i,j,k) = u0(i,j,k)
end do
end do
end do
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine compute_initial_conditions(u0, d1, d2, d3)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! Fill in array u0 with initial conditions from
! random number generator
!---------------------------------------------------------------------
use ft_data
implicit none
integer d1, d2, d3
double complex u0(d1+1, d2, d3)
call libc_srand48(seed)
do k = 1, dims(3)
do j = 1, dims(2)
u0(i, j, k) = libc_drand48()
end do
end do
end do
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine setup
!---------------------------------------------------------------------
!---------------------------------------------------------------------
use ft_data
character(len=32) :: arg
if (command_argument_count() .ne. 4) then
error stop "Invalid number of arguments"
end if
call get_command_argument(1, arg)
read(arg, *) dims(1)
call get_command_argument(2, arg)
read(arg, *) dims(2)
call get_command_argument(3, arg)
read(arg, *) dims(3)
call get_command_argument(4, arg)
read(arg, *) niter
nxp = dims(1) + 1
ntotalp = nxp * dims(2) * dims(3)
maxdim = max(dims(1), max(dims(2), dims(3)))
debug = .FALSE.
write(*, 1000)
write(*, 1001) dims(1), dims(2), dims(3)
write(*, 1002) niter
!$ write(*, 1003) omp_get_max_threads()
write(*, *)
1000 format(//, ' NAS Parallel Benchmarks (NPB3.4-OMP) - FT Benchmark', /)
1001 format(' Size : ', i0, 'x', i0, 'x', i0)
1002 format(' Iterations : ', i0)
1003 format(' Number of available threads : ', i0)
!---------------------------------------------------------------------
! Set up info for blocking of ffts and transposes. This improves
! performance on cache-based systems. Blocking involves
! working on a chunk of the problem at a time, taking chunks
! along the first, second, or third dimension.
!
! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
! Since 1st dim is always in processor, we'll assume it's long enough
! (default blocking factor is 16 so min size for 1st dim is 16)
! The only case we have to worry about is cffts1 in a 2d decomposition.
! so the blocking factor should not be larger than the 2nd dimension.
!---------------------------------------------------------------------
fftblock = fftblock_default
fftblockpad = fftblockpad_default
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine compute_indexmap(twiddle, d1, d2, d3)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
! for time evolution exponent.
!---------------------------------------------------------------------
use ft_data
implicit none
integer d1, d2, d3
double precision twiddle(d1+1, d2, d3)
integer i, j, k, kk, kk2, jj, kj2, ii
double precision ap
!---------------------------------------------------------------------
! basically we want to convert the fortran indices
! 1 2 3 4 5 6 7 8
! to
! 0 1 2 3 -4 -3 -2 -1
! The following magic formula does the trick:
! mod(i-1+n/2, n) - n/2
!---------------------------------------------------------------------
ap = - 4.d0 * alpha * pi *pi
!$omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii) &
!$omp& collapse(2)
do k = 1, dims(3)
do j = 1, dims(2)
kk = mod(k-1+dims(3)/2, dims(3)) - dims(3)/2
jj = mod(j-1+dims(2)/2, dims(2)) - dims(2)/2
kj2 = jj*jj+kk2
do i = 1, dims(1)
ii = mod(i-1+dims(1)/2, dims(1)) - dims(1)/2
twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2))
end do
end do
end do
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine fft(dir, x1, x2)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
use ft_data
implicit none
integer dir
double complex x1(ntotalp), x2(ntotalp)
double complex y1(fftblockpad*maxdim), &
& y2(fftblockpad*maxdim)
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
!---------------------------------------------------------------------
! note: args x1, x2 must be different arrays
! note: args for cfftsx are (direction, layout, xin, xout, scratch)
! xin/xout may be the same and it can be somewhat faster
! if they are
!---------------------------------------------------------------------
if (dir .eq. 1) then
call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
else
call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
endif
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
use ft_data
implicit none
integer is, d1, d2, d3, logd1
double complex x(d1+1,d2,d3)
double complex xout(d1+1,d2,d3)
double complex y1(fftblockpad, d1), y2(fftblockpad, d1)
integer i, j, k, jj, jn
logd1 = ilog2(d1)
!$omp parallel do default(shared) private(i,j,k,jj,y1,y2,jn) &
!$omp& shared(is,logd1,d1) collapse(2)
do k = 1, d3
do jn = 0, d2/fftblock - 1
! do jj = 0, d2 - fftblock, fftblock
jj = jn*fftblock
do j = 1, fftblock
do i = 1, d1
y1(j,i) = x(i,j+jj,k)
enddo
enddo
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
call cfftz (is, logd1, d1, y1, y2)
do j = 1, fftblock
do i = 1, d1
xout(i,j+jj,k) = y1(j,i)
enddo
enddo
enddo
enddo
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
use ft_data
implicit none
integer is, d1, d2, d3, logd2
double complex x(d1+1,d2,d3)
double complex xout(d1+1,d2,d3)
double complex y1(fftblockpad, d2), y2(fftblockpad, d2)
integer i, j, k, ii, in
logd2 = ilog2(d2)
!$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in) &
!$omp& shared(is,logd2,d2) collapse(2)
do k = 1, d3
do in = 0, d1/fftblock - 1
! do ii = 0, d1 - fftblock, fftblock
ii = in*fftblock
do j = 1, d2
do i = 1, fftblock
y1(i,j) = x(i+ii,j,k)
enddo
enddo
call cfftz (is, logd2, d2, y1, y2)
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
do j = 1, d2
do i = 1, fftblock
xout(i+ii,j,k) = y1(i,j)
enddo
enddo
enddo
enddo
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
use ft_data
implicit none
integer is, d1, d2, d3, logd3
double complex x(d1+1,d2,d3)
double complex xout(d1+1,d2,d3)
double complex y1(fftblockpad, d3), y2(fftblockpad, d3)
integer i, j, k, ii, in
logd3 = ilog2(d3)
!$omp parallel do default(shared) private(i,j,k,ii,y1,y2,in) &
!$omp& shared(is) collapse(2)
do j = 1, d2
do in = 0, d1/fftblock - 1
! do ii = 0, d1 - fftblock, fftblock
ii = in*fftblock
do k = 1, d3
do i = 1, fftblock
y1(i,k) = x(i+ii,j,k)
enddo
enddo
call cfftz (is, logd3, d3, y1, y2)
do k = 1, d3
do i = 1, fftblock
xout(i+ii,j,k) = y1(i,k)
enddo
enddo
enddo
enddo
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine fft_init (n)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! compute the roots-of-unity array that will be used for subsequent FFTs.
!---------------------------------------------------------------------
use ft_data
implicit none
integer m,n,nu,ku,i,j,ln
double precision t, ti
!---------------------------------------------------------------------
! Initialize the U array with sines and cosines in a manner that permits
! stride one access at each FFT iteration.
!---------------------------------------------------------------------
nu = n
m = ilog2(n)
u(1) = m
ku = 2
ln = 1
do j = 1, m
t = pi / ln
do i = 0, ln - 1
ti = i * t
u(i+ku) = dcmplx (cos (ti), sin(ti))
enddo
ku = ku + ln
ln = 2 * ln
enddo
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine cfftz (is, m, n, x, y)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! Computes NY N-point complex-to-complex FFTs of X using an algorithm due
! to Swarztrauber. X is both the input and the output array, while Y is a
! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to
! perform FFTs, the array U must be initialized by calling CFFTZ with IS
! set to 0 and M set to MX, where MX is the maximum value of M for any
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
! subsequent call.
!---------------------------------------------------------------------
use ft_data
implicit none
integer is,m,n,i,j,l,mx
double complex x, y
dimension x(fftblockpad,n), y(fftblockpad,n)
!---------------------------------------------------------------------
! Check if input parameters are invalid.
!---------------------------------------------------------------------
mx = u(1)
if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx) &
& then
write (*, 1) is, m, mx
1 format ('CFFTZ: Either U has not been initialized, or else'/ &
& 'one of the input parameters is invalid', 3I5)
stop
endif
!---------------------------------------------------------------------
! Perform one variant of the Stockham FFT.
!---------------------------------------------------------------------
do l = 1, m, 2
call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
if (l .eq. m) goto 160
call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
enddo
goto 180
!---------------------------------------------------------------------
! Copy Y to X.
!---------------------------------------------------------------------
160 do j = 1, n
do i = 1, fftblock
x(i,j) = y(i,j)
enddo
enddo
180 continue
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
! Performs the L-th iteration of the second variant of the Stockham FFT.
!---------------------------------------------------------------------
implicit none
integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
double complex u,x,y,u1,x11,x21
dimension u(n), x(ny1,n), y(ny1,n)
!---------------------------------------------------------------------
! Set initial parameters.
!---------------------------------------------------------------------
n1 = n / 2
lk = 2 ** (l - 1)
li = 2 ** (m - l)
lj = 2 * lk
ku = li + 1
do i = 0, li - 1
i11 = i * lk + 1
i12 = i11 + n1
i21 = i * lj + 1
i22 = i21 + lk
if (is .ge. 1) then
u1 = u(ku+i)
else
u1 = dconjg (u(ku+i))
endif
!---------------------------------------------------------------------
! This loop is vectorizable.
!---------------------------------------------------------------------
do k = 0, lk - 1
do j = 1, ny
x11 = x(j,i11+k)
x21 = x(j,i12+k)
y(j,i21+k) = x11 + x21
y(j,i22+k) = u1 * (x11 - x21)
enddo
enddo
enddo
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
!---------------------------------------------------------------------
integer function ilog2(n)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
implicit none
integer n, nn, lg
if (n .eq. 1) then
ilog2=0
return
endif
lg = 1
nn = 2
do while (nn .lt. n)
nn = nn*2
lg = lg+1
end do
ilog2 = lg
return
end
!---------------------------------------------------------------------
!---------------------------------------------------------------------
subroutine checksum(i, u1, d1, d2, d3)
!---------------------------------------------------------------------
!---------------------------------------------------------------------
use ft_data
implicit none
integer i, d1, d2, d3
double complex u1(d1+1,d2,d3)
integer j, q,r,s
double complex chk, local
!$omp parallel default(shared) private(i,q,r,s,local)
local = (0.0,0.0)
!$omp do
q = mod(j, dims(1))+1
r = mod(3*j,dims(2))+1
s = mod(5*j,dims(3))+1
!$omp critical
chk=chk+local
!$omp end critical
!$omp end parallel
chk = chk/dble(dims(1) * dims(2) * dims(3))
write (*, 30) i, REAL(chk), IMAG(chk)
30 format (' T = ', i0, ' Checksum = ', es0.10, " ", es0.10)