#include <complex> #include <fmt/core.h> #include <omp.h> #include <unsupported/Eigen/CXX11/Tensor> using namespace std::complex_literals; class FFT { public: // If processor array is 1x1 -> 0D grid decomposition // Cache blocking params. These values are good for most RISC processors. // FFT parameters: // fftblock controls how many ffts are done at a time. // The default is appropriate for most cache-based machines // On vector machines, the FFT can be vectorized with vector // length equal to the block size, so the block size should // be as large as possible. This is the size of the smallest // dimension of the problem: 128 for class A, 256 for class B and // 512 for class C. constexpr static int FFTBLOCK_DEFAULT = 32; constexpr static int FFTBLOCKPAD_DEFAULT = FFTBLOCK_DEFAULT + 2; // other stuff constexpr static int SEED = 314159265; constexpr static double ALPHA = 1E-6; private: bool m_debug; int m_niter; int m_fftblock; int m_fftblockpad; int m_nx; int m_ny; int m_nz; int m_nxp; int m_maxdim; Eigen::Tensor<std::complex<double>, 1> m_u; Eigen::Tensor<std::complex<double>, 3> m_u0; Eigen::Tensor<std::complex<double>, 3> m_u1; Eigen::Tensor<double, 3> m_twiddle; public: FFT(int nx, int ny, int nz, int iterations) : m_nx {nx}, m_ny {ny}, m_nz {nz}, m_nxp {nx + 1}, m_maxdim {std::max(nx, std::max(ny, nz))}, m_u {m_nxp}, m_u0 {m_nxp, ny, nz}, m_u1 {m_nxp, ny, nz}, m_twiddle {m_nxp, ny, nz} { m_debug = false; m_niter = iterations; fmt::println(""); fmt::println(""); fmt::println(" NAS Parallel Benchmarks (NPB3.4-OMP) - FT Benchmark"); fmt::println(""); fmt::println(" Size : {}x{}x{}", m_nx, m_ny, m_nz); fmt::println(" Iterations : {}", m_niter); fmt::println(" Number of available threads : {}", omp_get_max_threads()); fmt::println(""); // --------------------------------------------------------------------- // Set up info for blocking of ffts and transposes. This improves // performance on cache-based systems. Blocking involves // working on a chunk of the problem at a time, taking chunks // along the first, second, or third dimension. // // - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) // - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) // // Since 1st dim is always in processor, we'll assume it's long enough // (default blocking factor is 16 so min size for 1st dim is 16) // The only case we have to worry about is cffts1 in a 2d decomposition. // so the blocking factor should not be larger than the 2nd dimension. // --------------------------------------------------------------------- m_fftblock = FFTBLOCK_DEFAULT; m_fftblockpad = FFTBLOCKPAD_DEFAULT; this->init_ui(); } void run() { // --------------------------------------------------------------------- // Run the entire problem once to make sure all data is touched. // This reduces variable startup costs, which is important for such a // short benchmark. The other NPB 2 implementations are similar. // --------------------------------------------------------------------- this->compute_indexmap(); this->compute_initial_conditions(); this->fft_init(); this->fft(1, m_u1, m_u0); // --------------------------------------------------------------------- // Start over from the beginning. Note that all operations must // be timed, in contrast to other benchmarks. // --------------------------------------------------------------------- this->compute_indexmap(); this->compute_initial_conditions(); this->fft_init(); this->fft(1, m_u1, m_u0); for (int iter = 1; iter <= m_niter; iter++) { this->evolve(); this->fft(-1, m_u1, m_u1); this->checksum(iter); } } private: void init_ui() { // --------------------------------------------------------------------- // touch all the big data // --------------------------------------------------------------------- const Eigen::Index d1 = m_twiddle.dimension(0) - 1; const Eigen::Index d2 = m_twiddle.dimension(1); const Eigen::Index d3 = m_twiddle.dimension(2); // clang-format off #pragma omp parallel for collapse(2) // clang-format on for (int k = 0; k < d3; k++) { for (int j = 0; j < d2; j++) { for (int i = 0; i < d1; i++) { m_u0(i, j, k) = 0.0 + 0i; m_u1(i, j, k) = 0.0 + 0i; m_twiddle(i, j, k) = 0; } } } } // --------------------------------------------------------------------- // compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 // for time evolution exponent. // --------------------------------------------------------------------- void compute_indexmap() { constexpr double AP = -4 * ALPHA * M_PI * M_PI; // --------------------------------------------------------------------- // basically we want to convert the fortran indices // 1 2 3 4 5 6 7 8 // to // 0 1 2 3 -4 -3 -2 -1 // The following magic formula does the trick: // mod(i-1+n/2, n) - n/2 // --------------------------------------------------------------------- const Eigen::Index d1 = m_twiddle.dimension(0) - 1; const Eigen::Index d2 = m_twiddle.dimension(1); const Eigen::Index d3 = m_twiddle.dimension(2); // clang-format off #pragma omp parallel for collapse(2) // clang-format on for (Eigen::Index k = 0; k < d3; k++) { for (Eigen::Index j = 0; j < d2; j++) { const Eigen::Index kk = (k + d3 / 2) % d3 - d3 / 2; const Eigen::Index kk2 = kk * kk; const Eigen::Index jj = (j + d2 / 2) % d2 - d2 / 2; const Eigen::Index kj2 = jj * jj + kk2; for (Eigen::Index i = 0; i < d1; i++) { const int ii = (i + d1 / 2) % d1 - d1 / 2; m_twiddle(i, j, k) = exp(AP * (ii * ii + kj2)); } } } } void compute_initial_conditions() { srand48(SEED); const Eigen::Index d1 = m_u1.dimension(0); const Eigen::Index d2 = m_u1.dimension(1); const Eigen::Index d3 = m_u1.dimension(2); for (int k = 0; k < d3; k++) { for (int j = 0; j < d2; j++) { for (int i = 0; i < d1; i++) m_u1(i, j, k) = drand48(); } } } void evolve() { // --------------------------------------------------------------------- // evolve u0 -> u1 (t time steps) in fourier space // --------------------------------------------------------------------- const Eigen::Index d1 = m_u1.dimension(0) - 1; const Eigen::Index d2 = m_u1.dimension(1); const Eigen::Index d3 = m_u1.dimension(2); // clang-format off #pragma omp parallel for collapse(2) // clang-format on for (Eigen::Index k = 0; k < d3; k++) { for (Eigen::Index j = 0; j < d2; j++) { for (Eigen::Index i = 0; i < d1; i++) { m_u0(i, j, k) *= m_twiddle(i, j, k); m_u1(i, j, k) = m_u0(i, j, k); } } } } // --------------------------------------------------------------------- // compute the roots-of-unity array that will be used for subsequent FFTs. // --------------------------------------------------------------------- void fft_init() { // --------------------------------------------------------------------- // Initialize the U array with sines and cosines in a manner that permits // stride one access at each FFT iteration. // --------------------------------------------------------------------- const Eigen::Index m = ilog2(m_u.dimension(0) - 1); m_u(0) = m; Eigen::Index ku = 1; Eigen::Index ln = 1; for (Eigen::Index j = 0; j < m; j++) { const double t = M_PI / ln; for (Eigen::Index i = 0; i < ln; i++) { const double ti = i * t; m_u(i + ku) = cos(ti) + 1i * sin(ti); } ku += ln; ln *= 2; } } void fft(const int dir, Eigen::Tensor<std::complex<double>, 3> &x1, Eigen::Tensor<std::complex<double>, 3> &x2) { Eigen::Tensor<std::complex<double>, 2> y1 {m_fftblockpad, m_maxdim}; Eigen::Tensor<std::complex<double>, 2> y2 {m_fftblockpad, m_maxdim}; // --------------------------------------------------------------------- // note: args x1, x2 must be different arrays // note: args for cfftsx are (direction, layout, xin, xout, scratch) // xin/xout may be the same and it can be somewhat faster // if they are // --------------------------------------------------------------------- if (dir == 1) { cffts1(1, x1, x1, y1, y2); cffts2(1, x1, x1, y1, y2); cffts3(1, x1, x2, y1, y2); } else { cffts3(-1, x1, x1, y1, y2); cffts2(-1, x1, x1, y1, y2); cffts1(-1, x1, x2, y1, y2); } } void cffts1(const int is, const Eigen::Tensor<std::complex<double>, 3> &x, Eigen::Tensor<std::complex<double>, 3> &xout, Eigen::Tensor<std::complex<double>, 2> &y1, Eigen::Tensor<std::complex<double>, 2> &y2) { const Eigen::Index d1 = x.dimension(0) - 1; const Eigen::Index d2 = x.dimension(1); const Eigen::Index d3 = x.dimension(2); const Eigen::Index logd1 = ilog2(d1); // clang-format off #pragma omp parallel for default(shared) firstprivate(y1, y2) collapse(2) // clang-format on for (Eigen::Index k = 0; k < d3; k++) { for (Eigen::Index jn = 0; jn < d2 / m_fftblock; jn++) { const Eigen::Index jj = jn * m_fftblock; for (Eigen::Index j = 0; j < m_fftblock; j++) { for (Eigen::Index i = 0; i < d1; i++) { y1(j, i) = x(i, j + jj, k); } } cfftz(is, logd1, d1, y1, y2); for (Eigen::Index j = 0; j < m_fftblock; j++) { for (Eigen::Index i = 0; i < d1; i++) { xout(i, j + jj, k) = y1(j, i); } } } } } void cffts2(const int is, const Eigen::Tensor<std::complex<double>, 3> &x, Eigen::Tensor<std::complex<double>, 3> &xout, Eigen::Tensor<std::complex<double>, 2> &y1, Eigen::Tensor<std::complex<double>, 2> &y2) { const Eigen::Index d1 = x.dimension(0) - 1; const Eigen::Index d2 = x.dimension(1); const Eigen::Index d3 = x.dimension(2); const Eigen::Index logd1 = ilog2(d2); // clang-format off #pragma omp parallel for default(shared) firstprivate(y1, y2) collapse(2) // clang-format on for (Eigen::Index k = 0; k < d3; k++) { for (Eigen::Index in = 0; in < d1 / m_fftblock; in++) { const Eigen::Index ii = in * m_fftblock; for (Eigen::Index j = 0; j < d2; j++) { for (Eigen::Index i = 0; i < m_fftblock; i++) { y1(i, j) = x(i + ii, j, k); } } cfftz(is, logd1, d1, y1, y2); for (Eigen::Index j = 0; j < d2; j++) { for (Eigen::Index i = 0; i < m_fftblock; i++) { xout(i + ii, j, k) = y1(i, j); } } } } } void cffts3(const int is, const Eigen::Tensor<std::complex<double>, 3> &x, Eigen::Tensor<std::complex<double>, 3> &xout, Eigen::Tensor<std::complex<double>, 2> &y1, Eigen::Tensor<std::complex<double>, 2> &y2) { const Eigen::Index d1 = x.dimension(0) - 1; const Eigen::Index d2 = x.dimension(1); const Eigen::Index d3 = x.dimension(2); const Eigen::Index logd1 = ilog2(d3); // clang-format off #pragma omp parallel for default(shared) firstprivate(y1, y2) collapse(2) // clang-format on for (Eigen::Index j = 0; j < d2; j++) { for (Eigen::Index in = 0; in < d1 / m_fftblock; in++) { const Eigen::Index ii = in * m_fftblock; for (Eigen::Index k = 0; k < d3; k++) { for (Eigen::Index i = 0; i < m_fftblock; i++) { y1(i, k) = x(i + ii, j, k); } } cfftz(is, logd1, d1, y1, y2); for (Eigen::Index k = 0; k < d3; k++) { for (Eigen::Index i = 0; i < m_fftblock; i++) { xout(i + ii, j, k) = y1(i, k); } } } } } void cfftz(const int is, const Eigen::Index m, const Eigen::Index n, Eigen::Tensor<std::complex<double>, 2> &x, Eigen::Tensor<std::complex<double>, 2> &y) { // --------------------------------------------------------------------- // Computes NY N-point complex-to-complex FFTs of X using an algorithm due // to Swarztrauber. X is both the input and the output array, while Y is a // scratch array. It is assumed that N = 2^M. Before calling CFFTZ to // perform FFTs, the array U must be initialized by calling CFFTZ with IS // set to 0 and M set to MX, where MX is the maximum value of M for any // subsequent call. // --------------------------------------------------------------------- // --------------------------------------------------------------------- // Check if input parameters are invalid. // --------------------------------------------------------------------- const int mx = m_u(0).real(); if ((is != 1 && is != -1) || m < 1 || m > mx) { // clang-format off throw std::runtime_error { fmt::format("CFFTZ: Either U has not been initialized, or else one " "of the input parameters is invalid {} {} {}", is, m, mx)}; // clang-format on } // --------------------------------------------------------------------- // Perform one variant of the Stockham FFT. // --------------------------------------------------------------------- for (Eigen::Index l = 1; l <= m; l += 2) { fftz2(is, l, m, n, m_fftblock, x, y); if (l == m) { // --------------------------------------------------------------------- // Copy Y to X. // --------------------------------------------------------------------- for (Eigen::Index j = 0; j < n; j++) { for (Eigen::Index i = 0; i < m_fftblock; i++) x(i, j) = y(i, j); } continue; } fftz2(is, l + 1, m, n, m_fftblock, y, x); } } void fftz2(const int is, const Eigen::Index l, const Eigen::Index m, const Eigen::Index n, const Eigen::Index ny, Eigen::Tensor<std::complex<double>, 2> &x, Eigen::Tensor<std::complex<double>, 2> &y) { // --------------------------------------------------------------------- // Performs the L-th iteration of the second variant of the Stockham FFT. // -------------------------------------------------------------------- const Eigen::Index n1 = n / 2; const Eigen::Index lk = 1 << (l - 1); const Eigen::Index li = 1 << (m - l); const Eigen::Index lj = 2 * lk; for (Eigen::Index i = 0; i < li; i++) { const Eigen::Index i11 = i * lk; const Eigen::Index i12 = i11 + n1; const Eigen::Index i21 = i * lj; const Eigen::Index i22 = i21 + lk; std::complex<double> u1; if (is >= 1) u1 = m_u(li + i); else u1 = conj(m_u(li + i)); // --------------------------------------------------------------------- // This loop is vectorizable. // --------------------------------------------------------------------- for (Eigen::Index k = 0; k < lk; k++) { for (Eigen::Index j = 0; j < ny; j++) { const std::complex<double> x11 = x(j, i11 + k); const std::complex<double> x21 = x(j, i12 + k); y(j, i21 + k) = x11 + x21; y(j, i22 + k) = u1 * (x11 - x21); } } } } template <class T> T ilog2(const T n) const { if (n == 1) return 0; T lg = 1; T nn = 2; while (nn < n) { nn *= 2; lg += 1; } return lg; } void checksum(const int i) const { std::complex<double> chk {0, 0}; // clang-format off #pragma omp parallel // clang-format on { std::complex<double> local = 0; // clang-format off #pragma omp for // clang-format on for (Eigen::Index j = 1; j <= 1024; j++) { const Eigen::Index q = j % m_nx; const Eigen::Index r = (3 * j) % m_ny; const Eigen::Index s = (5 * j) % m_nz; local += m_u1(q, r, s); } // clang-format off #pragma omp critical // clang-format on { chk += local; } } chk /= (double)(m_nx * m_ny * m_nz); fmt::println(" T = {} Checksum = {:.10E} {:.10E}", i, chk.real(), chk.imag()); } }; int main(int argc, char **argv) { assert(argc == 5); int nx = atoi(argv[1]); int ny = atoi(argv[2]); int nz = atoi(argv[3]); int it = atoi(argv[4]); FFT {nx, ny, nz, it}.run(); }