#include <complex>
#include <fmt/core.h>
#include <omp.h>
#include <unsupported/Eigen/CXX11/Tensor>

using namespace std::complex_literals;

class FFT {
public:
	// If processor array is 1x1 -> 0D grid decomposition

	// Cache blocking params. These values are good for most RISC processors.
	// FFT parameters:
	//   fftblock controls how many ffts are done at a time.
	//   The default is appropriate for most cache-based machines
	//   On vector machines, the FFT can be vectorized with vector
	//   length equal to the block size, so the block size should
	//   be as large as possible. This is the size of the smallest
	//   dimension of the problem: 128 for class A, 256 for class B and
	//   512 for class C.

	constexpr static int FFTBLOCK_DEFAULT = 32;
	constexpr static int FFTBLOCKPAD_DEFAULT = FFTBLOCK_DEFAULT + 2;

	// other stuff
	constexpr static int SEED = 314159265;
	constexpr static double ALPHA = 1E-6;

private:
	bool m_debug;

	int m_niter;
	int m_fftblock;
	int m_fftblockpad;

	int m_nx;
	int m_ny;
	int m_nz;

	int m_nxp;
	int m_maxdim;

	Eigen::Tensor<std::complex<double>, 1> m_u;
	Eigen::Tensor<std::complex<double>, 3> m_u0;
	Eigen::Tensor<std::complex<double>, 3> m_u1;
	Eigen::Tensor<double, 3> m_twiddle;

public:
	FFT(int nx, int ny, int nz, int iterations)
		: m_nx {nx},
		  m_ny {ny},
		  m_nz {nz},
		  m_nxp {nx + 1},
		  m_maxdim {std::max(nx, std::max(ny, nz))},
		  m_u {m_nxp},
		  m_u0 {m_nxp, ny, nz},
		  m_u1 {m_nxp, ny, nz},
		  m_twiddle {m_nxp, ny, nz}
	{
		m_debug = false;
		m_niter = iterations;

		fmt::println("");
		fmt::println("");
		fmt::println(" NAS Parallel Benchmarks (NPB3.4-OMP) - FT Benchmark");
		fmt::println("");
		fmt::println(" Size                        : {}x{}x{}", m_nx, m_ny, m_nz);
		fmt::println(" Iterations                  : {}", m_niter);
		fmt::println(" Number of available threads : {}", omp_get_max_threads());
		fmt::println("");

		// ---------------------------------------------------------------------
		// Set up info for blocking of ffts and transposes.  This improves
		// performance on cache-based systems. Blocking involves
		// working on a chunk of the problem at a time, taking chunks
		// along the first, second, or third dimension.
		//
		// - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
		// - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
		//
		// Since 1st dim is always in processor, we'll assume it's long enough
		// (default blocking factor is 16 so min size for 1st dim is 16)
		// The only case we have to worry about is cffts1 in a 2d decomposition.
		// so the blocking factor should not be larger than the 2nd dimension.
		// ---------------------------------------------------------------------

		m_fftblock = FFTBLOCK_DEFAULT;
		m_fftblockpad = FFTBLOCKPAD_DEFAULT;

		this->init_ui();
	}

	void run()
	{
		// ---------------------------------------------------------------------
		// Run the entire problem once to make sure all data is touched.
		// This reduces variable startup costs, which is important for such a
		// short benchmark. The other NPB 2 implementations are similar.
		// ---------------------------------------------------------------------
		this->compute_indexmap();
		this->compute_initial_conditions();
		this->fft_init();
		this->fft(1, m_u1, m_u0);

		// ---------------------------------------------------------------------
		// Start over from the beginning. Note that all operations must
		// be timed, in contrast to other benchmarks.
		// ---------------------------------------------------------------------
		this->compute_indexmap();
		this->compute_initial_conditions();
		this->fft_init();
		this->fft(1, m_u1, m_u0);

		for (int iter = 1; iter <= m_niter; iter++) {
			this->evolve();
			this->fft(-1, m_u1, m_u1);
			this->checksum(iter);
		}
	}

private:
	void init_ui()
	{
		// ---------------------------------------------------------------------
		//  touch all the big data
		// ---------------------------------------------------------------------

		const Eigen::Index d1 = m_twiddle.dimension(0) - 1;
		const Eigen::Index d2 = m_twiddle.dimension(1);
		const Eigen::Index d3 = m_twiddle.dimension(2);

		// clang-format off
		#pragma omp parallel for collapse(2)
		// clang-format on
		for (int k = 0; k < d3; k++) {
			for (int j = 0; j < d2; j++) {
				for (int i = 0; i < d1; i++) {
					m_u0(i, j, k) = 0.0 + 0i;
					m_u1(i, j, k) = 0.0 + 0i;
					m_twiddle(i, j, k) = 0;
				}
			}
		}
	}

	// ---------------------------------------------------------------------
	//  compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
	//  for time evolution exponent.
	// ---------------------------------------------------------------------
	void compute_indexmap()
	{
		constexpr double AP = -4 * ALPHA * M_PI * M_PI;

		// ---------------------------------------------------------------------
		// basically we want to convert the fortran indices
		//   1 2 3 4 5 6 7 8
		// to
		//   0 1 2 3 -4 -3 -2 -1
		// The following magic formula does the trick:
		// mod(i-1+n/2, n) - n/2
		// ---------------------------------------------------------------------

		const Eigen::Index d1 = m_twiddle.dimension(0) - 1;
		const Eigen::Index d2 = m_twiddle.dimension(1);
		const Eigen::Index d3 = m_twiddle.dimension(2);

		// clang-format off
		#pragma omp parallel for collapse(2)
		// clang-format on
		for (Eigen::Index k = 0; k < d3; k++) {
			for (Eigen::Index j = 0; j < d2; j++) {
				const Eigen::Index kk = (k + d3 / 2) % d3 - d3 / 2;
				const Eigen::Index kk2 = kk * kk;

				const Eigen::Index jj = (j + d2 / 2) % d2 - d2 / 2;
				const Eigen::Index kj2 = jj * jj + kk2;

				for (Eigen::Index i = 0; i < d1; i++) {
					const int ii = (i + d1 / 2) % d1 - d1 / 2;
					m_twiddle(i, j, k) = exp(AP * (ii * ii + kj2));
				}
			}
		}
	}

	void compute_initial_conditions()
	{
		srand48(SEED);

		const Eigen::Index d1 = m_u1.dimension(0);
		const Eigen::Index d2 = m_u1.dimension(1);
		const Eigen::Index d3 = m_u1.dimension(2);

		for (int k = 0; k < d3; k++) {
			for (int j = 0; j < d2; j++) {
				for (int i = 0; i < d1; i++)
					m_u1(i, j, k) = drand48();
			}
		}
	}

	void evolve()
	{
		// ---------------------------------------------------------------------
		//  evolve u0 -> u1 (t time steps) in fourier space
		// ---------------------------------------------------------------------

		const Eigen::Index d1 = m_u1.dimension(0) - 1;
		const Eigen::Index d2 = m_u1.dimension(1);
		const Eigen::Index d3 = m_u1.dimension(2);

		// clang-format off
		#pragma omp parallel for collapse(2)
		// clang-format on
		for (Eigen::Index k = 0; k < d3; k++) {
			for (Eigen::Index j = 0; j < d2; j++) {
				for (Eigen::Index i = 0; i < d1; i++) {
					m_u0(i, j, k) *= m_twiddle(i, j, k);
					m_u1(i, j, k) = m_u0(i, j, k);
				}
			}
		}
	}

	// ---------------------------------------------------------------------
	//  compute the roots-of-unity array that will be used for subsequent FFTs.
	// ---------------------------------------------------------------------
	void fft_init()
	{
		// ---------------------------------------------------------------------
		//   Initialize the U array with sines and cosines in a manner that permits
		//   stride one access at each FFT iteration.
		// ---------------------------------------------------------------------

		const Eigen::Index m = ilog2(m_u.dimension(0) - 1);
		m_u(0) = m;

		Eigen::Index ku = 1;
		Eigen::Index ln = 1;

		for (Eigen::Index j = 0; j < m; j++) {
			const double t = M_PI / ln;

			for (Eigen::Index i = 0; i < ln; i++) {
				const double ti = i * t;

				m_u(i + ku) = cos(ti) + 1i * sin(ti);
			}

			ku += ln;
			ln *= 2;
		}
	}

	void fft(const int dir,
	         Eigen::Tensor<std::complex<double>, 3> &x1,
	         Eigen::Tensor<std::complex<double>, 3> &x2)
	{
		Eigen::Tensor<std::complex<double>, 2> y1 {m_fftblockpad, m_maxdim};
		Eigen::Tensor<std::complex<double>, 2> y2 {m_fftblockpad, m_maxdim};

		// ---------------------------------------------------------------------
		//  note: args x1, x2 must be different arrays
		//  note: args for cfftsx are (direction, layout, xin, xout, scratch)
		//        xin/xout may be the same and it can be somewhat faster
		//        if they are
		// ---------------------------------------------------------------------

		if (dir == 1) {
			cffts1(1, x1, x1, y1, y2);
			cffts2(1, x1, x1, y1, y2);
			cffts3(1, x1, x2, y1, y2);
		} else {
			cffts3(-1, x1, x1, y1, y2);
			cffts2(-1, x1, x1, y1, y2);
			cffts1(-1, x1, x2, y1, y2);
		}
	}

	void cffts1(const int is,
	            const Eigen::Tensor<std::complex<double>, 3> &x,
	            Eigen::Tensor<std::complex<double>, 3> &xout,
	            Eigen::Tensor<std::complex<double>, 2> &y1,
	            Eigen::Tensor<std::complex<double>, 2> &y2)
	{
		const Eigen::Index d1 = x.dimension(0) - 1;
		const Eigen::Index d2 = x.dimension(1);
		const Eigen::Index d3 = x.dimension(2);

		const Eigen::Index logd1 = ilog2(d1);

		// clang-format off
		#pragma omp parallel for default(shared) firstprivate(y1, y2) collapse(2)
		// clang-format on
		for (Eigen::Index k = 0; k < d3; k++) {
			for (Eigen::Index jn = 0; jn < d2 / m_fftblock; jn++) {
				const Eigen::Index jj = jn * m_fftblock;

				for (Eigen::Index j = 0; j < m_fftblock; j++) {
					for (Eigen::Index i = 0; i < d1; i++) {
						y1(j, i) = x(i, j + jj, k);
					}
				}

				cfftz(is, logd1, d1, y1, y2);

				for (Eigen::Index j = 0; j < m_fftblock; j++) {
					for (Eigen::Index i = 0; i < d1; i++) {
						xout(i, j + jj, k) = y1(j, i);
					}
				}
			}
		}
	}

	void cffts2(const int is,
	            const Eigen::Tensor<std::complex<double>, 3> &x,
	            Eigen::Tensor<std::complex<double>, 3> &xout,
	            Eigen::Tensor<std::complex<double>, 2> &y1,
	            Eigen::Tensor<std::complex<double>, 2> &y2)
	{
		const Eigen::Index d1 = x.dimension(0) - 1;
		const Eigen::Index d2 = x.dimension(1);
		const Eigen::Index d3 = x.dimension(2);

		const Eigen::Index logd1 = ilog2(d2);

		// clang-format off
		#pragma omp parallel for default(shared) firstprivate(y1, y2) collapse(2)
		// clang-format on
		for (Eigen::Index k = 0; k < d3; k++) {
			for (Eigen::Index in = 0; in < d1 / m_fftblock; in++) {
				const Eigen::Index ii = in * m_fftblock;

				for (Eigen::Index j = 0; j < d2; j++) {
					for (Eigen::Index i = 0; i < m_fftblock; i++) {
						y1(i, j) = x(i + ii, j, k);
					}
				}

				cfftz(is, logd1, d1, y1, y2);

				for (Eigen::Index j = 0; j < d2; j++) {
					for (Eigen::Index i = 0; i < m_fftblock; i++) {
						xout(i + ii, j, k) = y1(i, j);
					}
				}
			}
		}
	}

	void cffts3(const int is,
	            const Eigen::Tensor<std::complex<double>, 3> &x,
	            Eigen::Tensor<std::complex<double>, 3> &xout,
	            Eigen::Tensor<std::complex<double>, 2> &y1,
	            Eigen::Tensor<std::complex<double>, 2> &y2)
	{
		const Eigen::Index d1 = x.dimension(0) - 1;
		const Eigen::Index d2 = x.dimension(1);
		const Eigen::Index d3 = x.dimension(2);

		const Eigen::Index logd1 = ilog2(d3);

		// clang-format off
		#pragma omp parallel for default(shared) firstprivate(y1, y2) collapse(2)
		// clang-format on
		for (Eigen::Index j = 0; j < d2; j++) {
			for (Eigen::Index in = 0; in < d1 / m_fftblock; in++) {
				const Eigen::Index ii = in * m_fftblock;

				for (Eigen::Index k = 0; k < d3; k++) {
					for (Eigen::Index i = 0; i < m_fftblock; i++) {
						y1(i, k) = x(i + ii, j, k);
					}
				}

				cfftz(is, logd1, d1, y1, y2);

				for (Eigen::Index k = 0; k < d3; k++) {
					for (Eigen::Index i = 0; i < m_fftblock; i++) {
						xout(i + ii, j, k) = y1(i, k);
					}
				}
			}
		}
	}

	void cfftz(const int is,
	           const Eigen::Index m,
	           const Eigen::Index n,
	           Eigen::Tensor<std::complex<double>, 2> &x,
	           Eigen::Tensor<std::complex<double>, 2> &y)
	{
		// ---------------------------------------------------------------------
		//    Computes NY N-point complex-to-complex FFTs of X using an algorithm due
		//    to Swarztrauber.  X is both the input and the output array, while Y is a
		//    scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
		//    perform FFTs, the array U must be initialized by calling CFFTZ with IS
		//    set to 0 and M set to MX, where MX is the maximum value of M for any
		//    subsequent call.
		// ---------------------------------------------------------------------

		// ---------------------------------------------------------------------
		//    Check if input parameters are invalid.
		// ---------------------------------------------------------------------

		const int mx = m_u(0).real();

		if ((is != 1 && is != -1) || m < 1 || m > mx) {
			// clang-format off
			throw std::runtime_error {
				fmt::format("CFFTZ: Either U has not been initialized, or else one "
			                    "of the input parameters is invalid {} {} {}", is, m, mx)};
			// clang-format on
		}

		// ---------------------------------------------------------------------
		//    Perform one variant of the Stockham FFT.
		// ---------------------------------------------------------------------

		for (Eigen::Index l = 1; l <= m; l += 2) {
			fftz2(is, l, m, n, m_fftblock, x, y);

			if (l == m) {
				// ---------------------------------------------------------------------
				//   Copy Y to X.
				// ---------------------------------------------------------------------

				for (Eigen::Index j = 0; j < n; j++) {
					for (Eigen::Index i = 0; i < m_fftblock; i++)
						x(i, j) = y(i, j);
				}

				continue;
			}

			fftz2(is, l + 1, m, n, m_fftblock, y, x);
		}
	}

	void fftz2(const int is,
	           const Eigen::Index l,
	           const Eigen::Index m,
	           const Eigen::Index n,
	           const Eigen::Index ny,
	           Eigen::Tensor<std::complex<double>, 2> &x,
	           Eigen::Tensor<std::complex<double>, 2> &y)
	{
		// ---------------------------------------------------------------------
		//    Performs the L-th iteration of the second variant of the Stockham FFT.
		// --------------------------------------------------------------------

		const Eigen::Index n1 = n / 2;
		const Eigen::Index lk = 1 << (l - 1);
		const Eigen::Index li = 1 << (m - l);
		const Eigen::Index lj = 2 * lk;

		for (Eigen::Index i = 0; i < li; i++) {
			const Eigen::Index i11 = i * lk;
			const Eigen::Index i12 = i11 + n1;
			const Eigen::Index i21 = i * lj;
			const Eigen::Index i22 = i21 + lk;

			std::complex<double> u1;

			if (is >= 1)
				u1 = m_u(li + i);
			else
				u1 = conj(m_u(li + i));

			// ---------------------------------------------------------------------
			//    This loop is vectorizable.
			// ---------------------------------------------------------------------
			for (Eigen::Index k = 0; k < lk; k++) {
				for (Eigen::Index j = 0; j < ny; j++) {
					const std::complex<double> x11 = x(j, i11 + k);
					const std::complex<double> x21 = x(j, i12 + k);

					y(j, i21 + k) = x11 + x21;
					y(j, i22 + k) = u1 * (x11 - x21);
				}
			}
		}
	}

	template <class T>
	T ilog2(const T n) const
	{
		if (n == 1)
			return 0;

		T lg = 1;
		T nn = 2;

		while (nn < n) {
			nn *= 2;
			lg += 1;
		}

		return lg;
	}

	void checksum(const int i) const
	{
		std::complex<double> chk {0, 0};

		// clang-format off
		#pragma omp parallel
		// clang-format on
		{
			std::complex<double> local = 0;

			// clang-format off
			#pragma omp for
			// clang-format on
			for (Eigen::Index j = 1; j <= 1024; j++) {
				const Eigen::Index q = j % m_nx;
				const Eigen::Index r = (3 * j) % m_ny;
				const Eigen::Index s = (5 * j) % m_nz;

				local += m_u1(q, r, s);
			}

			// clang-format off
			#pragma omp critical
			// clang-format on
			{
				chk += local;
			}
		}

		chk /= (double)(m_nx * m_ny * m_nz);

		fmt::println(" T = {}     Checksum = {:.10E} {:.10E}", i, chk.real(), chk.imag());
	}
};

int main(int argc, char **argv)
{
	assert(argc == 5);

	int nx = atoi(argv[1]);
	int ny = atoi(argv[2]);
	int nz = atoi(argv[3]);
	int it = atoi(argv[4]);

	FFT {nx, ny, nz, it}.run();
}