From cdd4bac26b1c6a076911367863b0d8236c6e0045 Mon Sep 17 00:00:00 2001
From: Dorian Stoll <dorian.stoll@uni-potsdam.de>
Date: Sun, 9 Jun 2024 20:45:55 +0200
Subject: [PATCH] zellularautomat: c: Init from provided reference

---
 src/benchmarks/zellularautomat/c/Makefile     |  96 ++++++++
 src/benchmarks/zellularautomat/c/ca_common.c  | 189 ++++++++++++++++
 src/benchmarks/zellularautomat/c/ca_common.h  |  60 +++++
 src/benchmarks/zellularautomat/c/ca_mpi_p2p.c | 123 +++++++++++
 src/benchmarks/zellularautomat/c/ca_openmp.c  |  97 ++++++++
 src/benchmarks/zellularautomat/c/ca_seq.c     |  90 ++++++++
 src/benchmarks/zellularautomat/c/random.c     | 207 ++++++++++++++++++
 src/benchmarks/zellularautomat/c/random.h     |  98 +++++++++
 8 files changed, 960 insertions(+)
 create mode 100644 src/benchmarks/zellularautomat/c/Makefile
 create mode 100644 src/benchmarks/zellularautomat/c/ca_common.c
 create mode 100644 src/benchmarks/zellularautomat/c/ca_common.h
 create mode 100644 src/benchmarks/zellularautomat/c/ca_mpi_p2p.c
 create mode 100644 src/benchmarks/zellularautomat/c/ca_openmp.c
 create mode 100644 src/benchmarks/zellularautomat/c/ca_seq.c
 create mode 100644 src/benchmarks/zellularautomat/c/random.c
 create mode 100644 src/benchmarks/zellularautomat/c/random.h

diff --git a/src/benchmarks/zellularautomat/c/Makefile b/src/benchmarks/zellularautomat/c/Makefile
new file mode 100644
index 00000000..e002686c
--- /dev/null
+++ b/src/benchmarks/zellularautomat/c/Makefile
@@ -0,0 +1,96 @@
+BASE_CC=gcc
+MPI_CC=mpicc
+
+COMMON_CFLAGS=-O2
+COMMON_LDFLAGS=-lcrypto -lrt
+
+BASE_CFLAGS=-Wall -std=gnu99 -pedantic
+
+OMP_CFLAGS=-fopenmp
+
+MPI_CFLAGS=-DUSE_MPI
+
+C_DEPS=ca_common.c random.c
+
+MPI_TARGETS=ca_mpi_p2p ca_mpi_p2p_hybrid
+
+TARGETS=ca_seq \
+		ca_openmp \
+		$(MPI_TARGETS)
+
+.PHONY: all
+all: $(TARGETS)
+
+.PHONY: cpu
+cpu: ca_seq ca_openmp
+
+.PHONY: mpi
+mpi: $(MPI_TARGETS)
+
+ca_seq: ca_seq.c $(C_DEPS)
+	$(BASE_CC) $(COMMON_CFLAGS) $(BASE_CFLAGS) $^ $(COMMON_LDFLAGS) -o $@
+
+ca_openmp: ca_openmp.c $(C_DEPS)
+	$(BASE_CC) $(COMMON_CFLAGS) $(BASE_CFLAGS) $(OMP_CFLAGS) $^ $(COMMON_LDFLAGS) -o $@
+
+ca_mpi_p2p: ca_mpi_p2p.c $(C_DEPS)
+	$(MPI_CC) $(COMMON_CFLAGS) $(BASE_CFLAGS) $(MPI_CFLAGS) $^ $(COMMON_LDFLAGS) -o $@
+
+ca_mpi_p2p_hybrid: ca_mpi_p2p.c $(C_DEPS)
+	$(MPI_CC) $(COMMON_CFLAGS) $(BASE_CFLAGS) $(MPI_CFLAGS) $(OMP_CFLAGS) $^ $(COMMON_LDFLAGS) -o $@
+
+.PHONY: test
+
+test: $(TARGETS)
+	@for ITS in 10 31 57 100; do \
+		for LINES in 10 33 47 100; do \
+			echo "$$LINES lines, $$ITS iterations"; \
+			for BINARY in $^; do printf '%-10s\t' $$BINARY; ./$$BINARY $$LINES $$ITS; done; \
+		done \
+	done
+
+.PHONY: mpi-test
+
+mpi-test: $(MPI_TARGETS)
+	@for ITS in 10 31 57 100; do \
+		for LINES in 20 33 47 100; do \
+			for NP in 2 3 4; do \
+				echo "$$LINES lines, $$ITS iterations, $$NP procs"; \
+				for BINARY in ca_seq $^; do \
+					printf '%-10s\t' $$BINARY; \
+					mpiexec -n $$NP ./$$BINARY $$LINES $$ITS; \
+				done \
+			done \
+		done \
+	done
+
+.PHONY: bench
+
+bench: ca_openmp
+	@for ITS in 128 256 512; do \
+		for LINES in 1000 10000 50000; do \
+			echo "$$LINES lines, $$ITS iterations"; \
+			for BINARY in $^; do printf '%-10s\t' $$BINARY; time ./$$BINARY $$LINES $$ITS; done; \
+		done \
+	done
+
+.PHONY: omp_scaling
+
+omp_scaling: ca_seq ca_openmp
+	@for ITS in 128 256 512; do \
+		for LINES in 1000 10000 50000; do \
+			echo "$$LINES lines, $$ITS iterations"; \
+			printf "sequential \t"; \
+			./ca_seq $$LINES $$ITS; \
+			for THREADS in `seq \`nproc\``; do \
+				printf "$$THREADS threads\t"; \
+				OMP_NUM_THREADS=$$THREADS ./ca_openmp $$LINES $$ITS; \
+			done; \
+		done; \
+	done
+
+.PHONY: clean
+
+clean:
+	rm -f *.o
+	rm -f $(TARGETS)
diff --git a/src/benchmarks/zellularautomat/c/ca_common.c b/src/benchmarks/zellularautomat/c/ca_common.c
new file mode 100644
index 00000000..99e000d3
--- /dev/null
+++ b/src/benchmarks/zellularautomat/c/ca_common.c
@@ -0,0 +1,189 @@
+/*
+ * common functions for (parallel) cellular automaton
+ *
+ * (c) 2016 Steffen Christgau
+ *
+ * configuration initialization based on
+ * (c) 1996,1997 Peter Sanders, Ingo Boesnach
+ *
+ */
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "openssl/md5.h"
+#include "openssl/evp.h"
+
+#include "ca_common.h"
+#include "random.h"
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+/* determine random integer between 0 and n-1 */
+#define randInt(n) ((int)(nextRandomLEcuyer() * n))
+
+void ca_init(int argc, char** argv, int *lines, int *its)
+{
+	assert(argc == 3);
+
+	*lines = atoi(argv[1]);
+	*its = atoi(argv[2]);
+
+	assert(*lines > 0);
+}
+
+/* random starting configuration */
+void ca_init_config(line_t *buf, int lines, int skip_lines)
+{
+	volatile int scratch;
+
+	initRandomLEcuyer(424243);
+
+	/* let the RNG spin for some rounds (used for distributed initialization) */
+	for (int y = 1;  y <= skip_lines;  y++) {
+		for (int x = 1;  x <= XSIZE;  x++) {
+			scratch = scratch + randInt(100) >= 50;
+		}
+	}
+
+	for (int y = 1;  y <= lines;  y++) {
+		for (int x = 1;  x <= XSIZE;  x++) {
+			buf[y][x] = randInt(100) >= 50;
+		}
+	}
+}
+
+static char* ca_buffer_to_hex_str(const uint8_t* buf, size_t buf_size)
+{
+  char *retval, *ptr;
+
+  retval = ptr = calloc(MD5_DIGEST_LENGTH * 2 + 1, sizeof(*retval));
+  for (size_t i = 0; i < MD5_DIGEST_LENGTH; i++) {
+    snprintf(ptr, 3, "%02X", buf[i]);
+    ptr += 2;
+  }
+
+  return retval;
+}
+
+static void ca_print_hash_and_time(const char *hash, const double time)
+{
+	printf("hash: %s\ttime: %.3f s\n", (hash ? hash : "ERROR"), time);
+}
+
+static void ca_clean_ghost_zones(line_t *buf, int lines)
+{
+	for (int y = 0; y < lines; y++) {
+		buf[y][0] = 0;
+		buf[y][XSIZE + 1] = 0;
+	}
+}
+
+void ca_hash_and_report(line_t *buf, int lines, double time_in_s)
+{
+	uint8_t hash[MD5_DIGEST_LENGTH];
+	uint32_t md_len;
+	EVP_MD_CTX *ctx = EVP_MD_CTX_new();
+	EVP_DigestInit_ex(ctx, EVP_md5(), NULL);
+
+	ca_clean_ghost_zones(buf, lines);
+
+	EVP_DigestUpdate(ctx, buf, lines * sizeof(*buf));
+	EVP_DigestFinal_ex(ctx, hash, &md_len);
+
+	char* hash_str = ca_buffer_to_hex_str(hash, MD5_DIGEST_LENGTH);
+	ca_print_hash_and_time(hash_str, time_in_s);
+	free(hash_str);
+
+	EVP_MD_CTX_free(ctx);
+}
+
+#ifdef MPI_VERSION /* defined by mpi.h */
+
+static int num_remainder_procs;
+#ifdef USE_MPI_TOPOLOGY
+static MPI_Comm topo_comm;
+#endif
+
+void ca_mpi_init(int num_procs, int rank, int num_total_lines,
+		int *num_local_lines, int *global_first_line)
+{
+	*num_local_lines = num_total_lines / num_procs;
+	*global_first_line = rank * (*num_local_lines);
+
+
+	/* if work cannot be distributed equally, distribute the remaining lines equally */
+	num_remainder_procs = num_total_lines % num_procs;
+	if (rank < num_remainder_procs) {
+		(*num_local_lines)++;
+		*global_first_line = *global_first_line + rank;
+	} else {
+		*global_first_line = *global_first_line + num_remainder_procs;
+	}
+
+#ifdef USE_TOPO
+	int topo_periodic = 1, topo_dim = num_procs;
+	MPI_Cart_create(MPI_COMM_WORLD, 1, &topo_dim, &topo_periodic, 0, &topo_comm);
+#endif
+}
+
+#define TAG_RESULT (0xCAFE)
+
+void ca_mpi_hash_and_report(line_t* local_buf, int num_local_lines,
+		int num_total_lines, int num_procs, double time_in_s)
+{
+	int i, rank, num_lines = num_local_lines, count;
+	uint32_t md_len;
+	uint8_t hash[MD5_DIGEST_LENGTH];
+
+	EVP_MD_CTX *ctx = EVP_MD_CTX_new();
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (rank == 0) {
+
+		EVP_DigestInit_ex(ctx, EVP_md5(), NULL);
+		count = num_local_lines;
+		ca_clean_ghost_zones(local_buf + 1, num_local_lines);
+	    /* insert our own data into MD5 hash */
+
+		EVP_DigestUpdate(ctx, local_buf + 1, num_local_lines * sizeof(line_t));
+
+	    /* recieve partial results from all other processes in our local buffer and
+		 * update the hash. Our buffer is garanteed to have the maximum required
+	     * size in any case (see partioning above) */
+		for (i = 1; i < num_procs; i++) {
+			num_lines = num_total_lines / num_procs;
+			if (i < num_remainder_procs) {
+				num_lines++;
+			}
+			count += num_lines;
+			MPI_Recv(
+				local_buf, num_lines * LINE_SIZE, CA_MPI_CELL_DATATYPE,
+				i, TAG_RESULT, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+			ca_clean_ghost_zones(local_buf, num_lines);
+			EVP_DigestUpdate(ctx, local_buf, num_lines * sizeof(line_t));
+		}
+
+		EVP_DigestFinal_ex(ctx, hash, &md_len);
+
+		char* hash_str = ca_buffer_to_hex_str(hash, MD5_DIGEST_LENGTH);
+		ca_print_hash_and_time(hash_str, time_in_s);
+
+		free(hash_str);
+		EVP_MD_CTX_free(ctx);
+	} else {
+		MPI_Send(
+			local_buf[1], num_local_lines * LINE_SIZE, CA_MPI_CELL_DATATYPE,
+			0, TAG_RESULT, MPI_COMM_WORLD);
+	}
+
+#ifdef USE_MPI_TOPOLOGY
+	MPI_Comm_free(&topo_comm);
+#endif
+}
+
+#endif /* MPI_VERSION */
diff --git a/src/benchmarks/zellularautomat/c/ca_common.h b/src/benchmarks/zellularautomat/c/ca_common.h
new file mode 100644
index 00000000..0ec35c2b
--- /dev/null
+++ b/src/benchmarks/zellularautomat/c/ca_common.h
@@ -0,0 +1,60 @@
+#ifndef CA_COMMON_H
+#define CA_COMMON_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <time.h>
+
+/* a: pointer to array; x,y: coordinates; result: n-th element of anneal,
+      where n is the number of neighbors */
+#define transition(a, x, y) \
+   (anneal[(a)[(y)-1][(x)-1] + (a)[(y)][(x)-1] + (a)[(y)+1][(x)-1] +\
+           (a)[(y)-1][(x)  ] + (a)[(y)][(x)  ] + (a)[(y)+1][(x)  ] +\
+           (a)[(y)-1][(x)+1] + (a)[(y)][(x)+1] + (a)[(y)+1][(x)+1]])
+
+#define TIME_GET(timer) \
+	struct timespec timer; \
+	clock_gettime(CLOCK_MONOTONIC, &timer)
+
+#define TIME_DIFF(timer1, timer2) \
+	((timer2.tv_sec * 1.0E+9 + timer2.tv_nsec) - \
+	 (timer1.tv_sec * 1.0E+9 + timer1.tv_nsec)) / 1.0E+9
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* horizontal size of the configuration */
+#define XSIZE 1024
+#define LINE_SIZE (XSIZE + 2)
+
+/* "ADT" State and line of states (plus border) */
+typedef uint8_t cell_state_t;
+typedef cell_state_t line_t[XSIZE + 2];
+
+void ca_init(int argc, char** argv, int *lines, int *its);
+void ca_init_config(line_t *buf, int lines, int skip_lines);
+void ca_hash_and_report(line_t *buf, int lines, double time_in_s);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef USE_MPI
+
+/* next/prev process in communicator */
+#define PREV_PROC(n, num_procs) ((n - 1 + num_procs) % num_procs)
+#define SUCC_PROC(n, num_procs) ((n + 1) % num_procs)
+
+#define CA_MPI_CELL_DATATYPE MPI_BYTE
+
+void ca_mpi_init(int num_procs, int rank, int num_total_lines,
+		int *num_local_lines, int *global_first_line);
+void ca_mpi_hash_and_report(line_t* local_buf, int num_local_lines,
+		int num_total_lines, int num_procs, double time_in_s);
+
+#endif /* USE_MPI */
+
+
+#endif /* CA_COMMON_H */
diff --git a/src/benchmarks/zellularautomat/c/ca_mpi_p2p.c b/src/benchmarks/zellularautomat/c/ca_mpi_p2p.c
new file mode 100644
index 00000000..558564b9
--- /dev/null
+++ b/src/benchmarks/zellularautomat/c/ca_mpi_p2p.c
@@ -0,0 +1,123 @@
+/*
+ * simulate a cellular automaton with periodic boundaries (torus-like)
+ * MPI version using two-sided blocking communication
+ *
+ * (c) 2016 Steffen Christgau (C99 port, modularization, parallelization)
+ * (c) 1996,1997 Peter Sanders, Ingo Boesnach (original source)
+ *
+ * command line arguments:
+ * #1: Number of lines
+ * #2: Number of iterations to be simulated
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include "ca_common.h"
+
+/* tags for communication */
+#define TAG_SEND_UPPER_BOUND (1)
+#define TAG_SEND_LOWER_BOUND (2)
+
+#define TAG_RECV_UPPER_BOUND TAG_SEND_LOWER_BOUND
+#define TAG_RECV_LOWER_BOUND TAG_SEND_UPPER_BOUND
+
+/* --------------------- CA simulation -------------------------------- */
+
+/* annealing rule from ChoDro96 page 34
+ * the table is used to map the number of nonzero
+ * states in the neighborhood to the new state
+ */
+static const cell_state_t anneal[10] = {0, 0, 0, 0, 1, 0, 1, 1, 1, 1};
+
+/* treat torus like boundary conditions */
+static void boundary(line_t *buf, int lines)
+{
+   for (int y = 0;  y <= lines + 1; y++) {
+      /* copy rightmost column to the buffer column 0 */
+      buf[y][0] = buf[y][XSIZE];
+
+      /* copy leftmost column to the buffer column XSIZE + 1 */
+      buf[y][XSIZE+1] = buf[y][1];
+   }
+
+   /* no wrap of upper/lower boundary, since it is done by exchanged ghost zones */
+}
+
+/* make one simulation iteration with lines lines.
+ * old configuration is in from, new one is written to to.
+ */
+static void simulate(line_t *from, line_t *to, int lines)
+{
+	#ifdef _OPENMP
+	#pragma omp parallel for
+	#endif
+	for (int y = 1;  y <= lines;  y++) {
+		for (int x = 1;  x <= XSIZE;  x++) {
+			to[y][x] = transition(from, x, y);
+		}
+	}
+}
+
+/* --------------------- measurement ---------------------------------- */
+
+int main(int argc, char** argv)
+{
+	int num_total_lines, num_local_lines, num_skip_lines, its;
+
+	/* init MPI and application */
+	MPI_Init(&argc, &argv);
+
+	ca_init(argc, argv, &num_total_lines, &its);
+
+	int num_procs, local_rank;
+	MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
+	MPI_Comm_rank(MPI_COMM_WORLD, &local_rank);
+
+	ca_mpi_init(num_procs, local_rank, num_total_lines,
+		&num_local_lines, &num_skip_lines);
+
+	line_t *from = calloc((num_local_lines + 2), sizeof(*from));
+	line_t *to = calloc((num_local_lines + 2), sizeof(*to));
+
+	ca_init_config(from, num_local_lines, num_skip_lines);
+
+	/* actual computation */
+	TIME_GET(sim_start);
+	for (int i = 0; i < its; i++) {
+		MPI_Sendrecv(
+			from[1], LINE_SIZE, CA_MPI_CELL_DATATYPE,
+			PREV_PROC(local_rank, num_procs), TAG_SEND_UPPER_BOUND,
+			from[num_local_lines + 1], LINE_SIZE, CA_MPI_CELL_DATATYPE,
+			SUCC_PROC(local_rank, num_procs), TAG_RECV_LOWER_BOUND, MPI_COMM_WORLD,
+			MPI_STATUS_IGNORE);
+
+		MPI_Sendrecv(
+			from[num_local_lines], LINE_SIZE, CA_MPI_CELL_DATATYPE,
+			SUCC_PROC(local_rank, num_procs), TAG_SEND_LOWER_BOUND,
+			from[0], LINE_SIZE, CA_MPI_CELL_DATATYPE,
+			PREV_PROC(local_rank, num_procs), TAG_RECV_UPPER_BOUND, MPI_COMM_WORLD,
+			MPI_STATUS_IGNORE);
+
+		boundary(from, num_local_lines);
+		simulate(from, to, num_local_lines);
+
+		line_t *temp = from;
+		from = to;
+		to = temp;
+	}
+	TIME_GET(sim_stop);
+
+	ca_mpi_hash_and_report(from, num_local_lines, num_total_lines,
+		num_procs, TIME_DIFF(sim_start, sim_stop));
+
+	free(from);
+	free(to);
+
+	MPI_Finalize();
+
+	return EXIT_SUCCESS;
+}
diff --git a/src/benchmarks/zellularautomat/c/ca_openmp.c b/src/benchmarks/zellularautomat/c/ca_openmp.c
new file mode 100644
index 00000000..b42fc1ea
--- /dev/null
+++ b/src/benchmarks/zellularautomat/c/ca_openmp.c
@@ -0,0 +1,97 @@
+/*
+ * simulate a cellular automaton with periodic boundaries (torus-like)
+ * OpenMP version
+ *
+ * (c) 2016 Steffen Christgau (C99 port, modularization, parallelization)
+ * (c) 1996,1997 Peter Sanders, Ingo Boesnach (original source)
+ *
+ * command line arguments:
+ * #1: Number of lines
+ * #2: Number of iterations to be simulated
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ca_common.h"
+
+/* --------------------- CA simulation -------------------------------- */
+
+/* annealing rule from ChoDro96 page 34
+ * the table is used to map the number of nonzero
+ * states in the neighborhood to the new state
+ */
+static const cell_state_t anneal[10] = {0, 0, 0, 0, 1, 0, 1, 1, 1, 1};
+
+/* treat torus like boundary conditions */
+static void boundary(line_t *buf, int lines)
+{
+	#pragma omp sections
+	{
+		#pragma omp section
+		#pragma omp parallel for
+		for (int y = 0; y <= lines + 1; y++) {
+			/* copy rightmost column to the buffer column 0 */
+			buf[y][0] = buf[y][XSIZE];
+	
+			/* copy leftmost column to the buffer column XSIZE + 1 */
+			buf[y][XSIZE + 1] = buf[y][1];
+		}
+
+		#pragma omp section
+		#pragma omp parallel for
+		for (int x = 0; x <= XSIZE + 1; x++) {
+			/* copy bottommost row to buffer row 0 */
+			buf[0][x] = buf[lines][x];
+	
+			/* copy topmost row to buffer row lines + 1 */
+			buf[lines + 1][x] = buf[1][x];
+		}
+	}
+}
+
+/* make one simulation iteration with lines lines.
+ * old configuration is in from, new one is written to to.
+ */
+static void simulate(line_t *from, line_t *to, int lines)
+{
+	#pragma omp parallel for
+	for (int y = 1; y <= lines; y++) {
+		for (int x = 1; x <= XSIZE; x++) {
+			/* transition is defined in ca_common.h */
+			to[y][x] = transition(from, x, y);
+		}
+	}
+}
+
+/* --------------------- main routine --------------------------------- */
+
+int main(int argc, char** argv)
+{
+	int lines, its;
+
+	ca_init(argc, argv, &lines, &its);
+
+	line_t *from = calloc((lines + 2), sizeof(line_t));
+	line_t *to = calloc((lines + 2), sizeof(line_t));
+
+	ca_init_config(from, lines, 0);
+
+	TIME_GET(sim_start);
+	for (int i = 0; i < its; i++) {
+		boundary(from, lines);
+		simulate(from, to, lines);
+
+		line_t *temp = from;
+		from = to;
+		to = temp;
+	}
+	TIME_GET(sim_stop);
+
+	ca_hash_and_report(from + 1, lines, TIME_DIFF(sim_start, sim_stop));
+
+	free(from);
+	free(to);
+
+	return EXIT_SUCCESS;
+}
diff --git a/src/benchmarks/zellularautomat/c/ca_seq.c b/src/benchmarks/zellularautomat/c/ca_seq.c
new file mode 100644
index 00000000..ff187f0a
--- /dev/null
+++ b/src/benchmarks/zellularautomat/c/ca_seq.c
@@ -0,0 +1,90 @@
+/*
+ * simulate a cellular automaton with periodic boundaries (torus-like)
+ * serial version
+ *
+ * (c) 2016 Steffen Christgau (C99 port, modularization)
+ * (c) 1996,1997 Peter Sanders, Ingo Boesnach (original source)
+ *
+ * command line arguments:
+ * #1: Number of lines
+ * #2: Number of iterations to be simulated
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ca_common.h"
+#include <time.h>
+
+/* --------------------- CA simulation -------------------------------- */
+
+/* annealing rule from ChoDro96 page 34
+ * the table is used to map the number of nonzero
+ * states in the neighborhood to the new state
+ */
+static const cell_state_t anneal[10] = {0, 0, 0, 0, 1, 0, 1, 1, 1, 1};
+
+/* treat torus like boundary conditions */
+static void boundary(line_t *buf, int lines)
+{
+	for (int y = 0; y <= lines + 1; y++) {
+		/* copy rightmost column to the buffer column 0 */
+		buf[y][0] = buf[y][XSIZE];
+
+		/* copy leftmost column to the buffer column XSIZE + 1 */
+		buf[y][XSIZE + 1] = buf[y][1];
+	}
+
+	for (int x = 0; x <= XSIZE + 1; x++) {
+		/* copy bottommost row to buffer row 0 */
+		buf[0][x] = buf[lines][x];
+
+		/* copy topmost row to buffer row lines + 1 */
+		buf[lines + 1][x] = buf[1][x];
+	}
+}
+
+/* make one simulation iteration with lines lines.
+ * old configuration is in from, new one is written to to.
+ */
+static void simulate(line_t *from, line_t *to, int lines)
+{
+	for (int y = 1; y <= lines; y++) {
+		for (int x = 1; x <= XSIZE; x++) {
+			/* transition is defined in ca_common.h */
+			to[y][x] = transition(from, x, y);
+		}
+	}
+}
+
+/* --------------------- measurement ---------------------------------- */
+
+int main(int argc, char** argv)
+{
+	int lines, its;
+
+	ca_init(argc, argv, &lines, &its);
+
+	line_t *from = calloc((lines + 2), sizeof(line_t));
+	line_t *to = calloc((lines + 2), sizeof(line_t));
+
+	ca_init_config(from, lines, 0);
+
+	TIME_GET(sim_start);
+	for (int i = 0; i < its; i++) {
+		boundary(from, lines);
+		simulate(from, to, lines);
+
+		line_t *temp = from;
+		from = to;
+		to = temp;
+	}
+	TIME_GET(sim_stop);
+
+	ca_hash_and_report(from + 1, lines, TIME_DIFF(sim_start, sim_stop));
+
+	free(from);
+	free(to);
+
+	return EXIT_SUCCESS;
+}
diff --git a/src/benchmarks/zellularautomat/c/random.c b/src/benchmarks/zellularautomat/c/random.c
new file mode 100644
index 00000000..2a186bcb
--- /dev/null
+++ b/src/benchmarks/zellularautomat/c/random.c
@@ -0,0 +1,207 @@
+#include "random.h"
+/* (c) 1996,1997 Thomas Worsch, Peter Sanders */
+/* =====================================================================
+ * The pseudo random number generator functions in this file are
+ * similar to those in 
+ *    Numerical Recipes in C, Second Edition, pages 279-282.
+ *
+ * The RNGs have names nextRandom<something> and they return a Float64.
+ *    They are initialized by a call to initRandom<something> which 
+ *    take an Int32 as seed.
+ */
+
+#define EPS 1.2e-7
+#define RNMX (1.0-EPS)
+#define IM 2147483647
+#define AM ((Float64)1.0/IM)
+#define IA 16807
+#define IQ 127773
+#define IR 2836
+
+static Int32 state = 123456789;
+
+void initRandomParkMiller(Int32 seed)
+{
+  state = seed;
+  /* but we have to make sure that state never ever is set to zero */
+  if (state==0) { state = 42; }
+}
+
+Float64 nextRandomParkMiller(void)
+{
+  Int32 k;
+  Float64 result;
+
+  k = state/IQ;
+  state = IA*(state-k*IQ)-k*IR;
+  if (state < 0) { state += IM; }
+  result = AM*state;
+  if (result >= 1.0) { result = RNMX; }
+  return result;
+}
+
+/* =====================================================================
+ * Now a pseudo RNG with a longer period (roughly 10^18) by lEcuyer
+ *    (Numerical Recipes, page 282).
+ */
+
+#define IM1 2147483563
+#define IM2 2147483399
+#define AM1 ((Float64)1.0/IM1)
+#define IMM1 (IM1-1)
+#define IA1 40014
+#define IA2 40692
+#define IQ1 53668
+#define IQ2 52774
+#define IR1 12211
+#define IR2 3791
+
+#define NTAB 32
+#define NDIV (1+IMM1/NTAB)
+
+static Int32 state1 = 987654321;
+static Int32 state2;
+static Int32 y;
+static Int32 v[NTAB];
+
+/* ------------------------------------------------------------------ */
+static void initRandomSeedLEcuyer(Int32 seed)
+{
+  state1 = seed;
+  if (state1==0) { state1 = 987654321; }
+  state2 = state1;
+}
+
+/* ------------------------------------------------------------------ */
+static void initRandomTabLEcuyer(void)
+{
+  Int32 j, k;
+
+  for (j=NTAB+7;  j>=0;  j--) {
+    k = state1/IQ1;
+    state1 = IA1*(state1-k*IQ1)-k*IR1;
+    if (state1 < 0) { state1 += IM1; }
+    if (j < NTAB) { v[j] = state1; }
+  }
+  y = v[0];
+}
+
+/* ------------------------------------------------------------------ */
+void initRandomLEcuyer(Int32 seed)
+{
+  initRandomSeedLEcuyer(seed);
+  initRandomTabLEcuyer();
+}
+
+/* ------------------------------------------------------------------ */
+static Int32 power(Int32 base, Card64 exp, Int32 modulus)
+{
+  Int64 temp = 1;
+  Card64 mask;
+
+  if (base < 0) { return 0; }
+
+  /* note that at on each entry into the following loop body, the 
+     actual value of temp is always positive and fits into an Int32 */
+  for (mask = ((Card64)1) << 63;  mask != 0;  mask >>= 1) {
+    temp = (temp * temp) % modulus;
+    if (exp & mask) {
+      temp = (temp * base) % modulus;
+    }
+  }
+  return ((Int32) temp);
+}
+
+/* ------------------------------------------------------------------ */
+static void forwardRandomLEcuyer(Card64 steps)
+{
+  Int32 a;
+
+  a = power(IA1, steps, IM1);
+  state1 = (Int32) ( (((Int64)a) * state1) % IM1);
+
+  a = power(IA2, steps, IM2);
+  state2 = (Int32) ( (((Int64)a) * state2) % IM2);
+}
+
+/* ------------------------------------------------------------------ */
+/*
+ * The following initialization function is intended for use on a
+ * parallel machine. 
+ * Each PE must call initParallelRandomLEcuyer once at the beginning.
+ * Calls to nextRandomLEcuyer will then return different (and hopefully 
+ *    pseudo unrelated) pseudo random numbers on different PEs.
+ * The calls to initParallelRandomLEcuyer on different PEs
+ *    *MUST* *SATISFY* *THE* *FOLLOWING* *CONDITIONS*:
+ *    - The parameter seed is the same on all PEs.
+ *    - The parameter total is the same on all PEs and it must be 
+ *      the total number of PEs using the RNG.
+ *    - The parameter pe must be different on each PE and for each
+ *      i in the set {0,1,...,total-1} there must be exactly one PE
+ *      calling initParallelRandomLEcuyer with parameter pe set to i.
+ * Please note:
+ *    - There are *NO* run time checks to see whether the above
+ *      conditions have been satisfied.
+ *    - The numbers generated on different PEs are probably only more or 
+ *      less unrelated as long as the number of calls of nextRandomLEcuyer
+ *      on each of the PEs is smaller than the period of the RNG
+ *      (approx. 10^18) divided by the number of PEs (probably below 10^4).
+ *      Since 10^14 calls of nextRandomLEcuyer will take quite some time, 
+ *      you are probably safe using this RNG.
+ *    - This function uses nextRandomParkMiller. If you want to have 
+ *      reproducible results, make sure that you do not use
+ *      RandomParkMiller yourself before calling initParallelRandomLEcuyer.
+ */
+void initParallelRandomLEcuyer(Int32 seed, Int32 pe, Int32 total)
+{
+  Card64 steps;
+
+  initRandomSeedLEcuyer(seed);
+
+  /* The period of the RNG is roughly 2.3e18, i.e. 2^61,
+     which should be distributed onto the PEs approximately equally;
+     because we do not know the exact value we are careful and take
+     one half of the average length of the interval per PE: */
+  steps = (((Card64)1) << 60)/total;
+
+  /* For PE number pe we get the starting point: */
+  steps = steps * pe;
+
+  /* Finally the starting point for each PE is randomly shifted
+     by an amount which is small compared to the length of
+     its interval (as long as there are much less than 2^30 PEs :-).
+     Therefore steps will still be far below the end of its interval: */
+  steps = steps + (Card64) (nextRandomParkMiller() * (((Card64)1) << 30));
+     
+  /* Now the RNG is initialized for PE pe as if it had already made steps 
+     many steps from the initial seed. */
+  forwardRandomLEcuyer(steps);
+
+  initRandomTabLEcuyer();
+}
+
+/* ------------------------------------------------------------------ */
+Float64 nextRandomLEcuyer(void)
+{
+  Int32 k;
+  Float64 result;
+  int j;
+
+  k = state1/IQ1;
+  state1 = IA1*(state1-k*IQ1)-k*IR1;
+  if (state1 < 0) { state1 += IM1; }
+
+  k = state2/IQ2;
+  state2 = IA2*(state2-k*IQ2)-k*IR2;
+  if (state2 < 0) { state2 += IM2; }
+
+  j = y/NDIV;
+  y = v[j] - state2;
+  v[j] = state1;
+
+  if (y < 1) { y += IMM1; }
+
+  result = AM1*y;
+  if (result >= 1.0) { result = RNMX; }
+  return result;
+}
diff --git a/src/benchmarks/zellularautomat/c/random.h b/src/benchmarks/zellularautomat/c/random.h
new file mode 100644
index 00000000..8f4e6767
--- /dev/null
+++ b/src/benchmarks/zellularautomat/c/random.h
@@ -0,0 +1,98 @@
+#include <limits.h>
+/* (c) 1996,1997 Thomas Worsch, Peter Sanders */
+/* =====================================================================
+ * The pseudo random number generator functions in this file are
+ * similar to those in 
+ *    Numerical Recipes in C, Second Edition, pages 279-282.
+ *
+ * The RNGs have names nextRandom<something> and they return a Float64.
+ *    They are initialized by a call to initRandom<something> which 
+ *    take an Int32 as seed.
+ */
+
+/* C++ compatibility */
+#ifdef __cplusplus
+#define CC extern "C"
+#else
+#define CC
+#endif
+
+/* try to find out how 32 bit and 64 bit
+ * interger types look like in this compiler
+ * this may fail
+ * e.g., if the compiler does not support 64 data types...
+ */
+#if UINT_MAX >> 31 == 1
+typedef int Int32;
+typedef unsigned int Card32;
+#elif USHRT_MAX >> 31 == 1
+typedef short Int32;
+typedef unsigned short Card32;
+#elif ULONG_MAX >> 31 == 1
+typedef short Int32;
+typedef unsigned short Card32;
+#else /* provoke error */
+typedef nonexisting Int32;
+typedef nonexisting Card32;
+#endif
+
+#if UINT_MAX >> 63 == 1
+typedef int Int64;
+typedef unsigned int Card64;
+#elif ULONG_MAX >> 63 == 1
+typedef long Int64;
+typedef unsigned long Card64;
+#else
+typedef long long int Int64;
+typedef unsigned long long int Card64;
+#endif
+
+typedef double Float64;
+
+/* =====================================================================
+ * The Minimal Standard pseudo RNG (Numerical Recipes, page 279)
+ *    by Park and Miller
+ * I added an initialization function and could therefore remove
+ *    the MASK mechanism from the original ran0 function.
+ */
+CC void initRandomParkMiller(Int32 seed);
+CC Float64 nextRandomParkMiller (void);
+
+
+/* =====================================================================
+ * Now a pseudo RNG with a longer period (roughly 10^18) by lEcuyer
+ *    (Numerical Recipes, page 282).
+ */
+CC void initRandomLEcuyer(Int32 seed);
+CC Float64 nextRandomLEcuyer (void);
+
+
+/* ------------------------------------------------------------------ */
+/*
+ * The following initialization function is intended for use on a
+ * parallel machine. 
+ * Each PE must call initParallelRandomLEcuyer once at the beginning.
+ * Calls to nextRandomLEcuyer will then return different (and hopefully 
+ *    pseudo unrelated) pseudo random numbers on different PEs.
+ * The calls to initParallelRandomLEcuyer on different PEs
+ *    *MUST* *SATISFY* *THE* *FOLLOWING* *CONDITIONS*:
+ *    - The parameter seed is the same on all PEs.
+ *    - The parameter total is the same on all PEs and it must be 
+ *      the total number of PEs using the RNG.
+ *    - The parameter pe must be different on each PE and for each
+ *      i in the set {0,1,...,total-1} there must be exactly one PE
+ *      calling initParallelRandomLEcuyer with parameter pe set to i.
+ * Please note:
+ *    - There are *NO* run time checks to see whether the above
+ *      conditions have been satisfied.
+ *    - The numbers generated on different PEs are probably only more or 
+ *      less unrelated as long as the number of calls of nextRandomLEcuyer
+ *      on each of the PEs is smaller than the period of the RNG
+ *      (approx. 10^18) divided by the number of PEs (probably below 10^4).
+ *      Since 10^14 calls of nextRandomLEcuyer will take quite some time, 
+ *      you are probably safe using this RNG.
+ *    - This function uses nextRandomParkMiller. If you want to have 
+ *      reproducible results, make sure that you do not use
+ *      RandomParkMiller yourself before calling initParallelRandomLEcuyer.
+ */
+CC void initParallelRandomLEcuyer(Int32 seed, Int32 pe, Int32 total);
-- 
GitLab