From ef3093f0b255c3fdd4f9a52183d61dec202dfa31 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 30 Oct 2023 11:59:33 +0100 Subject: [PATCH 01/28] Fix build for newer CUDA and conda-installed dependencies --- lib/Makefile | 4 ++-- lib/Makefile_ocl | 2 +- lib/include/cudadev.h | 4 +++- lib/include/defines.h | 9 ++++++++- lib/include/ocldev.h | 4 ++-- 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/lib/Makefile b/lib/Makefile index f70fd40..9dc8822 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -41,7 +41,7 @@ NVCCVERSION=$(shell "${NVCC}" --version | grep ^Cuda | sed 's/^.* //g') ifeq "${NVCCVERSION}" "V5.5.22" NVCCFLAGS ?= -arch sm_20 else - NVCCFLAGS ?= -arch sm_30 + NVCCFLAGS ?= -arch sm_50 endif #NVCCFLAGS = -arch sm_35 @@ -71,7 +71,7 @@ SRC = sapporohostclass.cpp sapporoG6lib.cpp sapporoYeblib.cpp sapporoG5lib.cpp s OBJ = $(SRC:%.cpp=%.o) LIBOBJ = sapporohostclass.o $(INTERFACEPATH)/sapporoG6lib.o $(INTERFACEPATH)/sapporoYeblib.o -LIBOBJ += $(INTERFACEPATH)/sapporoG5lib.o +LIBOBJ += $(INTERFACEPATH)/sapporoG5lib.o $(INTERFACEPATH)/sapporo6thlib.o TARGET = libsapporo.a diff --git a/lib/Makefile_ocl b/lib/Makefile_ocl index 3e26eab..1c6e19e 100644 --- a/lib/Makefile_ocl +++ b/lib/Makefile_ocl @@ -27,7 +27,7 @@ endif OFLAGS = -g -Wall -Wextra -Wstrict-aliasing=2 -fopenmp -D_OCL_ -D__INCLUDE_KERNELS__ -CXXFLAGS = ${testRunFlags} -fPIC $(OFLAGS) -I$(CUDA_TK)/include -msse4 +CXXFLAGS += ${testRunFlags} -fPIC $(OFLAGS) -I$(CUDA_TK)/include -msse4 testRunFlags= $(testRunFlags1) $(testRunFlags2) $(testRunFlags3) $(info $(testRunFlags)) diff --git a/lib/include/cudadev.h b/lib/include/cudadev.h index b2af740..042c13c 100644 --- a/lib/include/cudadev.h +++ b/lib/include/cudadev.h @@ -710,12 +710,14 @@ namespace dev { // jitOptionCount++; // } - + +#if CUDA_VERSION < 6000 if(computeMode < CU_TARGET_COMPUTE_20) { fprintf(stderr,"Sapporo2 requires at least a Fermi or newer NVIDIA architecture.\n"); exit(-1); } +#endif //Set the architecture // { diff --git a/lib/include/defines.h b/lib/include/defines.h index 16e01f7..e6f3216 100644 --- a/lib/include/defines.h +++ b/lib/include/defines.h @@ -55,8 +55,15 @@ inline const char* get_kernelName(const int integrator, case SIXTH: if(precision == DOUBLESINGLE) { +#ifdef _OCL_ + fprintf(stderr, "ERROR: Sixth order integrator with double single precision"); + fprintf(stderr, "ERROR: is not implemented in OpenCL, only in CUDA. Please"); + fprintf(stderr, "ERROR: file an issue on GitHub if you need this combination."); + exit(1); +#else perThreadSM = sizeof(float4)*2 + sizeof(float4) + sizeof(float3); - return "dev_evaluate_gravity_sixth_DS"; +#endif + return "dev_evaluate_gravity_sixth_DS"; } else if(precision == DOUBLE){ #ifdef _OCL_ diff --git a/lib/include/ocldev.h b/lib/include/ocldev.h index 453e67b..e621348 100644 --- a/lib/include/ocldev.h +++ b/lib/include/ocldev.h @@ -574,8 +574,8 @@ namespace dev { void copy(const memory &src, const cl_bool OCL_BLOCKING = CL_TRUE) { assert(ContextFlag); if (n != src.n) { - ocl_free(); - cmalloc(src.n, DeviceMemFlags); + ocl_free(); + allocate(src.n, DeviceMemFlags); } oclSafeCall(clEnqueueCopyBuffer(CommandQueue, src.DeviceMem, From 194beadf0ee7a4a3bc7dceb88a4aea9c17fd1265 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 30 Oct 2023 12:00:36 +0100 Subject: [PATCH 02/28] Redo build system --- .gitignore | 3 + lib/Makefile_new | 173 ++++++++++++++++++++++++++++++ lib/OpenCLKernels/kernels4th.cl | 2 +- lib/OpenCLKernels/kernels4thDP.cl | 2 +- lib/OpenCLKernels/kernels6th.cl | 2 +- lib/OpenCLKernels/kernelsG5DS.cl | 2 +- lib/OpenCLKernels/kernelsG5SP.cl | 2 +- 7 files changed, 181 insertions(+), 5 deletions(-) create mode 100644 lib/Makefile_new diff --git a/.gitignore b/.gitignore index dfec331..74334c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ *.o *.a +*.so *~ *.ptx *.ptxh +*.cle +*.clh diff --git a/lib/Makefile_new b/lib/Makefile_new new file mode 100644 index 0000000..6d85bce --- /dev/null +++ b/lib/Makefile_new @@ -0,0 +1,173 @@ +CXX ?= g++ +CC ?= gcc + +.PHONY: all +all: libsapporo.a libsapporo.so emulated_interfaces + + +# Detect CUDA +ifndef CUDA_TK + NVCC := $(shell which nvcc || echo NOTFOUND) + ifeq ($(NVCC), NOTFOUND) + $(info The nvcc command is not available in your shell.) + $(info To compile with CUDA, please install it, set up your environment) + $(info according to the CUDA installation instructions, and try again.) + $(info ) + else + CUDA_TK := $(dir $(NVCC)).. + CUDA_AVAILABLE := 1 + endif +else + NVCC ?= $(CUDA_TK)/bin/nvcc + CUDA_AVAILABLE := 1 +endif + + +# Detect OpenCL +OPENCL_LDFLAGS := -lOpenCL +ifdef OPENCL + OPENCL_LDFLAGS := -L$(OPENCL)/lib -lOpenCL +endif + +OPENCL_STATUS := $(shell echo 'int main() {}' | g++ -x c++ $(OPENCL_LDFLAGS) - && rm a.out || echo NOTFOUND) + +ifeq ($(OPENCL_STATUS), NOTFOUND) + $(info OpenCL support was not detected on the system.) + $(info If it is installed in a non-standard location, then set OPENCL to) + $(info the installation prefix and try again.) + $(info ) +else + OPENCL_AVAILABLE := 1 +endif + + +# Select backend +ifndef BACKEND + ifdef CUDA_AVAILABLE + $(info BACKEND not set and CUDA was detected, using CUDA) + BACKEND := CUDA + else + ifdef OPENCL_AVAILABLE + $(info BACKEND not set and OpenCL was detected, using OpenCL) + BACKEND := OpenCL + else + $(error BACKEND not set and neither CUDA nor OpenGL was detected.) + endif + endif +else + ifeq ($(BACKEND), CUDA) + ifndef CUDA_AVAILABLE + $(error BACKEND set to CUDA but it was not found.) + endif + $(info Using selected backend CUDA) + else + ifeq ($(BACKEND), OpenCL) + ifndef OPENCL_AVAILABLE + $(error BACKEND set to OpenCL but it was not found.) + endif + else + $(error BACKEND set to unknown value "$(BACKEND)", please use CUDA or OpenCL) + endif + $(info Using selected backend OpenCL) + endif +endif +$(info ) + +# Testing/optimisation support +ifdef NTHREADS + CXXFLAGS += -DNTHREADS=$(NTHREADS) -DTIMING_STATS=1 +endif + +ifdef NBLOCKS_PER_MULTI + CXXFLAGS += -DNBLOCKS_PER_MULTI=$(NBLOCKS_PER_MULTI) -DTIMING_STATS=1 +endif + + +# CUDA kernels +ifeq ($(BACKEND), CUDA) + +INCLUDEPATH += -I$(CUDA_TK) +LDFLAGS += -lcuda -fopenmp + +CUDA_SRC = $(wildcard CUDAKernels/*.cu) +PTXH = $(CUDA_SRC:CUDAKernels/%.cu=include/%.ptxh) +NVCCFLAGS += -I./include -I. + +.PHONY: kernels +kernels: $(PTXH) + +%.ptx: %.cu + $(NVCC) $(NVCCFLAGS) -ptx $< -o $@ + +include/%.ptxh: CUDAKernels/%.ptx + xxd -i $< $@ + +endif + + +# OpenCL kernels +ifeq ($(BACKEND), OpenCL) + +ifdef OPENCL + CXXFLAGS += -I$(OPENCL)/include + LDFLAGS += -L$(OPENCL)/lib +endif + +CXXFLAGS += -D_OCL_ -D__INCLUDE_KERNELS__ +LDFLAGS += -lOpenCL -fopenmp + +OPENCL_SRC = $(wildcard OpenCLKernels/*.cl) +CLH = $(OPENCL_SRC:OpenCLKernels/%.cl=include/%.clh) + +.PHONY: kernels +kernels: $(CLH) + +%.cle: %.cl + $(CC) -E -IOpenCLKernels -o $@ - <$< + +include/%.clh: OpenCLKernels/%.cle + xxd -i $< $@ + +endif + + +# Main implementation +CXX_SRC := $(wildcard src/*.cpp) +OBJS := $(CXX_SRC:%.cpp=%.o) +INCLUDEPATH += -Iinclude -I. +CXXFLAGS += $(INCLUDEPATH) -fPIC -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp + +src/sapporohostclass.o: kernels + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + +libsapporo.a: $(OBJS) + ar qv $@ $^ + +libsapporo.so: $(OBJS) + $(CXX) -o $@ -shared $^ $(LDFLAGS) + + +# API compatibility libraries +EMU_SRC := $(wildcard interfaces/*lib.cpp) +EMU_STATIC_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.a) +EMU_SHARED_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.so) + +.PHONY: emulated_interfaces +emulated_interfaces: $(EMU_STATIC_LIBS) $(EMU_SHARED_LIBS) + +lib%.a: interfaces/%lib.o + ar qv $@ $^ + +lib%.so: interfaces/%lib.o + $(CXX) -o $@ -shared $^ -L. -lsapporo $(LDFLAGS) + + +# Clean-up +.PHONY: clean +clean: + rm -f src/*.o interfaces/*.o *.a *.so + rm -f CUDAKernels/*.ptx OpenCLKernels/*.cle + rm -f include/*.ptxh include/*.clh + diff --git a/lib/OpenCLKernels/kernels4th.cl b/lib/OpenCLKernels/kernels4th.cl index 9c6d670..495f4cd 100644 --- a/lib/OpenCLKernels/kernels4th.cl +++ b/lib/OpenCLKernels/kernels4th.cl @@ -8,7 +8,7 @@ OpenCL Fourth order DoubleSingle kernels */ -#include "OpenCL/sharedKernels.cl" +#include "sharedKernels.cl" #define CAST 1 diff --git a/lib/OpenCLKernels/kernels4thDP.cl b/lib/OpenCLKernels/kernels4thDP.cl index d44e7d5..3a8c855 100644 --- a/lib/OpenCLKernels/kernels4thDP.cl +++ b/lib/OpenCLKernels/kernels4thDP.cl @@ -8,7 +8,7 @@ OpenCL Fourth order Double Precision */ -#include "OpenCL/sharedKernels.cl" +#include "sharedKernels.cl" __inline void body_body_interaction(inout float2 *ds2_min, diff --git a/lib/OpenCLKernels/kernels6th.cl b/lib/OpenCLKernels/kernels6th.cl index 53cc577..ae0a475 100644 --- a/lib/OpenCLKernels/kernels6th.cl +++ b/lib/OpenCLKernels/kernels6th.cl @@ -8,7 +8,7 @@ OpenCL Double Precision */ -#include "OpenCL/sharedKernels.cl" +#include "sharedKernels.cl" diff --git a/lib/OpenCLKernels/kernelsG5DS.cl b/lib/OpenCLKernels/kernelsG5DS.cl index fea5f8f..449326e 100644 --- a/lib/OpenCLKernels/kernelsG5DS.cl +++ b/lib/OpenCLKernels/kernelsG5DS.cl @@ -9,7 +9,7 @@ OpenCL Double Single kernels */ -#include "OpenCL/sharedKernels.cl" +#include "sharedKernels.cl" diff --git a/lib/OpenCLKernels/kernelsG5SP.cl b/lib/OpenCLKernels/kernelsG5SP.cl index f1077cf..5de684f 100644 --- a/lib/OpenCLKernels/kernelsG5SP.cl +++ b/lib/OpenCLKernels/kernelsG5SP.cl @@ -8,7 +8,7 @@ CUDA single precisin kernels */ -#include "OpenCL/sharedKernels.cl" +#include "sharedKernels.cl" __inline void body_body_interaction(inout float4 *acc_i, From 18342fcae1939dabc9e5ba75c90370dabfa5831f Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 30 Oct 2023 12:01:05 +0100 Subject: [PATCH 03/28] Make tests compile with the new build system and conda --- testCodes/Makefile | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/testCodes/Makefile b/testCodes/Makefile index 3d9324f..0f1a852 100644 --- a/testCodes/Makefile +++ b/testCodes/Makefile @@ -1,21 +1,22 @@ -CXX = g++ -CC = gcc -LD = g++ -F90 = ifort +CXX ?= g++ +CC ?= gcc +LD ?= g++ +F90 ?= ifort .SUFFIXES: .o .cpp .ptx .cu SAPPOROPATH=../lib/ SAPLIB2 = sapporo SAPLIB = lib$(SAPLIB2).a +SAPLIBG6 = sapporoG6 -CUDA_TK = /usr/local/cuda +CUDA_TK ?= /usr/local/cuda -OFLAGS = -g -O3 -Wall -fopenmp -Wextra -Wstrict-aliasing=2 -fopenmp -CXXFLAGS = -fPIC $(OFLAGS) -I$(CUDA_TK)/include +OFLAGS = -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp +CXXFLAGS += -fPIC -fopenmp $(OFLAGS) -I$(CUDA_TK)/include -LDFLAGS = -lcuda -fopenmp -L$(CUDA_TK)/lib64 +LDFLAGS += -lcuda -L$(CUDA_TK)/lib64 INCLUDEPATH = $(SAPPOROPATH)/include CXXFLAGS += -I$(INCLUDEPATH) -I./ -I $(SAPPOROPATH) @@ -31,34 +32,34 @@ all: $(OBJ) $(PROG) kernels kernels: - ln -s $(SAPPOROPATH)/CUDAKernels/ CUDA/ + ln -s $(SAPPOROPATH)/CUDAKernels CUDA #$(PROG): $(OBJ) -# $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -lsapporo +# $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -lsapporo test_gravity_block_cuda : test_gravity_block.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_gravity_block_g5_cuda: test_gravity_block_g5.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_gravity_block_6th_cuda : test_gravity_block_6th.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_performance_rangeN_cuda : test_performance_rangeN.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_performance_blockStep_cuda : test_performance_blockStep.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_performance_rangeN_6th_cuda : test_performance_rangeN_6th.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_performance_rangeN_g5_cuda : test_performance_rangeN_g5.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_integrator_cuda : test_integrator.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIBG6) $(LDFLAGS) %.o: $(SRCPATH)/%.cpp From ad2b2f2d45994dcfd25cb48de76e68e1daef8977 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Oct 2023 08:55:06 +0100 Subject: [PATCH 04/28] Avoid clash with EasyBuild CUDA INCLUDEPATH variable --- lib/Makefile_new | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/Makefile_new b/lib/Makefile_new index 6d85bce..601c18a 100644 --- a/lib/Makefile_new +++ b/lib/Makefile_new @@ -86,7 +86,7 @@ endif # CUDA kernels ifeq ($(BACKEND), CUDA) -INCLUDEPATH += -I$(CUDA_TK) +INCLUDES = -I$(CUDA_TK) LDFLAGS += -lcuda -fopenmp CUDA_SRC = $(wildcard CUDAKernels/*.cu) @@ -113,6 +113,7 @@ ifdef OPENCL LDFLAGS += -L$(OPENCL)/lib endif +INCLUDES = CXXFLAGS += -D_OCL_ -D__INCLUDE_KERNELS__ LDFLAGS += -lOpenCL -fopenmp @@ -134,8 +135,8 @@ endif # Main implementation CXX_SRC := $(wildcard src/*.cpp) OBJS := $(CXX_SRC:%.cpp=%.o) -INCLUDEPATH += -Iinclude -I. -CXXFLAGS += $(INCLUDEPATH) -fPIC -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp +INCLUDES = -Iinclude -I. +CXXFLAGS += $(INCLUDES) -fPIC -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp src/sapporohostclass.o: kernels From a7134e7b334582f42637d7b363e8cd4b42a87148 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Oct 2023 09:09:58 +0100 Subject: [PATCH 05/28] Keep the .ptx files, they're actually the ones that get used --- lib/Makefile_new | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Makefile_new b/lib/Makefile_new index 601c18a..5629335 100644 --- a/lib/Makefile_new +++ b/lib/Makefile_new @@ -90,11 +90,13 @@ INCLUDES = -I$(CUDA_TK) LDFLAGS += -lcuda -fopenmp CUDA_SRC = $(wildcard CUDAKernels/*.cu) +PTX = $(CUDA_SRC:CUDAKernels/%.cu=CUDAKernels/%.ptx) +# The .ptxh version seems to be unused PTXH = $(CUDA_SRC:CUDAKernels/%.cu=include/%.ptxh) NVCCFLAGS += -I./include -I. .PHONY: kernels -kernels: $(PTXH) +kernels: $(PTX) $(PTXH) %.ptx: %.cu $(NVCC) $(NVCCFLAGS) -ptx $< -o $@ From cafc8235b5abbe746707fd3893f3810337b5453b Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Oct 2023 11:23:42 +0100 Subject: [PATCH 06/28] Fix symlinking error on repeated builds --- testCodes/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testCodes/Makefile b/testCodes/Makefile index 0f1a852..7362350 100644 --- a/testCodes/Makefile +++ b/testCodes/Makefile @@ -32,7 +32,7 @@ all: $(OBJ) $(PROG) kernels kernels: - ln -s $(SAPPOROPATH)/CUDAKernels CUDA + rm -f CUDA && ln -s $(SAPPOROPATH)/CUDAKernels CUDA #$(PROG): $(OBJ) # $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -lsapporo @@ -67,7 +67,7 @@ test_integrator_cuda : test_integrator.o clean: - /bin/rm -rf *.o *.ptx *.a $(PROG) + /bin/rm -rf *.o *.ptx *.a $(PROG) CUDA $(OBJ): $(SAPPOROPATH)/$(SAPLIB) From ecabda6995bffac8c5f0776789a3a095c070b9c5 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Oct 2023 11:25:08 +0100 Subject: [PATCH 07/28] Improve README a bit --- testCodes/README | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/testCodes/README b/testCodes/README index 4e667a4..3d1fff0 100644 --- a/testCodes/README +++ b/testCodes/README @@ -1,7 +1,7 @@ Sapporo2, test and example programs This folder contains a set of test and example programs that can -be used with Sapporo2. +be used with Sapporo2. =============================================================================== @@ -13,12 +13,18 @@ Makefile and Makefile_ocl (the CUDA_TK variable) To build the CUDA versions: 'make' -Tol build the OpenCL versions: -'make -f Makefile_ocl' +To build the OpenCL versions: +'make -f Makefile_ocl' -In both cases make sure you build the matching sapporo library in the 'lib' folder. +In both cases make sure you build the matching sapporo library in the 'lib' folder, and +use -Make sure that the GPU kernel sources and ptx files are in a subfolder 'CUDA' can be +'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../lib' + +to ensure the linker can find the sapporo2 libraries, or better, use the absolute path +to the 'lib' directory. + +Make sure that the GPU kernel sources and ptx files are in a subfolder 'CUDA' can be symlinked from the lib folder. This is done by default in the Makefiles. =============================================================================== @@ -38,12 +44,12 @@ USAGE / Examples How to start a 4th order test using different precision: CUDA, 4th order default -./test_gravity_block_cuda +./test_gravity_block_cuda ./test_gravity_block_cuda 16364 -> number of particles is 16384 using CUDA ./test_gravity_block_ocl 16364 -> number of particles is 16384 using OpenCL ./test_gravity_block_cuda 16364 CUDA/kernels4thDP.ptx 1 1 -> - number of particles is 16384 using CUDA, using a full double precision kernel, with + number of particles is 16384 using CUDA, using a full double precision kernel, with integration order 1 (=FOURTH order) with precision 1 (=DOUBLE precision) ./test_gravity_block_ocl 16364 OpenCL/kernels4thDP.cl 1 1 -> same as above but now using OpenCL. From 5fde0ac9107f157f069e3f96e8f516b5e355f930 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Oct 2023 11:25:37 +0100 Subject: [PATCH 08/28] Fix error when building for newer GPU architectures --- lib/CUDAKernels/kernels.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/CUDAKernels/kernels.cu b/lib/CUDAKernels/kernels.cu index ebb271c..66822d7 100644 --- a/lib/CUDAKernels/kernels.cu +++ b/lib/CUDAKernels/kernels.cu @@ -206,7 +206,7 @@ __device__ __forceinline__ double RSQRT(double val) { return rsqrt(val); } // template<> __device__ __forceinline__ double RSQRT(double val) { return 1.0/sqrt(val); } - +#if __CUDA_ARCH__ < 600 __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = @@ -220,6 +220,7 @@ __device__ double atomicAdd(double* address, double val) } while (assumed != old); return __longlong_as_double(old); } +#endif __device__ __forceinline__ double atomicMin(double *address, double val) From 3347293c85a9750b1a53c419276f8753a289c641 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Oct 2023 11:27:46 +0100 Subject: [PATCH 09/28] Add .gitignore for the test directory --- testCodes/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 testCodes/.gitignore diff --git a/testCodes/.gitignore b/testCodes/.gitignore new file mode 100644 index 0000000..bd0f7ea --- /dev/null +++ b/testCodes/.gitignore @@ -0,0 +1,2 @@ +test_* +CUDA From 9d9ea58ace93f812bd063d5d8117405ad5e66bc7 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Oct 2023 11:41:49 +0100 Subject: [PATCH 10/28] Replace old build system with new build system --- lib/Makefile | 204 ++++++++++++++++++++++++++++++----------------- lib/Makefile_new | 176 ---------------------------------------- lib/Makefile_ocl | 105 ------------------------ 3 files changed, 133 insertions(+), 352 deletions(-) delete mode 100644 lib/Makefile_new delete mode 100644 lib/Makefile_ocl diff --git a/lib/Makefile b/lib/Makefile index 9dc8822..5629335 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,114 +1,176 @@ -CXX = g++ -CC = gcc -LD = g++ -F90 = ifort +CXX ?= g++ +CC ?= gcc + +.PHONY: all +all: libsapporo.a libsapporo.so emulated_interfaces + + +# Detect CUDA +ifndef CUDA_TK + NVCC := $(shell which nvcc || echo NOTFOUND) + ifeq ($(NVCC), NOTFOUND) + $(info The nvcc command is not available in your shell.) + $(info To compile with CUDA, please install it, set up your environment) + $(info according to the CUDA installation instructions, and try again.) + $(info ) + else + CUDA_TK := $(dir $(NVCC)).. + CUDA_AVAILABLE := 1 + endif +else + NVCC ?= $(CUDA_TK)/bin/nvcc + CUDA_AVAILABLE := 1 +endif -.SUFFIXES: .o .cpp .ptx .cu -CUDA_TK ?= /usr/local/cuda +# Detect OpenCL +OPENCL_LDFLAGS := -lOpenCL +ifdef OPENCL + OPENCL_LDFLAGS := -L$(OPENCL)/lib -lOpenCL +endif +OPENCL_STATUS := $(shell echo 'int main() {}' | g++ -x c++ $(OPENCL_LDFLAGS) - && rm a.out || echo NOTFOUND) -testRunFlags1= -testRunFlags2= -testRunFlags3= +ifeq ($(OPENCL_STATUS), NOTFOUND) + $(info OpenCL support was not detected on the system.) + $(info If it is installed in a non-standard location, then set OPENCL to) + $(info the installation prefix and try again.) + $(info ) +else + OPENCL_AVAILABLE := 1 +endif -#Check for the defines +# Select backend +ifndef BACKEND + ifdef CUDA_AVAILABLE + $(info BACKEND not set and CUDA was detected, using CUDA) + BACKEND := CUDA + else + ifdef OPENCL_AVAILABLE + $(info BACKEND not set and OpenCL was detected, using OpenCL) + BACKEND := OpenCL + else + $(error BACKEND not set and neither CUDA nor OpenGL was detected.) + endif + endif +else + ifeq ($(BACKEND), CUDA) + ifndef CUDA_AVAILABLE + $(error BACKEND set to CUDA but it was not found.) + endif + $(info Using selected backend CUDA) + else + ifeq ($(BACKEND), OpenCL) + ifndef OPENCL_AVAILABLE + $(error BACKEND set to OpenCL but it was not found.) + endif + else + $(error BACKEND set to unknown value "$(BACKEND)", please use CUDA or OpenCL) + endif + $(info Using selected backend OpenCL) + endif +endif +$(info ) + +# Testing/optimisation support ifdef NTHREADS - testRunFlags1="-D NTHREADS=$(NTHREADS)" - testRunFlags3="-D TIMING_STATS=1" + CXXFLAGS += -DNTHREADS=$(NTHREADS) -DTIMING_STATS=1 endif ifdef NBLOCKS_PER_MULTI - testRunFlags2="-D NBLOCKS_PER_MULTI=$(NBLOCKS_PER_MULTI)" - testRunFlags3="-D TIMING_STATS=1" + CXXFLAGS += -DNBLOCKS_PER_MULTI=$(NBLOCKS_PER_MULTI) -DTIMING_STATS=1 endif -OFLAGS = -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp -#Use below if compiling with CPU_SUPPORT (SSE) -#CXXFLAGS += ${testRunFlags} -fPIC $(OFLAGS) -I$(CUDA_TK)/include -msse4 -CXXFLAGS += ${testRunFlags} -fPIC $(OFLAGS) -I$(CUDA_TK)/include +# CUDA kernels +ifeq ($(BACKEND), CUDA) -testRunFlags= $(testRunFlags1) $(testRunFlags2) $(testRunFlags3) -$(info $(testRunFlags)) +INCLUDES = -I$(CUDA_TK) +LDFLAGS += -lcuda -fopenmp -NVCC = $(CUDA_TK)/bin/nvcc +CUDA_SRC = $(wildcard CUDAKernels/*.cu) +PTX = $(CUDA_SRC:CUDAKernels/%.cu=CUDAKernels/%.ptx) +# The .ptxh version seems to be unused +PTXH = $(CUDA_SRC:CUDAKernels/%.cu=include/%.ptxh) +NVCCFLAGS += -I./include -I. +.PHONY: kernels +kernels: $(PTX) $(PTXH) -# Support older CUDA versions out of the box -NVCCVERSION=$(shell "${NVCC}" --version | grep ^Cuda | sed 's/^.* //g') -ifeq "${NVCCVERSION}" "V5.5.22" - NVCCFLAGS ?= -arch sm_20 -else - NVCCFLAGS ?= -arch sm_50 -endif +%.ptx: %.cu + $(NVCC) $(NVCCFLAGS) -ptx $< -o $@ -#NVCCFLAGS = -arch sm_35 -#NVCCFLAGS ?= -arch sm_30 -#NVCCFLAGS = -arch sm_20 -NVCCFLAGS += ${testRunFlags} +include/%.ptxh: CUDAKernels/%.ptx + xxd -i $< $@ -# Use with Mac OS X -# NVCCFLAGS = -arch sm_12 -Xcompiler="-Duint=unsigned\ int" +endif -LDFLAGS = -lcuda -fopenmp +# OpenCL kernels +ifeq ($(BACKEND), OpenCL) -INCLUDEPATH = ./include -CXXFLAGS += -I$(INCLUDEPATH) -I./ -NVCCFLAGS += -I$(INCLUDEPATH) -I./ +ifdef OPENCL + CXXFLAGS += -I$(OPENCL)/include + LDFLAGS += -L$(OPENCL)/lib +endif -INTERFACEPATH =./interfaces +INCLUDES = +CXXFLAGS += -D_OCL_ -D__INCLUDE_KERNELS__ +LDFLAGS += -lOpenCL -fopenmp -CUDAKERNELSPATH = ./CUDAKernels -CUDAKERNELS = kernels.cu +OPENCL_SRC = $(wildcard OpenCLKernels/*.cl) +CLH = $(OPENCL_SRC:OpenCLKernels/%.cl=include/%.clh) -CUDAPTX = $(CUDAKERNELS:%.cu=$(CUDAKERNELSPATH)/%.ptx) +.PHONY: kernels +kernels: $(CLH) -SRCPATH = src -SRC = sapporohostclass.cpp sapporoG6lib.cpp sapporoYeblib.cpp sapporoG5lib.cpp sapporo6thlib.cpp -OBJ = $(SRC:%.cpp=%.o) +%.cle: %.cl + $(CC) -E -IOpenCLKernels -o $@ - <$< -LIBOBJ = sapporohostclass.o $(INTERFACEPATH)/sapporoG6lib.o $(INTERFACEPATH)/sapporoYeblib.o -LIBOBJ += $(INTERFACEPATH)/sapporoG5lib.o $(INTERFACEPATH)/sapporo6thlib.o -TARGET = libsapporo.a +include/%.clh: OpenCLKernels/%.cle + xxd -i $< $@ +endif -all: $(OBJ) $(CUDAPTX) $(TARGET) -kernels: $(CUDAPTX) +# Main implementation +CXX_SRC := $(wildcard src/*.cpp) +OBJS := $(CXX_SRC:%.cpp=%.o) +INCLUDES = -Iinclude -I. +CXXFLAGS += $(INCLUDES) -fPIC -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp -$(TARGET): $(LIBOBJ) - ar qv $@ $^ +src/sapporohostclass.o: kernels -%.o: $(SRCPATH)/%.cpp +%.o: %.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ -$(CUDAKERNELSPATH)/%.ptx: $(CUDAKERNELSPATH)/%.cu - $(NVCC) $(NVCCFLAGS) -ptx $< -o $@ - -$(INCLUDEPATH)/%.ptxh: $(CUDAKERNELSPATH)/%.ptx - xxd -i $< $@ - -clean: - /bin/rm -rf *.o *.ptx *.a - cd $(INTERFACEPATH); /bin/rm -rf *.o; cd .. - cd $(CUDAKERNELSPATH); /bin/rm -rf *.ptx; cd .. - rm -f *.ptxh $(INCLUDEPATH)/*.ptxh - -$(OBJ): $(INCLUDEPATH)/*.h +libsapporo.a: $(OBJS) + ar qv $@ $^ - -sapporohostclass.o : $(INCLUDEPATH)/kernels.ptxh $(INCLUDEPATH)/sapporohostclass.h $(INCLUDEPATH)/sapdevclass.h $(INCLUDEPATH)/defines.h -$(CUDAKERNELSPATH)/kernels.ptx : $(INCLUDEPATH)/defines.h - -libsapporo.a : sapporohostclass.o +libsapporo.so: $(OBJS) + $(CXX) -o $@ -shared $^ $(LDFLAGS) +# API compatibility libraries +EMU_SRC := $(wildcard interfaces/*lib.cpp) +EMU_STATIC_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.a) +EMU_SHARED_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.so) +.PHONY: emulated_interfaces +emulated_interfaces: $(EMU_STATIC_LIBS) $(EMU_SHARED_LIBS) +lib%.a: interfaces/%lib.o + ar qv $@ $^ +lib%.so: interfaces/%lib.o + $(CXX) -o $@ -shared $^ -L. -lsapporo $(LDFLAGS) +# Clean-up +.PHONY: clean +clean: + rm -f src/*.o interfaces/*.o *.a *.so + rm -f CUDAKernels/*.ptx OpenCLKernels/*.cle + rm -f include/*.ptxh include/*.clh diff --git a/lib/Makefile_new b/lib/Makefile_new deleted file mode 100644 index 5629335..0000000 --- a/lib/Makefile_new +++ /dev/null @@ -1,176 +0,0 @@ -CXX ?= g++ -CC ?= gcc - -.PHONY: all -all: libsapporo.a libsapporo.so emulated_interfaces - - -# Detect CUDA -ifndef CUDA_TK - NVCC := $(shell which nvcc || echo NOTFOUND) - ifeq ($(NVCC), NOTFOUND) - $(info The nvcc command is not available in your shell.) - $(info To compile with CUDA, please install it, set up your environment) - $(info according to the CUDA installation instructions, and try again.) - $(info ) - else - CUDA_TK := $(dir $(NVCC)).. - CUDA_AVAILABLE := 1 - endif -else - NVCC ?= $(CUDA_TK)/bin/nvcc - CUDA_AVAILABLE := 1 -endif - - -# Detect OpenCL -OPENCL_LDFLAGS := -lOpenCL -ifdef OPENCL - OPENCL_LDFLAGS := -L$(OPENCL)/lib -lOpenCL -endif - -OPENCL_STATUS := $(shell echo 'int main() {}' | g++ -x c++ $(OPENCL_LDFLAGS) - && rm a.out || echo NOTFOUND) - -ifeq ($(OPENCL_STATUS), NOTFOUND) - $(info OpenCL support was not detected on the system.) - $(info If it is installed in a non-standard location, then set OPENCL to) - $(info the installation prefix and try again.) - $(info ) -else - OPENCL_AVAILABLE := 1 -endif - - -# Select backend -ifndef BACKEND - ifdef CUDA_AVAILABLE - $(info BACKEND not set and CUDA was detected, using CUDA) - BACKEND := CUDA - else - ifdef OPENCL_AVAILABLE - $(info BACKEND not set and OpenCL was detected, using OpenCL) - BACKEND := OpenCL - else - $(error BACKEND not set and neither CUDA nor OpenGL was detected.) - endif - endif -else - ifeq ($(BACKEND), CUDA) - ifndef CUDA_AVAILABLE - $(error BACKEND set to CUDA but it was not found.) - endif - $(info Using selected backend CUDA) - else - ifeq ($(BACKEND), OpenCL) - ifndef OPENCL_AVAILABLE - $(error BACKEND set to OpenCL but it was not found.) - endif - else - $(error BACKEND set to unknown value "$(BACKEND)", please use CUDA or OpenCL) - endif - $(info Using selected backend OpenCL) - endif -endif -$(info ) - -# Testing/optimisation support -ifdef NTHREADS - CXXFLAGS += -DNTHREADS=$(NTHREADS) -DTIMING_STATS=1 -endif - -ifdef NBLOCKS_PER_MULTI - CXXFLAGS += -DNBLOCKS_PER_MULTI=$(NBLOCKS_PER_MULTI) -DTIMING_STATS=1 -endif - - -# CUDA kernels -ifeq ($(BACKEND), CUDA) - -INCLUDES = -I$(CUDA_TK) -LDFLAGS += -lcuda -fopenmp - -CUDA_SRC = $(wildcard CUDAKernels/*.cu) -PTX = $(CUDA_SRC:CUDAKernels/%.cu=CUDAKernels/%.ptx) -# The .ptxh version seems to be unused -PTXH = $(CUDA_SRC:CUDAKernels/%.cu=include/%.ptxh) -NVCCFLAGS += -I./include -I. - -.PHONY: kernels -kernels: $(PTX) $(PTXH) - -%.ptx: %.cu - $(NVCC) $(NVCCFLAGS) -ptx $< -o $@ - -include/%.ptxh: CUDAKernels/%.ptx - xxd -i $< $@ - -endif - - -# OpenCL kernels -ifeq ($(BACKEND), OpenCL) - -ifdef OPENCL - CXXFLAGS += -I$(OPENCL)/include - LDFLAGS += -L$(OPENCL)/lib -endif - -INCLUDES = -CXXFLAGS += -D_OCL_ -D__INCLUDE_KERNELS__ -LDFLAGS += -lOpenCL -fopenmp - -OPENCL_SRC = $(wildcard OpenCLKernels/*.cl) -CLH = $(OPENCL_SRC:OpenCLKernels/%.cl=include/%.clh) - -.PHONY: kernels -kernels: $(CLH) - -%.cle: %.cl - $(CC) -E -IOpenCLKernels -o $@ - <$< - -include/%.clh: OpenCLKernels/%.cle - xxd -i $< $@ - -endif - - -# Main implementation -CXX_SRC := $(wildcard src/*.cpp) -OBJS := $(CXX_SRC:%.cpp=%.o) -INCLUDES = -Iinclude -I. -CXXFLAGS += $(INCLUDES) -fPIC -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp - -src/sapporohostclass.o: kernels - -%.o: %.cpp - $(CXX) $(CXXFLAGS) -c $< -o $@ - -libsapporo.a: $(OBJS) - ar qv $@ $^ - -libsapporo.so: $(OBJS) - $(CXX) -o $@ -shared $^ $(LDFLAGS) - - -# API compatibility libraries -EMU_SRC := $(wildcard interfaces/*lib.cpp) -EMU_STATIC_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.a) -EMU_SHARED_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.so) - -.PHONY: emulated_interfaces -emulated_interfaces: $(EMU_STATIC_LIBS) $(EMU_SHARED_LIBS) - -lib%.a: interfaces/%lib.o - ar qv $@ $^ - -lib%.so: interfaces/%lib.o - $(CXX) -o $@ -shared $^ -L. -lsapporo $(LDFLAGS) - - -# Clean-up -.PHONY: clean -clean: - rm -f src/*.o interfaces/*.o *.a *.so - rm -f CUDAKernels/*.ptx OpenCLKernels/*.cle - rm -f include/*.ptxh include/*.clh - diff --git a/lib/Makefile_ocl b/lib/Makefile_ocl deleted file mode 100644 index 1c6e19e..0000000 --- a/lib/Makefile_ocl +++ /dev/null @@ -1,105 +0,0 @@ -CXX = g++ -CC = gcc -LD = g++ -F90 = ifort - -.SUFFIXES: .o .cpp .ptx .cu - -CUDA_TK = /usr/local/cuda -#CUDA_TK = /opt/AMDAPP/ - - -testRunFlags1= -testRunFlags2= -testRunFlags3= - -#Check for the defines - -ifdef NTHREADS - testRunFlags1="-D NTHREADS=$(NTHREADS)" - testRunFlags3="-D TIMING_STATS=1" -endif - -ifdef NBLOCKS_PER_MULTI - testRunFlags2="-D NBLOCKS_PER_MULTI=$(NBLOCKS_PER_MULTI)" - testRunFlags3="-D TIMING_STATS=1" -endif - -OFLAGS = -g -Wall -Wextra -Wstrict-aliasing=2 -fopenmp -D_OCL_ -D__INCLUDE_KERNELS__ - -CXXFLAGS += ${testRunFlags} -fPIC $(OFLAGS) -I$(CUDA_TK)/include -msse4 - -testRunFlags= $(testRunFlags1) $(testRunFlags2) $(testRunFlags3) -$(info $(testRunFlags)) - -NVCC = $(CUDA_TK)/bin/nvcc -NVCCFLAGS = -arch sm_20 ${testRunFlags} - -# Use with Mac OS X -# NVCCFLAGS = -arch sm_12 -Xcompiler="-Duint=unsigned\ int" - -LDFLAGS = -lOpenCL -fopenmp - - -INCLUDEPATH = ./include -CXXFLAGS += -I$(INCLUDEPATH) -I./ -NVCCFLAGS += -I$(INCLUDEPATH) -I./ - -INTERFACEPATH =./interfaces - -SRCPATH = src -SRC = sapporohostclass.cpp sapporoG6lib.cpp sapporoYeblib.cpp sapporoG5lib.cpp sapporo6thlib.cpp -OBJ = $(SRC:%.cpp=%_ocl.o) - -LIBOBJ = sapporohostclass_ocl.o $(INTERFACEPATH)/sapporoG6lib_ocl.o $(INTERFACEPATH)/sapporoYeblib_ocl.o $(INTERFACEPATH)/sapporoG5lib_ocl.o -TARGET = libsapporo_ocl.a - -OCLKERNELSPATH = ./OpenCLKernels -OCLKERNELS=kernels4th.cl kernels4thDP.cl kernels6th.cl kernelsG5DS.cl kernelsG5SP.cl sharedKernels.cl -OPENCL_CLH = $(OCLKERNELS:%.cl=$(INCLUDEPATH)/%.clh) - -all: $(OBJ) $(TARGET) - echo $(OPENCL_CLH) - -$(TARGET): $(LIBOBJ) - ar qv $@ $^ - -%_ocl.o: $(SRCPATH)/%.cpp - $(CXX) $(CXXFLAGS) -c $< -o $@ - -$(INTERFACEPATH)/%_ocl.o: $(INTERFACEPATH)/%.cpp - $(CXX) $(CXXFLAGS) -c $< -o $@ - - -$(CUDAKERNELSPATH)/%.ptx: $(CUDAKERNELSPATH)/%.cu - $(NVCC) $(NVCCFLAGS) -ptx $< -o $@ - - -$(OCLKERNELSPATH)/%.cle: $(OCLKERNELSPATH)/%.cl - rm -f OpenCL - ln -s $(OCLKERNELSPATH) OpenCL - $(CC) -E -I. -c - -o $@ < $< - rm OpenCL - -$(INCLUDEPATH)/%.clh: $(OCLKERNELSPATH)/%.cle - xxd -i $< $@ - -clean: - /bin/rm -rf *.o *.ptx *.a - cd $(INTERFACEPATH); /bin/rm -rf *.o; cd .. - cd $(CUDAKERNELSPATH); /bin/rm -rf *.ptx; cd .. - rm -f *.clh $(INCLUDEPATH)/*.clh - -$(OBJ): $(INCLUDEPATH)/*.h - - -sapporohostclass_ocl.o : $(OPENCL_CLH) $(INCLUDEPATH)/sapporohostclass.h $(INCLUDEPATH)/sapdevclass.h $(INCLUDEPATH)/defines.h - -libsapporo_ocl.a : sapporohostclass_ocl.o - - - - - - - From 54cae9fee357e9936b78c60d5bebc9dbf160c1af Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 31 Oct 2023 11:42:03 +0100 Subject: [PATCH 11/28] Update build instructions in the README --- README.extended | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/README.extended b/README.extended index c604e72..d81cb59 100644 --- a/README.extended +++ b/README.extended @@ -86,15 +86,33 @@ With some luck a simple 'make' in the lib folder is sufficient to build the library, if not then here are some pointers: CUDA -To build the CUDA library; Set the 'CUDA_TK' path to the location -where the CUDA toolkit is installed e.g.. CUDA_TK = /usr/local/cuda and -type: 'make' . + +If CUDA is installed via the nVidia installer, Conda, or HPC modules, +then it should be detected automatically by the build system. If it +somehow isn't, then you can try to set 'CUDA_TK' to the location where +it is installed, e.g. + +CUDA_TK=/usr/local/cuda make + OpenCL -To build the CUDA library; Set the 'CUDA_TK' path to the location -where the cuda or AMD OpenCL toolkit is installed eg. CUDA_TK = -/usr/local/cuda or CUDA_TK = /opt/AMDAPP/ and type: 'make -f -Makefile_ocl' . + +If OpenCL is installed in a standard location (e.g. via apt or yum), +then it should be detected automatically by the build system. If it +isn't, then you can set the 'OPENCL' variable to the location where +it is installed, e.g. + +OPENCL=/opt/opencl make + +If both CUDA and OpenCL are detected, then CUDA is used by default. +To select OpenCL, set BACKEND to OpenCL: + +BACKEND=OpenCL make + +You can combine these options as well: + +BACKEND=OpenCL OPENCL=/home/user/.local make + Interfaces: The library has built-in support for a couple of default interfaces to From feeb564d24fdb49e276a1c31168f0028543c1253 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 1 Nov 2023 17:57:33 +0100 Subject: [PATCH 12/28] Make dependency of interfaces on libsapporo explicit --- lib/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/Makefile b/lib/Makefile index 5629335..02b17ef 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -160,6 +160,11 @@ EMU_SHARED_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.so) .PHONY: emulated_interfaces emulated_interfaces: $(EMU_STATIC_LIBS) $(EMU_SHARED_LIBS) +$(EMU_STATIC_LIBS): libsapporo.a + +$(EMU_SHARED_LIBS): libsapporo.so + + lib%.a: interfaces/%lib.o ar qv $@ $^ From b3d49516e737c76ce3761db5800f68dec25b5743 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 1 Nov 2023 18:09:02 +0100 Subject: [PATCH 13/28] Enable make clean even if there's no CUDA or OpenGL available --- lib/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Makefile b/lib/Makefile index 02b17ef..5008d64 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -42,6 +42,7 @@ endif # Select backend +ifeq ($(filter clean,$(MAKECMDGOALS)),) ifndef BACKEND ifdef CUDA_AVAILABLE $(info BACKEND not set and CUDA was detected, using CUDA) @@ -71,6 +72,7 @@ else $(info Using selected backend OpenCL) endif endif +endif $(info ) # Testing/optimisation support From 674292caf3d540a800e600a8541d5283b9ef0d62 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 1 Nov 2023 18:29:35 +0100 Subject: [PATCH 14/28] Include kernels also when compiling with CUDA --- lib/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Makefile b/lib/Makefile index 5008d64..34a2f63 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -89,6 +89,7 @@ endif ifeq ($(BACKEND), CUDA) INCLUDES = -I$(CUDA_TK) +CXXFLAGS += -D__INCLUDE_KERNELS__ LDFLAGS += -lcuda -fopenmp CUDA_SRC = $(wildcard CUDAKernels/*.cu) From 19ac0b9ac97d460af449c5b09c49fa2b08f64f07 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 2 Nov 2023 16:52:45 +0100 Subject: [PATCH 15/28] Make .gitignore only match the executables, not the source in the subdir --- testCodes/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testCodes/.gitignore b/testCodes/.gitignore index bd0f7ea..583b07d 100644 --- a/testCodes/.gitignore +++ b/testCodes/.gitignore @@ -1,2 +1,2 @@ -test_* +/test_* CUDA From 12e8f1382b4c0fc503b9e30be1554475180f9f9d Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 2 Nov 2023 17:15:44 +0100 Subject: [PATCH 16/28] Also update the OpenCL test makefile --- testCodes/Makefile_ocl | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/testCodes/Makefile_ocl b/testCodes/Makefile_ocl index 9a50894..f5a3bf2 100644 --- a/testCodes/Makefile_ocl +++ b/testCodes/Makefile_ocl @@ -1,7 +1,4 @@ -CXX = g++ -CC = gcc -LD = g++ -F90 = ifort +CXX ?= g++ .SUFFIXES: .o .cpp .ptx .cu @@ -9,13 +6,13 @@ SAPPOROPATH=../lib/ SAPLIB2 = sapporo_ocl SAPLIB = lib$(SAPLIB2).a -CUDA_TK = /usr/local/cuda +CUDA_TK ?= /usr/local/cuda #CUDA_TK = /opt/AMDAPP/ -OFLAGS = -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp -CXXFLAGS = -fPIC $(OFLAGS) -D_OCL_ +OFLAGS = -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp +CXXFLAGS += -fPIC -fopenmp $(OFLAGS) -D_OCL_ -LDFLAGS = -lOpenCL -fopenmp +LDFLAGS += -lOpenCL -fopenmp INCLUDEPATH = $(SAPPOROPATH)/include CXXFLAGS += -I$(INCLUDEPATH) -I./ -I $(SAPPOROPATH) -I$(CUDA_TK)/include @@ -33,28 +30,28 @@ kernels: ln -s $(SAPPOROPATH)/OpenCLKernels OpenCL test_gravity_block_ocl : test_gravity_block_ocl.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_gravity_block_g5_ocl: test_gravity_block_g5_ocl.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_gravity_block_6th_ocl : test_gravity_block_6th_ocl.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_performance_rangeN_ocl : test_performance_rangeN_ocl.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_performance_blockStep_ocl : test_performance_blockStep_ocl.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_performance_rangeN_6th_ocl : test_performance_rangeN_6th_ocl.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_performance_rangeN_g5_ocl : test_performance_rangeN_g5_ocl.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_integrator_ocl : test_integrator_ocl.o - $(LD) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) %_ocl.o: $(SRCPATH)/%.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ From 1bd149e2567204c6fd8665112494b62bbe61a8ca Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 2 Nov 2023 17:54:31 +0100 Subject: [PATCH 17/28] Rearrange directory structure --- lib/Makefile => Makefile | 47 +++++++++---------- {lib/CUDAKernels => src/CUDA}/kernels.cu | 2 +- .../OpenCL}/kernels4th.cl | 2 +- .../OpenCL}/kernels4thDP.cl | 2 +- .../OpenCL}/kernels6th.cl | 2 +- .../OpenCL}/kernelsG5DS.cl | 2 +- .../OpenCL}/kernelsG5SP.cl | 2 +- .../OpenCL}/sharedKernels.cl | 0 .../include/SSE_AVX/AVX => src/SSE_AVX}/avx.h | 4 +- .../SSE_AVX/AVX => src/SSE_AVX}/avx_fp32.h | 0 .../SSE_AVX/AVX => src/SSE_AVX}/avx_fp64.h | 0 .../include/SSE_AVX/SSE => src/SSE_AVX}/sse.h | 4 +- .../SSE_AVX/SSE => src/SSE_AVX}/sse_fp32.h | 0 .../SSE_AVX/SSE => src/SSE_AVX}/sse_fp64.h | 0 {lib/include => src}/cudadev.h | 0 {lib/include => src}/defines.h | 2 +- {lib/include => src}/hostFunc.h | 0 {lib => src}/interfaces/sapporo6thlib.cpp | 0 {lib => src}/interfaces/sapporoG5lib.cpp | 0 {lib => src}/interfaces/sapporoG6lib.cpp | 0 {lib => src}/interfaces/sapporoYeblib.cpp | 0 {lib/include => src}/ocldev.h | 0 {lib/include => src}/ocldev.h-default | 0 {lib/include => src}/sapdevclass.h | 6 +-- {lib/src => src}/sapporohostclass.cpp | 14 +++--- {lib/include => src}/sapporohostclass.h | 0 {lib/include => src}/vec.h | 0 {testCodes => tests}/.gitignore | 0 {testCodes => tests}/Makefile | 4 +- {testCodes => tests}/Makefile_ocl | 8 ++-- {testCodes => tests}/README | 0 .../runScripts/compile_timingTest.sh | 0 {testCodes => tests}/runScripts/profiler.conf | 0 .../runScripts/run_activeParticleSizeTest.sh | 0 {testCodes => tests}/runScripts/run_rangeN.sh | 0 {testCodes => tests}/sapporo2.config | 0 {testCodes => tests}/sapporo2.config.README | 0 {testCodes => tests}/src/g6lib.h | 0 .../src/test_gravity_block.cpp | 0 .../src/test_gravity_block_6th.cpp | 0 .../src/test_gravity_block_g5.cpp | 0 {testCodes => tests}/src/test_integrator.cpp | 0 .../src/test_performance_blockStep.cpp | 0 .../src/test_performance_rangeN.cpp | 0 .../src/test_performance_rangeN_6th.cpp | 0 .../src/test_performance_rangeN_g5.cpp | 0 46 files changed, 49 insertions(+), 52 deletions(-) rename lib/Makefile => Makefile (78%) rename {lib/CUDAKernels => src/CUDA}/kernels.cu (99%) rename {lib/OpenCLKernels => src/OpenCL}/kernels4th.cl (99%) rename {lib/OpenCLKernels => src/OpenCL}/kernels4thDP.cl (99%) rename {lib/OpenCLKernels => src/OpenCL}/kernels6th.cl (99%) rename {lib/OpenCLKernels => src/OpenCL}/kernelsG5DS.cl (99%) rename {lib/OpenCLKernels => src/OpenCL}/kernelsG5SP.cl (99%) rename {lib/OpenCLKernels => src/OpenCL}/sharedKernels.cl (100%) rename {lib/include/SSE_AVX/AVX => src/SSE_AVX}/avx.h (94%) rename {lib/include/SSE_AVX/AVX => src/SSE_AVX}/avx_fp32.h (100%) rename {lib/include/SSE_AVX/AVX => src/SSE_AVX}/avx_fp64.h (100%) rename {lib/include/SSE_AVX/SSE => src/SSE_AVX}/sse.h (94%) rename {lib/include/SSE_AVX/SSE => src/SSE_AVX}/sse_fp32.h (100%) rename {lib/include/SSE_AVX/SSE => src/SSE_AVX}/sse_fp64.h (100%) rename {lib/include => src}/cudadev.h (100%) rename {lib/include => src}/defines.h (98%) rename {lib/include => src}/hostFunc.h (100%) rename {lib => src}/interfaces/sapporo6thlib.cpp (100%) rename {lib => src}/interfaces/sapporoG5lib.cpp (100%) rename {lib => src}/interfaces/sapporoG6lib.cpp (100%) rename {lib => src}/interfaces/sapporoYeblib.cpp (100%) rename {lib/include => src}/ocldev.h (100%) rename {lib/include => src}/ocldev.h-default (100%) rename {lib/include => src}/sapdevclass.h (99%) rename {lib/src => src}/sapporohostclass.cpp (99%) rename {lib/include => src}/sapporohostclass.h (100%) rename {lib/include => src}/vec.h (100%) rename {testCodes => tests}/.gitignore (100%) rename {testCodes => tests}/Makefile (96%) rename {testCodes => tests}/Makefile_ocl (92%) rename {testCodes => tests}/README (100%) rename {testCodes => tests}/runScripts/compile_timingTest.sh (100%) rename {testCodes => tests}/runScripts/profiler.conf (100%) rename {testCodes => tests}/runScripts/run_activeParticleSizeTest.sh (100%) rename {testCodes => tests}/runScripts/run_rangeN.sh (100%) rename {testCodes => tests}/sapporo2.config (100%) rename {testCodes => tests}/sapporo2.config.README (100%) rename {testCodes => tests}/src/g6lib.h (100%) rename {testCodes => tests}/src/test_gravity_block.cpp (100%) rename {testCodes => tests}/src/test_gravity_block_6th.cpp (100%) rename {testCodes => tests}/src/test_gravity_block_g5.cpp (100%) rename {testCodes => tests}/src/test_integrator.cpp (100%) rename {testCodes => tests}/src/test_performance_blockStep.cpp (100%) rename {testCodes => tests}/src/test_performance_rangeN.cpp (100%) rename {testCodes => tests}/src/test_performance_rangeN_6th.cpp (100%) rename {testCodes => tests}/src/test_performance_rangeN_g5.cpp (100%) diff --git a/lib/Makefile b/Makefile similarity index 78% rename from lib/Makefile rename to Makefile index 34a2f63..fcc77e8 100644 --- a/lib/Makefile +++ b/Makefile @@ -92,19 +92,17 @@ INCLUDES = -I$(CUDA_TK) CXXFLAGS += -D__INCLUDE_KERNELS__ LDFLAGS += -lcuda -fopenmp -CUDA_SRC = $(wildcard CUDAKernels/*.cu) -PTX = $(CUDA_SRC:CUDAKernels/%.cu=CUDAKernels/%.ptx) -# The .ptxh version seems to be unused -PTXH = $(CUDA_SRC:CUDAKernels/%.cu=include/%.ptxh) -NVCCFLAGS += -I./include -I. +CUDA_SRC = $(wildcard src/CUDA/*.cu) +PTX = $(CUDA_SRC:src/CUDA/%.cu=src/CUDA/%.ptx) +PTXH = $(CUDA_SRC:src/CUDA/%.cu=src/CUDA/%.ptxh) +NVCCFLAGS += -Isrc -.PHONY: kernels -kernels: $(PTX) $(PTXH) +KERNELS = $(PTX) $(PTXH) %.ptx: %.cu $(NVCC) $(NVCCFLAGS) -ptx $< -o $@ -include/%.ptxh: CUDAKernels/%.ptx +src/CUDA/%.ptxh: src/CUDA/%.ptx xxd -i $< $@ endif @@ -122,28 +120,28 @@ INCLUDES = CXXFLAGS += -D_OCL_ -D__INCLUDE_KERNELS__ LDFLAGS += -lOpenCL -fopenmp -OPENCL_SRC = $(wildcard OpenCLKernels/*.cl) -CLH = $(OPENCL_SRC:OpenCLKernels/%.cl=include/%.clh) +OPENCL_SRC = $(wildcard src/OpenCL/*.cl) +CLE = $(OPENCL_SRC:src/OpenCL/%.cl=src/OpenCL/%.cle) +CLH = $(OPENCL_SRC:src/OpenCL/%.cl=src/OpenCL/%.clh) -.PHONY: kernels -kernels: $(CLH) +KERNELS = $(CLE) $(CLH) %.cle: %.cl - $(CC) -E -IOpenCLKernels -o $@ - <$< + $(CC) -E -Isrc -o $@ - <$< -include/%.clh: OpenCLKernels/%.cle +src/OpenCL/%.clh: src/OpenCL/%.cle xxd -i $< $@ endif # Main implementation -CXX_SRC := $(wildcard src/*.cpp) +CXX_SRC := $(wildcard src/*.cpp src/SSE_AVX/*.cpp) OBJS := $(CXX_SRC:%.cpp=%.o) -INCLUDES = -Iinclude -I. +INCLUDES = -Isrc CXXFLAGS += $(INCLUDES) -fPIC -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp -src/sapporohostclass.o: kernels +src/sapporohostclass.o: $(KERNELS) %.o: %.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -156,9 +154,9 @@ libsapporo.so: $(OBJS) # API compatibility libraries -EMU_SRC := $(wildcard interfaces/*lib.cpp) -EMU_STATIC_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.a) -EMU_SHARED_LIBS := $(EMU_SRC:interfaces/%lib.cpp=lib%.so) +EMU_SRC := $(wildcard src/interfaces/*lib.cpp) +EMU_STATIC_LIBS := $(EMU_SRC:src/interfaces/%lib.cpp=lib%.a) +EMU_SHARED_LIBS := $(EMU_SRC:src/interfaces/%lib.cpp=lib%.so) .PHONY: emulated_interfaces emulated_interfaces: $(EMU_STATIC_LIBS) $(EMU_SHARED_LIBS) @@ -168,17 +166,16 @@ $(EMU_STATIC_LIBS): libsapporo.a $(EMU_SHARED_LIBS): libsapporo.so -lib%.a: interfaces/%lib.o +lib%.a: src/interfaces/%lib.o ar qv $@ $^ -lib%.so: interfaces/%lib.o +lib%.so: src/interfaces/%lib.o $(CXX) -o $@ -shared $^ -L. -lsapporo $(LDFLAGS) # Clean-up .PHONY: clean clean: - rm -f src/*.o interfaces/*.o *.a *.so - rm -f CUDAKernels/*.ptx OpenCLKernels/*.cle - rm -f include/*.ptxh include/*.clh + rm -f *.a *.so src/*.o src/SSE_AVX/SSE/*.o src/SSE_AVX/AVX/*.o + rm -f src/CUDA/*.ptx src/CUDA/*.ptxh src/OpenCL/*.cle src/OpenCL/*.clh diff --git a/lib/CUDAKernels/kernels.cu b/src/CUDA/kernels.cu similarity index 99% rename from lib/CUDAKernels/kernels.cu rename to src/CUDA/kernels.cu index 66822d7..b0ca6ad 100644 --- a/lib/CUDAKernels/kernels.cu +++ b/src/CUDA/kernels.cu @@ -15,7 +15,7 @@ Sixt order hermite, in double precision, including neighbour lists #include -#include "include/defines.h" +#include "defines.h" #define inout #define __out diff --git a/lib/OpenCLKernels/kernels4th.cl b/src/OpenCL/kernels4th.cl similarity index 99% rename from lib/OpenCLKernels/kernels4th.cl rename to src/OpenCL/kernels4th.cl index 495f4cd..9c6d670 100644 --- a/lib/OpenCLKernels/kernels4th.cl +++ b/src/OpenCL/kernels4th.cl @@ -8,7 +8,7 @@ OpenCL Fourth order DoubleSingle kernels */ -#include "sharedKernels.cl" +#include "OpenCL/sharedKernels.cl" #define CAST 1 diff --git a/lib/OpenCLKernels/kernels4thDP.cl b/src/OpenCL/kernels4thDP.cl similarity index 99% rename from lib/OpenCLKernels/kernels4thDP.cl rename to src/OpenCL/kernels4thDP.cl index 3a8c855..d44e7d5 100644 --- a/lib/OpenCLKernels/kernels4thDP.cl +++ b/src/OpenCL/kernels4thDP.cl @@ -8,7 +8,7 @@ OpenCL Fourth order Double Precision */ -#include "sharedKernels.cl" +#include "OpenCL/sharedKernels.cl" __inline void body_body_interaction(inout float2 *ds2_min, diff --git a/lib/OpenCLKernels/kernels6th.cl b/src/OpenCL/kernels6th.cl similarity index 99% rename from lib/OpenCLKernels/kernels6th.cl rename to src/OpenCL/kernels6th.cl index ae0a475..53cc577 100644 --- a/lib/OpenCLKernels/kernels6th.cl +++ b/src/OpenCL/kernels6th.cl @@ -8,7 +8,7 @@ OpenCL Double Precision */ -#include "sharedKernels.cl" +#include "OpenCL/sharedKernels.cl" diff --git a/lib/OpenCLKernels/kernelsG5DS.cl b/src/OpenCL/kernelsG5DS.cl similarity index 99% rename from lib/OpenCLKernels/kernelsG5DS.cl rename to src/OpenCL/kernelsG5DS.cl index 449326e..fea5f8f 100644 --- a/lib/OpenCLKernels/kernelsG5DS.cl +++ b/src/OpenCL/kernelsG5DS.cl @@ -9,7 +9,7 @@ OpenCL Double Single kernels */ -#include "sharedKernels.cl" +#include "OpenCL/sharedKernels.cl" diff --git a/lib/OpenCLKernels/kernelsG5SP.cl b/src/OpenCL/kernelsG5SP.cl similarity index 99% rename from lib/OpenCLKernels/kernelsG5SP.cl rename to src/OpenCL/kernelsG5SP.cl index 5de684f..f1077cf 100644 --- a/lib/OpenCLKernels/kernelsG5SP.cl +++ b/src/OpenCL/kernelsG5SP.cl @@ -8,7 +8,7 @@ CUDA single precisin kernels */ -#include "sharedKernels.cl" +#include "OpenCL/sharedKernels.cl" __inline void body_body_interaction(inout float4 *acc_i, diff --git a/lib/OpenCLKernels/sharedKernels.cl b/src/OpenCL/sharedKernels.cl similarity index 100% rename from lib/OpenCLKernels/sharedKernels.cl rename to src/OpenCL/sharedKernels.cl diff --git a/lib/include/SSE_AVX/AVX/avx.h b/src/SSE_AVX/avx.h similarity index 94% rename from lib/include/SSE_AVX/AVX/avx.h rename to src/SSE_AVX/avx.h index 3e6570f..50352b6 100644 --- a/lib/include/SSE_AVX/AVX/avx.h +++ b/src/SSE_AVX/avx.h @@ -31,8 +31,8 @@ namespace SIMD namespace SIMD { -#include "avx_fp32.h" -#include "avx_fp64.h" +#include "SSE_AVX/avx_fp32.h" +#include "SSE_AVX/avx_fp64.h" template T broadcast(const T x) { return T::template broadcast(x); } diff --git a/lib/include/SSE_AVX/AVX/avx_fp32.h b/src/SSE_AVX/avx_fp32.h similarity index 100% rename from lib/include/SSE_AVX/AVX/avx_fp32.h rename to src/SSE_AVX/avx_fp32.h diff --git a/lib/include/SSE_AVX/AVX/avx_fp64.h b/src/SSE_AVX/avx_fp64.h similarity index 100% rename from lib/include/SSE_AVX/AVX/avx_fp64.h rename to src/SSE_AVX/avx_fp64.h diff --git a/lib/include/SSE_AVX/SSE/sse.h b/src/SSE_AVX/sse.h similarity index 94% rename from lib/include/SSE_AVX/SSE/sse.h rename to src/SSE_AVX/sse.h index 2b276fc..0b4bcf4 100644 --- a/lib/include/SSE_AVX/SSE/sse.h +++ b/src/SSE_AVX/sse.h @@ -31,8 +31,8 @@ namespace SIMD namespace SIMD { -#include "sse_fp32.h" -#include "sse_fp64.h" +#include "SSE_AVX/sse_fp32.h" +#include "SSE_AVX/sse_fp64.h" template T broadcast(const T x) { return T::template broadcast(x); } diff --git a/lib/include/SSE_AVX/SSE/sse_fp32.h b/src/SSE_AVX/sse_fp32.h similarity index 100% rename from lib/include/SSE_AVX/SSE/sse_fp32.h rename to src/SSE_AVX/sse_fp32.h diff --git a/lib/include/SSE_AVX/SSE/sse_fp64.h b/src/SSE_AVX/sse_fp64.h similarity index 100% rename from lib/include/SSE_AVX/SSE/sse_fp64.h rename to src/SSE_AVX/sse_fp64.h diff --git a/lib/include/cudadev.h b/src/cudadev.h similarity index 100% rename from lib/include/cudadev.h rename to src/cudadev.h diff --git a/lib/include/defines.h b/src/defines.h similarity index 98% rename from lib/include/defines.h rename to src/defines.h index e6f3216..726c836 100644 --- a/lib/include/defines.h +++ b/src/defines.h @@ -79,7 +79,7 @@ inline const char* get_kernelName(const int integrator, //Here we come if all switch/case/if combo's failed fprintf(stderr,"ERROR: Unknown combination of integrator type ( %d ) and precision ( %d ) \n", integrator, precision); - fprintf(stderr,"ERROR: See 'include/defines.h' for the possible combinations \n"); + fprintf(stderr,"ERROR: See 'defines.h' for the possible combinations \n"); exit(0); return ""; } diff --git a/lib/include/hostFunc.h b/src/hostFunc.h similarity index 100% rename from lib/include/hostFunc.h rename to src/hostFunc.h diff --git a/lib/interfaces/sapporo6thlib.cpp b/src/interfaces/sapporo6thlib.cpp similarity index 100% rename from lib/interfaces/sapporo6thlib.cpp rename to src/interfaces/sapporo6thlib.cpp diff --git a/lib/interfaces/sapporoG5lib.cpp b/src/interfaces/sapporoG5lib.cpp similarity index 100% rename from lib/interfaces/sapporoG5lib.cpp rename to src/interfaces/sapporoG5lib.cpp diff --git a/lib/interfaces/sapporoG6lib.cpp b/src/interfaces/sapporoG6lib.cpp similarity index 100% rename from lib/interfaces/sapporoG6lib.cpp rename to src/interfaces/sapporoG6lib.cpp diff --git a/lib/interfaces/sapporoYeblib.cpp b/src/interfaces/sapporoYeblib.cpp similarity index 100% rename from lib/interfaces/sapporoYeblib.cpp rename to src/interfaces/sapporoYeblib.cpp diff --git a/lib/include/ocldev.h b/src/ocldev.h similarity index 100% rename from lib/include/ocldev.h rename to src/ocldev.h diff --git a/lib/include/ocldev.h-default b/src/ocldev.h-default similarity index 100% rename from lib/include/ocldev.h-default rename to src/ocldev.h-default diff --git a/lib/include/sapdevclass.h b/src/sapdevclass.h similarity index 99% rename from lib/include/sapdevclass.h rename to src/sapdevclass.h index 01b996d..c7c175b 100644 --- a/lib/include/sapdevclass.h +++ b/src/sapdevclass.h @@ -12,7 +12,7 @@ to allocate, load, start functions, etc. */ #ifdef _OCL_ - #include "include/ocldev.h" + #include "ocldev.h" typedef cl_float2 float2; typedef cl_float4 float4; @@ -28,7 +28,7 @@ to allocate, load, start functions, etc. typedef cl_int4 int4; #else - #include "include/cudadev.h" + #include "cudadev.h" #endif #include @@ -37,7 +37,7 @@ to allocate, load, start functions, etc. #include -#include "include/defines.h" +#include "defines.h" namespace sapporo2 { diff --git a/lib/src/sapporohostclass.cpp b/src/sapporohostclass.cpp similarity index 99% rename from lib/src/sapporohostclass.cpp rename to src/sapporohostclass.cpp index 6cac1d4..b610a13 100644 --- a/lib/src/sapporohostclass.cpp +++ b/src/sapporohostclass.cpp @@ -18,14 +18,14 @@ vel_j.w = eps2 #ifdef __INCLUDE_KERNELS__ #ifdef _OCL_ -#include "kernels4th.clh" -#include "kernels4thDP.clh" -#include "kernels6th.clh" -#include "kernelsG5DS.clh" -#include "kernelsG5SP.clh" -#include "sharedKernels.clh" +#include "OpenCL/kernels4th.clh" +#include "OpenCL/kernels4thDP.clh" +#include "OpenCL/kernels6th.clh" +#include "OpenCL/kernelsG5DS.clh" +#include "OpenCL/kernelsG5SP.clh" +#include "OpenCL/sharedKernels.clh" #else -#include "kernels.ptxh" +#include "CUDA/kernels.ptxh" #endif #endif diff --git a/lib/include/sapporohostclass.h b/src/sapporohostclass.h similarity index 100% rename from lib/include/sapporohostclass.h rename to src/sapporohostclass.h diff --git a/lib/include/vec.h b/src/vec.h similarity index 100% rename from lib/include/vec.h rename to src/vec.h diff --git a/testCodes/.gitignore b/tests/.gitignore similarity index 100% rename from testCodes/.gitignore rename to tests/.gitignore diff --git a/testCodes/Makefile b/tests/Makefile similarity index 96% rename from testCodes/Makefile rename to tests/Makefile index 7362350..0286390 100644 --- a/testCodes/Makefile +++ b/tests/Makefile @@ -5,7 +5,7 @@ F90 ?= ifort .SUFFIXES: .o .cpp .ptx .cu -SAPPOROPATH=../lib/ +SAPPOROPATH=../ SAPLIB2 = sapporo SAPLIB = lib$(SAPLIB2).a SAPLIBG6 = sapporoG6 @@ -19,7 +19,7 @@ CXXFLAGS += -fPIC -fopenmp $(OFLAGS) -I$(CUDA_TK)/include LDFLAGS += -lcuda -L$(CUDA_TK)/lib64 INCLUDEPATH = $(SAPPOROPATH)/include -CXXFLAGS += -I$(INCLUDEPATH) -I./ -I $(SAPPOROPATH) +CXXFLAGS += -I$(INCLUDEPATH) -I./ -I $(SAPPOROPATH)/src SRCPATH = src diff --git a/testCodes/Makefile_ocl b/tests/Makefile_ocl similarity index 92% rename from testCodes/Makefile_ocl rename to tests/Makefile_ocl index f5a3bf2..e729714 100644 --- a/testCodes/Makefile_ocl +++ b/tests/Makefile_ocl @@ -2,8 +2,8 @@ CXX ?= g++ .SUFFIXES: .o .cpp .ptx .cu -SAPPOROPATH=../lib/ -SAPLIB2 = sapporo_ocl +SAPPOROPATH=.. +SAPLIB2 = sapporo SAPLIB = lib$(SAPLIB2).a CUDA_TK ?= /usr/local/cuda @@ -14,8 +14,8 @@ CXXFLAGS += -fPIC -fopenmp $(OFLAGS) -D_OCL_ LDFLAGS += -lOpenCL -fopenmp -INCLUDEPATH = $(SAPPOROPATH)/include -CXXFLAGS += -I$(INCLUDEPATH) -I./ -I $(SAPPOROPATH) -I$(CUDA_TK)/include +INCLUDEPATH = $(SAPPOROPATH)/include +CXXFLAGS += -I$(INCLUDEPATH) -I./ -I $(SAPPOROPATH)/src -I$(CUDA_TK)/include SRCPATH = src diff --git a/testCodes/README b/tests/README similarity index 100% rename from testCodes/README rename to tests/README diff --git a/testCodes/runScripts/compile_timingTest.sh b/tests/runScripts/compile_timingTest.sh similarity index 100% rename from testCodes/runScripts/compile_timingTest.sh rename to tests/runScripts/compile_timingTest.sh diff --git a/testCodes/runScripts/profiler.conf b/tests/runScripts/profiler.conf similarity index 100% rename from testCodes/runScripts/profiler.conf rename to tests/runScripts/profiler.conf diff --git a/testCodes/runScripts/run_activeParticleSizeTest.sh b/tests/runScripts/run_activeParticleSizeTest.sh similarity index 100% rename from testCodes/runScripts/run_activeParticleSizeTest.sh rename to tests/runScripts/run_activeParticleSizeTest.sh diff --git a/testCodes/runScripts/run_rangeN.sh b/tests/runScripts/run_rangeN.sh similarity index 100% rename from testCodes/runScripts/run_rangeN.sh rename to tests/runScripts/run_rangeN.sh diff --git a/testCodes/sapporo2.config b/tests/sapporo2.config similarity index 100% rename from testCodes/sapporo2.config rename to tests/sapporo2.config diff --git a/testCodes/sapporo2.config.README b/tests/sapporo2.config.README similarity index 100% rename from testCodes/sapporo2.config.README rename to tests/sapporo2.config.README diff --git a/testCodes/src/g6lib.h b/tests/src/g6lib.h similarity index 100% rename from testCodes/src/g6lib.h rename to tests/src/g6lib.h diff --git a/testCodes/src/test_gravity_block.cpp b/tests/src/test_gravity_block.cpp similarity index 100% rename from testCodes/src/test_gravity_block.cpp rename to tests/src/test_gravity_block.cpp diff --git a/testCodes/src/test_gravity_block_6th.cpp b/tests/src/test_gravity_block_6th.cpp similarity index 100% rename from testCodes/src/test_gravity_block_6th.cpp rename to tests/src/test_gravity_block_6th.cpp diff --git a/testCodes/src/test_gravity_block_g5.cpp b/tests/src/test_gravity_block_g5.cpp similarity index 100% rename from testCodes/src/test_gravity_block_g5.cpp rename to tests/src/test_gravity_block_g5.cpp diff --git a/testCodes/src/test_integrator.cpp b/tests/src/test_integrator.cpp similarity index 100% rename from testCodes/src/test_integrator.cpp rename to tests/src/test_integrator.cpp diff --git a/testCodes/src/test_performance_blockStep.cpp b/tests/src/test_performance_blockStep.cpp similarity index 100% rename from testCodes/src/test_performance_blockStep.cpp rename to tests/src/test_performance_blockStep.cpp diff --git a/testCodes/src/test_performance_rangeN.cpp b/tests/src/test_performance_rangeN.cpp similarity index 100% rename from testCodes/src/test_performance_rangeN.cpp rename to tests/src/test_performance_rangeN.cpp diff --git a/testCodes/src/test_performance_rangeN_6th.cpp b/tests/src/test_performance_rangeN_6th.cpp similarity index 100% rename from testCodes/src/test_performance_rangeN_6th.cpp rename to tests/src/test_performance_rangeN_6th.cpp diff --git a/testCodes/src/test_performance_rangeN_g5.cpp b/tests/src/test_performance_rangeN_g5.cpp similarity index 100% rename from testCodes/src/test_performance_rangeN_g5.cpp rename to tests/src/test_performance_rangeN_g5.cpp From cc92791bbcf73cda3702d0fa254680b36f555957 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 2 Nov 2023 17:55:06 +0100 Subject: [PATCH 18/28] Rename license file so that GitHub will pick it up --- gpl-3.0.txt => LICENSE | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename gpl-3.0.txt => LICENSE (100%) diff --git a/gpl-3.0.txt b/LICENSE similarity index 100% rename from gpl-3.0.txt rename to LICENSE From 6c3f8fbfa8b16e9bea85abe4a6f09f7af974cf72 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 2 Nov 2023 19:43:23 +0100 Subject: [PATCH 19/28] Add public API headers for G6 and 6th order --- include/sapporo6thlib.h | 79 +++++++++++++++++ include/sapporoG6lib.h | 187 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 include/sapporo6thlib.h create mode 100644 include/sapporoG6lib.h diff --git a/include/sapporo6thlib.h b/include/sapporo6thlib.h new file mode 100644 index 0000000..87e245d --- /dev/null +++ b/include/sapporo6thlib.h @@ -0,0 +1,79 @@ +/* 6th order integrator as implemented by Sapporo2 +*/ + +void initialize_special(int ndev, int *list); + +void initialize(); + +/* + +add = address +pos, vel, acc, jrk, snp, crk, +mass, +time = current particle time +id = unique particle id +eps2 = softening of j-particle + +*/ +void set_j_particle(int add, double pos[3], double vel[3], double acc[3], + double jrk[3], double snp[3], double crk[3], double mass, + double time, int id, double eps2); + +/* + +Set time of the prediction + +time = time to which particles are predicted +nj = amount of particles that are predicted + +*/ +void predict_all(double time, int nj); + +/* + +Do not execute prediction, but only copy the particles +into the predicted buffers. + +*/ +void no_predict_all(double time, int nj); + +/* + +Return the predicted values for a particle at an address + +addr = address of the particle + +id = the particle id +mass = the mass of the particle +eps2 = the softening value of the particle +pos = buffer to store predicted position +vel = buffer to store predicted velocity +acc = buffer to store predicted acceleration + +*/ +void pick_up_predictor_2(int addr, int &id, double &mass, double &eps2, + double pos[3], double vel[3], double acc[3]); + +/* + +Calculate the gravity on the i-particles + +//Input +ni = number of particles to be integrated +nj = number of sources +pos, vel, acc, mass, eps2 + +//Output +acc, jrk, snp, potential (phi) +nnb = nearest neighbour ID +nnb_r2 = distance to the nearest neighbour. (Squared distance + softening) +nnb_r2 = double r2 = EPS2 + dx*dx + dy*dy + dz*dz; + +*/ + +void calc_force_on_predictors(int ni, int nj, int ids[], double pos[][3], + double vel[][3],double acc[][3], double mass[], + double eps2[], double accNew[][3], double jrkNew[][3], + double snpNew[][3], double crkNew[][3], double phi[], + int nnb[], double nnb_r2[]); + diff --git a/include/sapporoG6lib.h b/include/sapporoG6lib.h new file mode 100644 index 0000000..804fe99 --- /dev/null +++ b/include/sapporoG6lib.h @@ -0,0 +1,187 @@ +/* GRAPE6 API as implemented by Sapporo2 + * + * See https://www.cfca.nao.ac.jp/files/grape6user.pdf for the documentation. + * + * Note that some functions appear to be missing, e.g. g6_set_nip() and + * g6_set_i_particle_scales_from_real_value(). +*/ + +// Fortran ABI +int g6_open_(int *id); + +int g6_close_(int *id); + +int g6_npipes_(); + +int g6_set_tunit_(double*); + +int g6_set_xunit_(double*); + +int g6_set_ti_(int *id, double *ti); + +int g6_set_j_particle_(int *cluster_id, + int *address, + int *index, + double *tj, double *dtj, + double *mass, + double k18[3], double j6[3], + double a2[3], double v[3], double x[3]); + +void g6calc_firsthalf_(int *cluster_id, + int *nj, int *ni, + int index[], + double xi[][3], double vi[][3], + double aold[][3], double j6old[][3], + double phiold[3], + double *eps2, double h2[]); + +int g6calc_lasthalf_(int *cluster_id, + int *nj, int *ni, + int index[], + double xi[][3], double vi[][3], + double *eps2, double h2[], + double acc[][3], double jerk[][3], double pot[]); + +int g6calc_lasthalf2_(int *cluster_id, + int *nj, int *ni, + int index[], + double xi[][3], double vi[][3], + double *eps2, double h2[], + double acc[][3], double jerk[][3], double pot[], + int *inn); + +int g6_initialize_jp_buffer_(int* cluster_id, int* buf_size) { cluster_id = cluster_id; buf_size=buf_size; return 0;} +int g6_flush_jp_buffer_(int* cluster_id) { cluster_id = cluster_id; return 0;} +int g6_reset_(int* cluster_id) {cluster_id = cluster_id; return 0;} +int g6_reset_fofpga_(int* cluster_id) {cluster_id = cluster_id; return 0;} + +int g6_read_neighbour_list_(int* cluster_id); + +int g6_get_neighbour_list_(int *cluster_id, + int *ipipe, + int *maxlength, + int *n_neighbours, + int neighbour_list[]); + +// This is not part of the GRAPE6 API, but is useful for debugging. +void get_j_part_data(int addr, int nj, + double *pos, + double *vel, + double *acc, + double *jrk, + double *ppos, + double *pvel, + double &mass, + double &eps2, + int &id); + + +// C ABI +// These forward to the Fortran versions above, which are actually implemented by +// Sapporo2. + +extern "C" { + +inline int g6_open(int id) { + g6_open_(&id); +} + +inline int g6_close(int id) { + g6_close(&id); +} + +inline int g6_npipes() { + return g6_npipes_(); +} + +inline int g6_set_tunit(double tu) { + return g6_set_tunit_(&tu); +} + +inline int g6_set_xunit(double xu) { + return g6_set_xunit_(&xu); +} + +inline int g6_set_ti(int id, double ti) { + return g6_set_ti_(&id, &ti); +} + +inline int g6_set_j_particle(int cluster_id, + int address, + int index, + double tj, double dtj, + double mass, + double k18[3], double j6[3], + double a2[3], double v[3], double x[3]) +{ + return g6_set_j_particle( + &cluster_id, &address, &index, &tj, &dtj, &mass, k18, j6, a2, v, x); +} + +inline void g6calc_firsthalf(int cluster_id, + int nj, int ni, + int index[], + double xi[][3], double vi[][3], + double aold[][3], double j6old[][3], + double phiold[3], + double eps2, double h2[]) +{ + g6calc_firsthalf( + &cluster_id, &nj, &ni, index, xi, vi, aold, j6old, piold, &eps2, h2); +} + +inline int g6calc_lasthalf(int cluster_id, + int nj, int ni, + int index[], + double xi[][3], double vi[][3], + double eps2, double h2[], + double acc[][3], double jerk[][3], double pot[]) +{ + return g6calc_lasthalf_( + &cluster_id, &nj, &ni, index, xi, vi, &eps2, h2, acc, jerk, pot); +} + +inline int g6calc_lasthalf2(int cluster_id, + int nj, int ni, + int index[], + double xi[][3], double vi[][3], + double eps2, double h2[], + double acc[][3], double jerk[][3], double pot[], + int *inn) +{ + return g6calc_lasthalf2_( + &cluster_id, &nj, &ni, index, xy, vi, &eps2, h2, acc, jerk, pot, inn); +} + +inline int g6_initialize_jp_buffer(int cluster_id, int buf_size) { + return g6_initialize_jp_buffer_(&cluster_id, &buf_size); +} + +inline int g6_flush_jp_buffer(int cluster_id) { + return g6_flush_jp_buffer_(&cluster_id); +} + +inline int g6_reset(int cluster_id) { + return g6_reset_(&cluster_id); +} + +inline int g6_reset_fofpga(int cluster_id) { + return g6_reset_fofpga_(&cluster_id); +} + +inline int g6_read_neighbour_list(int cluster_id) { + return g6_read_neighbour_list_(&cluster_id); +} + +inline int g6_get_neighbour_list(int cluster_id, + int ipipe, + int maxlength, + int n_neighbours, + int neighbour_list[]) +{ + return g6_get_neighbour_list( + &cluster_id, &ipipe, &maxlength, &n_neighbours, neighbour_list); +} + +} + From 644c7278f8e80b92f8e85f993c62d537d597e31d Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 6 Nov 2023 11:57:38 +0100 Subject: [PATCH 20/28] Add make install target --- Makefile | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Makefile b/Makefile index fcc77e8..0ee1e2b 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ CXX ?= g++ CC ?= gcc +PREFIX ?= /usr/local .PHONY: all all: libsapporo.a libsapporo.so emulated_interfaces @@ -173,6 +174,34 @@ lib%.so: src/interfaces/%lib.o $(CXX) -o $@ -shared $^ -L. -lsapporo $(LDFLAGS) +# Installation +INSTALLED_LIBS := $(PREFIX)/lib/libsapporo.a $(PREFIX)/lib/libsapporo.so +INSTALLED_LIBS += $(EMU_STATIC_LIBS:%.a=$(PREFIX)/lib/%.a) +INSTALLED_LIBS += $(EMU_SHARED_LIBS:%.so=$(PREFIX)/lib/%.so) + +INSTALLED_LIBS: $(PREFIX)/lib + +HEADERS := $(wildcard include/*) +INSTALLED_HEADERS := $(HEADERS:include/%=$(PREFIX)/include/%) + +INSTALLED_HEADERS: $(PREFIX)/include + +$(PREFIX)/include: + mkdir -p $(PREFIX)/include + +$(PREFIX)/include/%: include/% $(PREFIX)/include + install -m 644 $< $@ + +$(PREFIX)/lib: + mkdir -p $(PREFIX)/lib + +$(PREFIX)/lib/%: % $(PREFIX)/lib + install -m 644 $< $@ + +.PHONY: install +install: $(INSTALLED_LIBS) $(INSTALLED_HEADERS) + + # Clean-up .PHONY: clean clean: From d4ff5779a86db7ef16e47e3a2b8f2ff138e07579 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 15 Nov 2023 11:22:08 +0100 Subject: [PATCH 21/28] Improve CUDA builds --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 0ee1e2b..0559bfd 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ CXX ?= g++ CC ?= gcc PREFIX ?= /usr/local + .PHONY: all all: libsapporo.a libsapporo.so emulated_interfaces @@ -101,7 +102,7 @@ NVCCFLAGS += -Isrc KERNELS = $(PTX) $(PTXH) %.ptx: %.cu - $(NVCC) $(NVCCFLAGS) -ptx $< -o $@ + $(NVCC) --forward-unknown-to-host-compiler $(CXXFLAGS) $(NVCCFLAGS) -ptx $< -o $@ src/CUDA/%.ptxh: src/CUDA/%.ptx xxd -i $< $@ @@ -139,7 +140,7 @@ endif # Main implementation CXX_SRC := $(wildcard src/*.cpp src/SSE_AVX/*.cpp) OBJS := $(CXX_SRC:%.cpp=%.o) -INCLUDES = -Isrc +INCLUDES += -Isrc CXXFLAGS += $(INCLUDES) -fPIC -g -O3 -Wall -Wextra -Wstrict-aliasing=2 -fopenmp src/sapporohostclass.o: $(KERNELS) From 0730f7adc08b6f3b51d5637f353bac50970f001d Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 15 Nov 2023 11:22:40 +0100 Subject: [PATCH 22/28] Add initial Conda build definition --- conda/conda_build_config.yaml | 4 +++ conda/meta.yaml | 52 +++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 conda/conda_build_config.yaml create mode 100644 conda/meta.yaml diff --git a/conda/conda_build_config.yaml b/conda/conda_build_config.yaml new file mode 100644 index 0000000..99df767 --- /dev/null +++ b/conda/conda_build_config.yaml @@ -0,0 +1,4 @@ +gpu_backend: + - cuda + - opencl + diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 0000000..b0bcb33 --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,52 @@ +{% set name = "sapporo2" %} +{% set version = "0.0.1" %} + +package: + name: {{ name|lower }} + version: {{ version }} + +source: + path: ../ + # git_rev: work + # git_url: https://github.com/LourensVeen/sapporo2.git + +build: + number: 0 + string: {{ gpu_backend }} + + script_env: + - BACKEND=CUDA # [gpu_backend == 'cuda'] + - BACKEND=OpenCL # [gpu_backend == 'opencl'] + script: make install + + missing_dso_whitelist: + - "*/libcuda.so" + +requirements: + build: + - {{ compiler('cxx') }} + - git + - git-lfs + - make + - cuda-compiler # [linux and gpu_backend == 'cuda'] + # - conda-verify + + host: + - ocl-icd # [linux and gpu_backend == 'opencl'] + - ocl_icd_wrapper_apple # [osx and gpu_backend == 'opencl'] + - khronos-opencl-icd-loader # [osx and gpu_backend == 'opencl'] + + run: + - cuda-runtime # [linux and gpu_backend == 'cuda'] + - libgcc-ng + - libstdcxx-ng + - _openmp_mutex + +test: + +about: + home: https://github.com/treecode/sapporo2 + summary: Library for emulating GRAPE6 n-body calculations + license: GPL-3.0-only + license_family: GPL + license_file: LICENSE From 6064eb53fb4bda20debd2f57b2bbc0dbd3cb1062 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 17 Nov 2023 16:33:57 +0100 Subject: [PATCH 23/28] Use CXX environment variable in OpenCL detection --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0559bfd..6f9b61b 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ ifdef OPENCL OPENCL_LDFLAGS := -L$(OPENCL)/lib -lOpenCL endif -OPENCL_STATUS := $(shell echo 'int main() {}' | g++ -x c++ $(OPENCL_LDFLAGS) - && rm a.out || echo NOTFOUND) +OPENCL_STATUS := $(shell echo 'int main() {}' | $(CXX) -x c++ $(OPENCL_LDFLAGS) - && rm a.out || echo NOTFOUND) ifeq ($(OPENCL_STATUS), NOTFOUND) $(info OpenCL support was not detected on the system.) From 6ef3114ca25d4f46cdaacf3cb42e8f8449d907fe Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 21 Nov 2023 11:26:59 +0100 Subject: [PATCH 24/28] Fix conda OpenCL dependencies --- conda/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index b0bcb33..8b9c870 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -29,12 +29,12 @@ requirements: - git-lfs - make - cuda-compiler # [linux and gpu_backend == 'cuda'] + - pocl # [linux and gpu_backend == 'opencl'] + - vim # - conda-verify host: - ocl-icd # [linux and gpu_backend == 'opencl'] - - ocl_icd_wrapper_apple # [osx and gpu_backend == 'opencl'] - - khronos-opencl-icd-loader # [osx and gpu_backend == 'opencl'] run: - cuda-runtime # [linux and gpu_backend == 'cuda'] From b615d2830321469c1bd2833ceb64db6a92f342d5 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 21 Nov 2023 11:27:24 +0100 Subject: [PATCH 25/28] Add build number to the conda build string --- conda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 8b9c870..b7fde44 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -12,7 +12,7 @@ source: build: number: 0 - string: {{ gpu_backend }} + string: {{ gpu_backend }}_{{ PKG_BUILDNUM }} script_env: - BACKEND=CUDA # [gpu_backend == 'cuda'] From 025dea9e324e2fc60e82cf3f998de13e1d926626 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 21 Nov 2023 11:27:48 +0100 Subject: [PATCH 26/28] Fix conda dso whitelist --- conda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index b7fde44..aa16b56 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -20,7 +20,7 @@ build: script: make install missing_dso_whitelist: - - "*/libcuda.so" + - "*/libcuda.so*" requirements: build: From 512c2ddd061434f5e8c682e55d5705c7a70ac117 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 21 Nov 2023 14:43:26 +0100 Subject: [PATCH 27/28] Fix kernel names after directory rearrangement --- Makefile | 4 +++- src/ocldev.h | 28 ++++++++++++++-------------- tests/Makefile | 2 +- tests/Makefile_ocl | 2 +- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 6f9b61b..fd28e3b 100644 --- a/Makefile +++ b/Makefile @@ -131,8 +131,10 @@ KERNELS = $(CLE) $(CLH) %.cle: %.cl $(CC) -E -Isrc -o $@ - <$< +# xxd names the variable after the file name argument, and we expect +# the variable to not have a src_ prefix, so we have to remove it. src/OpenCL/%.clh: src/OpenCL/%.cle - xxd -i $< $@ + cd src && xxd -i $(<:src/%=%) $(@:src/%=%) endif diff --git a/src/ocldev.h b/src/ocldev.h index e621348..68ea286 100644 --- a/src/ocldev.h +++ b/src/ocldev.h @@ -22,18 +22,18 @@ #endif #ifdef __INCLUDE_KERNELS__ -extern unsigned char OpenCLKernels_kernels4th_cle[]; -extern unsigned int OpenCLKernels_kernels4th_cle_len; -extern unsigned char OpenCLKernels_kernels4thDP_cle[]; -extern unsigned int OpenCLKernels_kernels4thDP_cle_len; -extern unsigned char OpenCLKernels_kernels6th_cle[]; -extern unsigned int OpenCLKernels_kernels6th_cle_len; -extern unsigned char OpenCLKernels_kernelsG5DS_cle[]; -extern unsigned int OpenCLKernels_kernelsG5DS_cle_len; -extern unsigned char OpenCLKernels_kernelsG5SP_cle[]; -extern unsigned int OpenCLKernels_kernelsG5SP_cle_len; -extern unsigned char OpenCLKernels_sharedKernels_cle[]; -extern unsigned int OpenCLKernels_sharedKernels_cle_len; +extern unsigned char OpenCL_kernels4th_cle[]; +extern unsigned int OpenCL_kernels4th_cle_len; +extern unsigned char OpenCL_kernels4thDP_cle[]; +extern unsigned int OpenCL_kernels4thDP_cle_len; +extern unsigned char OpenCL_kernels6th_cle[]; +extern unsigned int OpenCL_kernels6th_cle_len; +extern unsigned char OpenCL_kernelsG5DS_cle[]; +extern unsigned int OpenCL_kernelsG5DS_cle_len; +extern unsigned char OpenCL_kernelsG5SP_cle[]; +extern unsigned int OpenCL_kernelsG5SP_cle_len; +extern unsigned char OpenCL_sharedKernels_cle[]; +extern unsigned int OpenCL_sharedKernels_cle_len; #endif namespace dev { @@ -70,8 +70,8 @@ namespace dev { int data_len = 0; if(temp.rfind("kernels4th.cl") != string::npos) { - data = (char *) OpenCLKernels_kernels4th_cle; - data_len = OpenCLKernels_kernels4th_cle_len; + data = (char *) OpenCL_kernels4th_cle; + data_len = OpenCL_kernels4th_cle_len; fprintf(stderr, "Found compiled in version of file: %s\n", cFilename); } else { fprintf(stderr, "Could not find kernel file: %s\n", cFilename); diff --git a/tests/Makefile b/tests/Makefile index 0286390..585382b 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -32,7 +32,7 @@ all: $(OBJ) $(PROG) kernels kernels: - rm -f CUDA && ln -s $(SAPPOROPATH)/CUDAKernels CUDA + rm -f CUDA && ln -s $(SAPPOROPATH)/CUDA CUDA #$(PROG): $(OBJ) # $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -lsapporo diff --git a/tests/Makefile_ocl b/tests/Makefile_ocl index e729714..eb4d7a3 100644 --- a/tests/Makefile_ocl +++ b/tests/Makefile_ocl @@ -27,7 +27,7 @@ PROG = test_gravity_block_ocl test_gravity_block_6th_ocl test_performance_rangeN all: $(OBJ) $(PROG) kernels kernels: - ln -s $(SAPPOROPATH)/OpenCLKernels OpenCL + rm -f OpenCL && ln -s $(SAPPOROPATH)/OpenCL OpenCL test_gravity_block_ocl : test_gravity_block_ocl.o $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) From 0476d61c5349c3e5c97acc33cd0070f24d9b5ae4 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 21 Nov 2023 14:45:07 +0100 Subject: [PATCH 28/28] Fix OpenCL test link and clean up small things --- tests/.Makefile.swp | Bin 0 -> 12288 bytes tests/Makefile | 13 +------------ tests/Makefile_ocl | 16 +++------------- 3 files changed, 4 insertions(+), 25 deletions(-) create mode 100644 tests/.Makefile.swp diff --git a/tests/.Makefile.swp b/tests/.Makefile.swp new file mode 100644 index 0000000000000000000000000000000000000000..6f7c95f377c4421fe7791eb48457d3105f0024ad GIT binary patch literal 12288 zcmeI2O^@3|7{{j|wYxw8l@qsVH!TS(+e_M}OCW7i=cQ@0n<&|oJs={-I|*jJwq-l8 zNYK4-f)ihW3j&D?NOa){5*$H-PrwJL#1W3A|FIp%$tJs%N|B1j($8McGcV8l{me7A zHocWwwKcjbX9=BJFEE2LxnYMoAnKo9qNTJ3I&7$a}^l+eQEV1 zIiG!Hh1Xd+bDF+zsx!B`FbD;N0zv_yfKWgvAQTV^2nB=!|33wM=NQ?;w4R8jR*A01 zrmoSgxCjM=0zv_yfKWgvAQTV^2nB=!LII(GP(Uc~7%HG`5%S`3Le@^edHnx>^!@+W zbA;Rle}Lb?Z{QC27JLpq1-n28J)nUyD1im=>$8OX0=@)afIUzFWl#b+aO)%?SHNY^ zd4`Y;unI1L9FV|2PZM$v{0a8K9qp`5nKf)z>kZB`~ddAD!2sx zULfQf@HMbO8oUJVVgBEP@4!Cz4BQ4Ef)BtDaEGhq-;*-O((abzRLGu=VcX;>6f9Yy1apKr6WDl*0AJT*IS_l=G>^}<1Xgy5Z z9_i3Y-6sa-qb`lp#)B=!=;yR#Dat}Tr|?{cWzCaSKgde-+&OAmRPv_Mhr&} zYcbQ~nM*UxQ7@<;=C;U82*0I;FpW%j9|m=hUM9;&>wktRj)A~-FbETUCXMkjnPZO< z&yl@Pcs7+P34w~3DAH?7%?HW3fRvpfxSctFp1x~_DqUMw>zz_L*tJj~D>zNZ6>plT7wVYY zSq)1!hg*!|t1H~)g5J$wZAf+OnF!48#?6dk8ok_wgcooEQQfSmINJ`hb{ve|811ic zFK!tR5nADcG(g+Nc`Chu5XYJA`mTnn=evgPOPXnDo?#6N*|={E96Cbjl>$|Uo~s~< zYo@{-g*bRy2)16XR`|R2YAz~75qux*&e@EtMTYFSGA|juZ4_jg5V_r{RBD^$cAnxD qBY1?tYlf^3@g4f=O5lcp9oC0#!OdWKnG#hE|JHT>xVWn*CielWSp25| literal 0 HcmV?d00001 diff --git a/tests/Makefile b/tests/Makefile index 585382b..f453db9 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -5,7 +5,7 @@ F90 ?= ifort .SUFFIXES: .o .cpp .ptx .cu -SAPPOROPATH=../ +SAPPOROPATH=.. SAPLIB2 = sapporo SAPLIB = lib$(SAPLIB2).a SAPLIBG6 = sapporoG6 @@ -68,14 +68,3 @@ test_integrator_cuda : test_integrator.o clean: /bin/rm -rf *.o *.ptx *.a $(PROG) CUDA - - -$(OBJ): $(SAPPOROPATH)/$(SAPLIB) - - - - - - - - diff --git a/tests/Makefile_ocl b/tests/Makefile_ocl index eb4d7a3..11ea0c1 100644 --- a/tests/Makefile_ocl +++ b/tests/Makefile_ocl @@ -5,6 +5,7 @@ CXX ?= g++ SAPPOROPATH=.. SAPLIB2 = sapporo SAPLIB = lib$(SAPLIB2).a +SAPLIBG6 = sapporoG6 CUDA_TK ?= /usr/local/cuda #CUDA_TK = /opt/AMDAPP/ @@ -51,22 +52,11 @@ test_performance_rangeN_g5_ocl : test_performance_rangeN_g5_ocl.o $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) test_integrator_ocl : test_integrator_ocl.o - $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) $(LDFLAGS) + $(CXX) $(LDFLAGS) $^ -o $@ -L $(SAPPOROPATH) -l$(SAPLIB2) -l$(SAPLIBG6) $(LDFLAGS) %_ocl.o: $(SRCPATH)/%.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ clean: - /bin/rm -rf *.o *.ptx *.a $(PROG) - - -$(OBJ): $(SAPPOROPATH)/$(SAPLIB) - - - - - - - - + /bin/rm -rf *.o *.ptx *.a $(PROG) OpenCL