#!/usr/bin/make -f
# debian/rules: drives the package build through the dh sequencer.
# The arch-dependent part is configured/built with cmake+ninja, the
# ck4inductor Python module with pybuild, and the documentation with
# rocm-docs-build (skipped when DEB_BUILD_OPTIONS contains "nodoc").

# Provides DEB_HOST_GNU_TYPE (used for the obj-* build directory below) even
# when this file is run directly instead of through dpkg-buildpackage.
include /usr/share/dpkg/architecture.mk

export CXX=clang++
export CC=clang
#export HIPCXX=clang++
#export HIPFLAGS=-std=c++23 -Wno-c++20-extensions
export PATH:=/usr/lib/llvm-22/bin:$(PATH)
export PYBUILD_NAME=ck4inductor

export DEB_BUILD_MAINT_OPTIONS = hardening=+all
export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
export DEB_LDFLAGS_MAINT_APPEND = -Wl,-O1

# Set FULL_TEST=1 to additionally configure, build and install the per-GPU
# test and example trees under build/<arch>.
FULL_TEST ?= 0

# Keep host-only hardening options from affecting device code by prefixing
# them with -Xarch_host.
# NOTE(review): this only has an effect if CXXFLAGS is already set when this
# file is parsed (e.g. coming from the environment) -- confirm.
CXXFLAGS := $(subst -fstack-protector-strong,-Xarch_host -fstack-protector-strong,$(CXXFLAGS))
CXXFLAGS := $(subst -fcf-protection,-Xarch_host -fcf-protection,$(CXXFLAGS))

# When CCACHE_DIR is set, route the C and C++ compilers through ccache.
ifdef CCACHE_DIR
CMAKE_CACHE_FLAGS = \
	-DENABLE_CCACHE_GEMM=ON -DENABLE_CCACHE_GEMM_PRESHUFFLE=ON \
	-DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
CCACHE=ccache
endif

CKCXXDEVFLAGS=-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker -fbracket-depth=512
CKSTDFLAGS=-std=c++23 -Wno-c++20-extensions
# Could also add -Wno-#pragma-messages

# The default _FORTIFY_SOURCE=2 appears to make the build hang in an
# infinite loop:
# https://clang.debian.net/status.php?version=13.0.0&key=BUILD_TIMEOUT
# Disabling _FORTIFY_SOURCE lets the build complete with a reasonable
# amount of memory.
FORTIFLAG0=-DCMAKE_C_FLAGS="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0" \
	-DCMAKE_CXX_FLAGS="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0"
FORTIFLAG1=-DCMAKE_C_FLAGS="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=1" \
	-DCMAKE_CXX_FLAGS="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=1"
FORTIFLAGX=-DCMAKE_C_FLAGS="-U_FORTIFY_SOURCE" \
	-DCMAKE_CXX_FLAGS="-U_FORTIFY_SOURCE"
# Variant actually passed to cmake: undefine _FORTIFY_SOURCE without
# redefining it.
FORTIFLAG=$(FORTIFLAGX)

#-std=c++23
DEV_FLAGS=-DUSE_BITINT_EXTENSION_INT4=OFF -DBUILD_DEV=ON

# For arches see https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
# The embedded double quotes are intentional: they protect the ';' separator
# from the shell wherever this value is expanded inside a recipe.
ROCM_TARGET_ARCH_FIXED="gfx11-generic;gfx12-generic"
# Deliberately recursive (=): rocm-target-arch is only run when a recipe that
# references the list is expanded, not on every parse of this file.
GPU_ARCHS_LIST=$(shell rocm-target-arch)
HIP_PLATFORM="amd"

# cmake options for the main library build.
CMAKE_FLAGS = \
	-DCMAKE_BUILD_TYPE=Release \
	-DHIP_PLATFORM=$(HIP_PLATFORM) \
	-DMIOPEN_REQ_LIBS_ONLY=ON \
	-DGPU_ARCHS=$(ROCM_TARGET_ARCH_FIXED) \
	-DCMAKE_CXX_COMPILER=clang++ \
	$(CMAKE_CACHE_FLAGS) \
	$(FORTIFLAG)

# cmake options for the per-architecture test/example builds (FULL_TEST=1);
# -DGPU_TARGETS is appended per architecture in the configure recipe.
CMAKE_TST_FLAGS = \
	-DCMAKE_BUILD_TYPE=Release \
	-DHIP_PLATFORM=$(HIP_PLATFORM) \
	-DBUILD_TESTING=ON \
	-DSKIP_BROKEN_EXAMPLE=ON \
	-DCMAKE_CXX_COMPILER=clang++ \
	-DSKIP_LONG_BUILD=ON \
	$(CMAKE_CACHE_FLAGS) \
	$(FORTIFLAG)
#	-DCMAKE_HIP_STANDARD=23

# The next make not building tests
# From CMakeLists.txt
# In order to build just the CK library (without tests and examples) for all supported GPU targets
# -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# In order to build CK along with all tests and examples it should be OK
# to set GPU_TARGETS to just 1 or 2 similar architectures.
#
# From TheRock builds
# -DGPU_TARGETS="gfx1100;gfx1101;gfx1102"
# When using GPU_TARGETS the build will only work if the architectures are similar
# -DGPU_TARGETS="gfx908;gfx90a"
# -DGPU_TARGETS="gfx1100;gfx1101;gfx1102"
# None of the following worked (more targets than GPU_TARGETS can manage;
# GPU_ARCHS disables tests)
# -DGPU_TARGETS="$(shell rocm-target-arch --sep ';')"
# -DGPU_ARCHS="$(shell rocm-target-arch --sep ';')"
# From The Rock
# -GNinja
# -DCMAKE_BUILD_TYPE=Release
# -DHIP_PLATFORM=amd
# -DBUILD_TESTING=ON
# -DMIOPEN_REQ_LIBS_ONLY=ON
# -DGPU_TARGETS="gfx1100;gfx1101;gfx1102"

# LP_ENV flag: 1 for Launchpad environment (uses fixed thread count)
#              0 for other environments (calculates thread count based on available CPUs)
LP_ENV = 0

NPROC := $(shell nproc)

ifeq ($(LP_ENV),1)
    # Launchpad environment: use fixed thread count
    THREADS_BASE := 1
    THREADS_DOUBLE := 1
else
    # Other environments: one job per 8 CPUs, but never fewer than 1;
    # THREADS_DOUBLE is twice that.
    THREADS_BASE := $(shell echo $$(($(NPROC) / 8)))
    THREADS_BASE := $(shell [ $(THREADS_BASE) -lt 1 ] && echo 1 || echo $(THREADS_BASE))
    THREADS_DOUBLE := $(shell echo $$(($(THREADS_BASE) * 2)))
endif

# Default: hand every target to the dh sequencer.
%:
	dh $@

# Configure the main library with cmake+ninja. A symlink to the system
# googletest sources is placed at _deps/gtest-src inside the build directory
# beforehand.
override_dh_auto_configure-arch:
	mkdir -p obj-$(DEB_HOST_GNU_TYPE)/_deps
	[ -L obj-$(DEB_HOST_GNU_TYPE)/_deps/gtest-src ] || \
		ln -s /usr/src/googletest obj-$(DEB_HOST_GNU_TYPE)/_deps/gtest-src
	dh_auto_configure -O--buildsystem=cmake+ninja -- $(CMAKE_FLAGS)
# Full test requires a huge amount of memory/cpu at the moment.
# The composable kernel upstream build system is work in progress.
# Keep this here as it was tested and may be useful in the future.
ifeq ($(FULL_TEST),1)
	set -e; \
	for arch in $(GPU_ARCHS_LIST); do \
		echo "Configure test for $$arch" ; \
		mkdir -p build/$$arch/_deps ; \
		[ -L build/$$arch/_deps/gtest-src ] || \
			ln -s /usr/src/googletest build/$$arch/_deps/gtest-src ; \
		dh_auto_configure --buildsystem=cmake+ninja --sourcedir=$(CURDIR) \
			--builddir=build/$$arch -- $(CMAKE_TST_FLAGS) -DGPU_TARGETS="$$arch" ; \
	done
endif

override_dh_auto_configure-indep:
	dh_auto_configure -O--buildsystem=pybuild

# Build the Python module, then the library, then (FULL_TEST=1 only) the
# "examples" and "tests" targets of each per-architecture tree.
# A background loop prints "#" once a minute while the build runs and is
# killed by the EXIT trap -- presumably to keep builders from treating a
# long silent build as hung (TODO confirm).
override_dh_auto_build:
	@echo "=== Starting build with heartbeat monitoring ==="
	@(while true; do echo "#"; sleep 1m; done) & HB_PID=$$!; \
	trap "kill $$HB_PID 2>/dev/null || true; echo ''; echo '=== Heartbeat stopped ==='" EXIT; \
	set -e; \
	dh_auto_build -O--buildsystem=pybuild; \
	dh_auto_build -O--buildsystem=cmake+ninja -- -j$(THREADS_BASE); \
	if [ "$(FULL_TEST)" = "1" ]; then \
		for arch in $(GPU_ARCHS_LIST); do \
			echo "===== Building for $$arch ====="; \
			dh_auto_build --buildsystem=cmake+ninja --builddir=build/$$arch -- -j$(THREADS_DOUBLE) examples tests || exit 1; \
		done; \
	fi; \
	echo "=== Build completed successfully ==="

# .o files take around ~28GB, which causes launchpad builders
# to fail due to 'No space left on device'
execute_before_dh_auto_install-arch:
	find $(CURDIR) -name '*.o' -delete

# Documentation build. Both proxy variables point at 127.0.0.1:9 for this
# target -- presumably so that any network access fails immediately
# (TODO confirm).
execute_after_dh_auto_build-indep: export http_proxy=127.0.0.1:9
execute_after_dh_auto_build-indep: export https_proxy=127.0.0.1:9
execute_after_dh_auto_build-indep:
ifeq (,$(filter nodoc,$(DEB_BUILD_OPTIONS)))
	perl -pi -e 's/^FULL_PATH_NAMES.*/FULL_PATH_NAMES = NO/' docs/doxygen/Doxyfile
	perl -pi -e 's|^STRIP_FROM_PATH.*|STRIP_FROM_PATH = $(CURDIR)/|' docs/doxygen/Doxyfile
	perl -pi -e 's/^TIMESTAMP.*/TIMESTAMP = NO/' docs/doxygen/Doxyfile
	export DOXYGEN_STRIP_FROM_PATH=$(CURDIR)/; \
	rocm-docs-build
	rm -f build/html/doxygen/html/jquery.js
	find build/html -name Composable-Kernel-math.html \
		-exec perl -ni -e 'print unless /cdn\.jsdelivr\.net/' {} \;
	find build/html -name Composable-Kernel-Glossary.html \
		-exec perl -ni -e 'print unless /cdn\.jsdelivr\.net/' {} \;
	find build/html -type f -name '*.html' -exec sed -i 's|$(CURDIR)/||g' {} +
endif

# Install the library and the Python module. With FULL_TEST=1 the test and
# example trees of every architecture are first staged under
# build/cktests/<arch>, together with their CTestTestfile.cmake files, whose
# build-tree paths are rewritten.
override_dh_auto_install:
ifeq ($(FULL_TEST),1)
# NOTE(review): these trees are configured with cmake+ninja above, while the
# install step below drives them through make -- confirm this still works.
# "set -e" aborts on the first failing step instead of carrying on silently.
	set -e; \
	for arch in $(GPU_ARCHS_LIST); do \
		$(MAKE) -C $(CURDIR)/build/$$arch/test install DESTDIR=$(CURDIR)/build/cktests/$$arch ; \
		$(MAKE) -C $(CURDIR)/build/$$arch/example install DESTDIR=$(CURDIR)/build/cktests/$$arch ; \
		cd $(CURDIR)/build/$$arch ; find test -name CTestTestfile.cmake | cpio -pdumv $(CURDIR)/build/cktests/$$arch ; \
		cd $(CURDIR)/build/$$arch ; find example -name CTestTestfile.cmake | cpio -pdumv $(CURDIR)/build/cktests/$$arch ; \
		find $(CURDIR)/build/cktests/$$arch -type f -name 'CTestTestfile.cmake' -exec sed -i "s|$(CURDIR)/build/$$arch/bin|/usr/libexec/rocm/libcomposable-kernel-tests/$$arch/usr/bin|g" {} + ; \
		find $(CURDIR)/build/cktests/$$arch -type f -name 'CTestTestfile.cmake' -exec sed -i "s|$(CURDIR)||g" {} + ; \
	done
endif
	dh_auto_install -O--buildsystem=cmake+ninja -O--builddirectory=$(CURDIR)/obj-$(DEB_HOST_GNU_TYPE)
	dh_auto_install --destdir=debian/python3-ck4inductor/ -O--buildsystem=pybuild
# Fix the README location in the temporary install directory
	if [ -f debian/tmp/usr/include/ck/README.md ]; then \
		mkdir -p debian/libcomposable-kernel-dev/usr/share/doc/libcomposable-kernel-dev/; \
		mv debian/tmp/usr/include/ck/README.md \
			debian/libcomposable-kernel-dev/usr/share/doc/libcomposable-kernel-dev/README.ck-headers.md; \
	fi
	find debian -type f -name LICENSE -exec rm -f {} \;

# Remove generated Python caches, the build/ tree and generated documentation
# before running the regular dh_clean.
override_dh_clean:
	rm -rf example/ck_tile/01_fmha/codegen/__pycache__/ \
		example/ck_tile/01_fmha/codegen/ops/__pycache__/ \
		tile_engine/ops/gemm/__pycache__/ \
		build/ \
		docs/_doxygen/ \
		docs/doxygen/html/ \
		docs/doxygen/xml/ \
		docs/sphinx/_toc.yml \
		rocm_composable_kernel.egg-info/ \
		.pybuild/
	dh_clean

# Strip binaries without generating automatic -dbgsym packages.
override_dh_strip:
	dh_strip --no-automatic-dbgsym

# After the standard permission fixes, clear the executable bit on any
# installed CTestTestfile.cmake.
override_dh_fixperms:
	dh_fixperms
	find debian -name 'CTestTestfile.cmake' -exec chmod -x {} \;

# Set the rocm:GPU-Architecture substitution variable to the fixed target list,
# space-separated instead of ';'-separated.
override_dh_gencontrol:
	dh_gencontrol -- -Vrocm:GPU-Architecture=$(subst ;, ,$(ROCM_TARGET_ARCH_FIXED))