Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
b9d87fc
Replace cpu-benchmark with similar stress-ng monte-carlo test
quantumsteve Feb 9, 2026
d3778c2
Add dependency psutil
quantumsteve Feb 10, 2026
827c189
terminate io process
quantumsteve Feb 10, 2026
e41861c
check for division by zero
quantumsteve Feb 10, 2026
901e87b
mute stress-ng
quantumsteve Feb 10, 2026
8c560de
Added a --quiet flag to another stress-ng invocation
henricasanova Feb 11, 2026
ffac645
Fixed the zombie problem
henricasanova Feb 11, 2026
e802679
Since memsize argument document says MB (and not MiB), I changed
henricasanova Feb 12, 2026
45124d9
typo-- !!
henricasanova Feb 12, 2026
c92654e
try to workaround missing cpu_queue
quantumsteve Feb 12, 2026
510ca58
typos
quantumsteve Feb 13, 2026
8bb70bc
Rewrite/Re-engineering of wfbench so that the execution proceeds in
henricasanova Feb 20, 2026
ebe24ab
Made it so that even if wfbench is ^C-ed, it doesn't leave runaway
henricasanova Feb 20, 2026
85079cb
Minor fix
henricasanova Feb 20, 2026
8f46f0e
check container output
quantumsteve Mar 2, 2026
61fe80c
bug-- in bin/wfbench
henricasanova Mar 18, 2026
4dee4fd
bug-- in wfbench
henricasanova Mar 18, 2026
2feb997
Merge branch 'stress-ng_cpu_benchmark' of github.com:wfcommons/WfComm…
henricasanova Mar 18, 2026
50e42f9
Updated the create_benchmark() method to allow specifying the number of
henricasanova Mar 18, 2026
4da3f66
Merge branch 'main' into stress-ng_cpu_benchmark
henricasanova Mar 19, 2026
54c212e
Insane race-condition bug fix in wfbench.py (having to deal with killing
henricasanova Mar 19, 2026
68fb5db
cleanup
quantumsteve Mar 20, 2026
8bd49b5
cleanup
quantumsteve Mar 20, 2026
8639ee7
commented out code
quantumsteve Mar 20, 2026
90a6a49
Updated wfbench to make it callable as a module
henricasanova Mar 21, 2026
9a77ef8
Made the Swift/T translator create a README file with instructions
henricasanova Mar 21, 2026
f627804
Modified swift-t translator fork-exec wfbench (which is known to be
henricasanova Mar 21, 2026
03b058a
Made Swift/T translator use python_exec()
henricasanova Mar 21, 2026
50bc158
test re-enabling
henricasanova Mar 21, 2026
22efdf1
Merge branch 'stress-ng_cpu_benchmark' into stress-ng_cpu_benchmark-w…
henricasanova Mar 21, 2026
32963ae
test updates
henricasanova Mar 21, 2026
f08f54e
Removed all traces of cpu-benchmark.cpp
henricasanova Mar 22, 2026
3aa7021
added a sleep to let redis server time to start in the swift/t container
henricasanova Mar 23, 2026
0a99665
small test fix/cleanup
henricasanova Mar 23, 2026
225b689
cleanup
quantumsteve Mar 23, 2026
a486c92
Update bin/wfbench
henricasanova Mar 27, 2026
6d58507
set type to integer
quantumsteve Apr 6, 2026
1325d1b
add hipified code
quantumsteve Apr 13, 2026
b633026
check CUDA/HIP return values
quantumsteve Apr 13, 2026
2b47fc3
missed file
quantumsteve Apr 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions Makefile

This file was deleted.

89 changes: 0 additions & 89 deletions bin/cpu-benchmark.cpp

This file was deleted.

5 changes: 5 additions & 0 deletions bin/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cmake_minimum_required(VERSION 3.21) # kept in sync with ../hip, where HIP language support requires 3.21
cmake_policy(VERSION 3.21.3...3.27)
project(MyProj LANGUAGES CUDA)
add_executable(gpu_benchmark gpu_benchmark.cu)

28 changes: 19 additions & 9 deletions bin/gpu_benchmark.cu → bin/cuda/gpu_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@
#include <cstdlib> // For std::atoi
#include "gpu_benchmark.h"

// The macro wraps any CUDA API call
#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

// Kernel function to perform a simple workload
__global__ void simpleKernel(int* data, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
Expand All @@ -22,10 +32,10 @@ void runBenchmark(int max_work) {
}

// Allocate GPU memory
cudaMalloc(&d_data, max_work * sizeof(int));
CUDA_CHECK(cudaMalloc(&d_data, max_work * sizeof(int)));

// Copy data to GPU
cudaMemcpy(d_data, h_data, max_work * sizeof(int), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMemcpy(d_data, h_data, max_work * sizeof(int), cudaMemcpyHostToDevice));

// Kernel configuration
int threadsPerBlock = 256;
Expand All @@ -35,13 +45,13 @@ void runBenchmark(int max_work) {
simpleKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, max_work);

// Ensure the kernel has finished executing
cudaDeviceSynchronize();
CUDA_CHECK(cudaDeviceSynchronize());

// Copy results back to host (optional, just for validation)
cudaMemcpy(h_data, d_data, max_work * sizeof(int), cudaMemcpyDeviceToHost);
CUDA_CHECK(cudaMemcpy(h_data, d_data, max_work * sizeof(int), cudaMemcpyDeviceToHost));

// Cleanup
cudaFree(d_data);
CUDA_CHECK(cudaFree(d_data));
delete[] h_data;

std::cout << "Benchmark completed!" << std::endl;
Expand All @@ -58,10 +68,10 @@ void runBenchmarkTime(int max_work, int runtime_in_seconds) {
}

// Allocate GPU memory
cudaMalloc(&d_data, max_work * sizeof(int));
CUDA_CHECK(cudaMalloc(&d_data, max_work * sizeof(int)));

// Copy data to GPU
cudaMemcpy(d_data, h_data, max_work * sizeof(int), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMemcpy(d_data, h_data, max_work * sizeof(int), cudaMemcpyHostToDevice));

// Start the timer
auto start = std::chrono::high_resolution_clock::now();
Expand All @@ -77,10 +87,10 @@ void runBenchmarkTime(int max_work, int runtime_in_seconds) {
}

// Copy results back to host (optional, just for validation)
cudaMemcpy(h_data, d_data, max_work * sizeof(int), cudaMemcpyDeviceToHost);
CUDA_CHECK(cudaMemcpy(h_data, d_data, max_work * sizeof(int), cudaMemcpyDeviceToHost));

// Cleanup
cudaFree(d_data);
CUDA_CHECK(cudaFree(d_data));
delete[] h_data;

std::cout << "Benchmark completed!" << std::endl;
Expand Down
File renamed without changes.
5 changes: 5 additions & 0 deletions bin/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cmake_minimum_required(VERSION 3.21) # HIP language support requires 3.21
cmake_policy(VERSION 3.21.3...3.27)
project(MyProj LANGUAGES HIP)
add_executable(gpu_benchmark gpu_benchmark.hip)

11 changes: 11 additions & 0 deletions bin/hip/gpu_benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifndef GPU_BENCHMARK_H
#define GPU_BENCHMARK_H

#include <hip/hip_runtime.h>

// Runs the benchmark kernel once over max_work elements (no time limit).
void runBenchmark(int max_work);
// Repeatedly launches the kernel over max_work elements until
// runtime_in_seconds of wall-clock time has elapsed.
void runBenchmarkTime(int max_work, int runtime_in_seconds);

#endif // GPU_BENCHMARK_H


135 changes: 135 additions & 0 deletions bin/hip/gpu_benchmark.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#include "hip/hip_runtime.h"
#include <iostream>
#include <chrono>
#include <cstdlib> // For std::atoi
#include "gpu_benchmark.h"

// Wraps any HIP API call and reports failures with file/line context.
//
// Fixes over the previous version:
//  - do { ... } while (0) makes the macro expand to a single statement,
//    so `if (c) HIP_CHECK(x); else ...` is well-formed (a bare `{...};`
//    block would break such call sites).
//  - `expression` is parenthesized at its use site.
//  - On failure we now exit with the error code, matching the CUDA
//    build's gpuAssert behavior; continuing after a failed hipMalloc /
//    hipMemcpy would operate on invalid device pointers.
#define HIP_CHECK(expression)                             \
    do {                                                  \
        const hipError_t status = (expression);           \
        if (status != hipSuccess) {                       \
            std::cerr << "HIP error "                     \
                      << status << ": "                   \
                      << hipGetErrorString(status)        \
                      << " at " << __FILE__ << ":"        \
                      << __LINE__ << std::endl;           \
            std::exit(status);                            \
        }                                                 \
    } while (0)

// Simple GPU workload: squares each element of `data` in place.
// Uses a grid-stride loop so the kernel is correct for ANY launch
// configuration (including grids smaller than `size`), not only the
// exact ceil(size/blockDim) grid the host code happens to compute.
// Results are identical to the original bounds-checked version.
__global__ void simpleKernel(int* data, int size) {
    int stride = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride) {
        data[i] = data[i] * data[i]; // Simple workload: squaring each element
    }
}

// Function to run the GPU benchmark with no time limit
void runBenchmark(int max_work) {
int* h_data = new int[max_work];
int* d_data;

// Initialize data
for (int i = 0; i < max_work; i++) {
h_data[i] = i;
}

// Allocate GPU memory
HIP_CHECK(hipMalloc(&d_data, max_work * sizeof(int)));

// Copy data to GPU
HIP_CHECK(hipMemcpy(d_data, h_data, max_work * sizeof(int), hipMemcpyHostToDevice));

// Kernel configuration
int threadsPerBlock = 256;
int blocksPerGrid = (max_work + threadsPerBlock - 1) / threadsPerBlock;

// Run the kernel
simpleKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, max_work);

// Ensure the kernel has finished executing
HIP_CHECK(hipDeviceSynchronize());

// Copy results back to host (optional, just for validation)
HIP_CHECK(hipMemcpy(h_data, d_data, max_work * sizeof(int), hipMemcpyDeviceToHost));

// Cleanup
HIP_CHECK(hipFree(d_data));
delete[] h_data;

std::cout << "Benchmark completed!" << std::endl;
}

// Function to run the GPU benchmark for a specified time
void runBenchmarkTime(int max_work, int runtime_in_seconds) {
int* h_data = new int[max_work];
int* d_data;

// Initialize data
for (int i = 0; i < max_work; i++) {
h_data[i] = i;
}

// Allocate GPU memory
HIP_CHECK(hipMalloc(&d_data, max_work * sizeof(int)));

// Copy data to GPU
HIP_CHECK(hipMemcpy(d_data, h_data, max_work * sizeof(int), hipMemcpyHostToDevice));

// Start the timer
auto start = std::chrono::high_resolution_clock::now();

// Kernel configuration
int threadsPerBlock = 256;
int blocksPerGrid = (max_work + threadsPerBlock - 1) / threadsPerBlock;

// Run the workload loop until the specified runtime is reached
while (std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - start).count() < runtime_in_seconds) {
simpleKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, max_work);
HIP_CHECK(hipDeviceSynchronize()); // Ensure the kernel has finished executing
}

// Copy results back to host (optional, just for validation)
HIP_CHECK(hipMemcpy(h_data, d_data, max_work * sizeof(int), hipMemcpyDeviceToHost));

// Cleanup
HIP_CHECK(hipFree(d_data));
delete[] h_data;

std::cout << "Benchmark completed!" << std::endl;
}

// Entry point: parses <max_work> and an optional [runtime_in_seconds],
// validates both as positive integers, and dispatches to the fixed-work
// or time-limited benchmark variant. Returns 1 on any usage or
// validation error, 0 on success.
int main(int argc, char* argv[]) {
    // Anything other than one or two arguments is a usage error.
    if (argc != 2 && argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <max_work> [runtime_in_seconds]" << std::endl;
        return 1;
    }

    // std::atoi yields 0 on non-numeric input, which the positivity
    // checks below reject.
    const int max_work = std::atoi(argv[1]);

    if (argc == 2) {
        if (max_work <= 0) {
            std::cerr << "max_work must be a positive integer." << std::endl;
            return 1;
        }
        runBenchmark(max_work);
    } else {
        const int runtime_in_seconds = std::atoi(argv[2]);
        if (max_work <= 0 || runtime_in_seconds <= 0) {
            std::cerr << "Both max_work and runtime_in_seconds must be positive integers." << std::endl;
            return 1;
        }
        runBenchmarkTime(max_work, runtime_in_seconds);
    }

    return 0;
}
Loading
Loading