Blog of roxlu, co-founder of Apollo Media. Contact info[shift+2]apollomedia.nl.

Basic CUDA example

For a project where we want to have a massive flocking simulation I'm researching the use of CUDA. These are my first baby steps :). To get started, best thing is to start reading the CUDA C Programming Guide.

I'm using a GTX 670 graphics card and running cuda 5.0 on Arch Linux. To compile my application I'm using CMake, see below for the CMakeLists file.

intro.cu

// -*- mode: C -*-
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
 
#define K_THREADS 64
#define K_INDEX() ((gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x)
#define RND() ((rand() & 0x7FFF) / float(0x8000))
#define ERRORCHECK() cErrorCheck(__FILE__, __LINE__)
 
#define TIMER_CREATE(t)                      \
  cudaEvent_t t##_start, t##_end;            \
  cudaEventCreate(&t##_start);               \
  cudaEventCreate(&t##_end);               
 
 
#define TIMER_START(t)                          \
  cudaEventRecord(t##_start);                   \
  cudaEventSynchronize(t##_start);              \
 
 
#define TIMER_END(t)                                          \
  cudaEventRecord(t##_start);                                 \
  cudaEventSynchronize(t##_start);                            \
  cudaEventRecord(t##_end);                                   \
  cudaEventSynchronize(t##_end);                              \
  cudaEventElapsedTime(&t, t##_start, t##_end);               
 
 
inline void cErrorCheck(const char *file, int line) {
  cudaThreadSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("Error: %s\n", cudaGetErrorString(err));
    printf(" @ %s: %d\n", file, line);
    exit(-1);
  }
}
 
inline dim3 K_GRID(int n, int threads = K_THREADS) {
  int blocks = (int)ceilf(sqrtf((float)n/threads));
  dim3 grid(blocks, blocks);
  return grid;
}
 
typedef struct data  {
  int n;
  float4 *r, *v, *f;
} data;
 
data cpu, gpu;
 
#define N 20
 
__global__ void repulsion(data gpu);
__global__ void integration(data gpu);
 
 
int main() {
  printf("Cuda Test 1\n");
 
  int count = 0;
  cudaGetDeviceCount(&count);
  printf(" %d CUDA devices found\n", count);
  if(!count) {
    ::exit(EXIT_FAILURE);
  }
 
  cudaFree(0);
 
  cpu.n = N;
 
  cpu.r = (float4*)malloc(N * sizeof(float4));
  cpu.v = (float4*)malloc(N * sizeof(float4));
  cpu.f = (float4*)malloc(N * sizeof(float4));
 
  for(int i = 0; i < N; ++i) {
    cpu.v[i] = make_float4(0,0,0,0);
    cpu.r[i] = make_float4(RND(), RND(), RND(), 0);
    cpu.f[i] = make_float4(0,0.01,0,0);
  }
 
  gpu = cpu;
  cudaMalloc(&gpu.r, N * sizeof(float4));
  cudaMalloc(&gpu.v, N * sizeof(float4));
  cudaMalloc(&gpu.f, N * sizeof(float4));
 
  cudaMemcpy(gpu.r, cpu.r, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu.v, cpu.v, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu.f, cpu.f, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
 
  ERRORCHECK();
  float rep;
  TIMER_CREATE(rep);
  TIMER_START(rep);
 
  integration <<< K_GRID(cpu.n), K_THREADS >>>(gpu);
 
  TIMER_END(rep);
  printf("Took: %f ms\n", rep);
  ERRORCHECK();
 
  cudaMemcpy(cpu.r, gpu.r, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
  cudaMemcpy(cpu.v, gpu.v, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
  cudaMemcpy(cpu.f, gpu.f, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
 
  cudaFreeHost(cpu.r);
  cudaFreeHost(cpu.v);
  cudaFreeHost(cpu.f);
  cudaFree(gpu.r);
  cudaFree(gpu.v);
  cudaFree(gpu.f);
 
  cudaDeviceReset();
 
  printf("Results: \n");
  for(int i = 0; i < N; ++i) {
    printf("%f, %f, %f \n", cpu.r[i].x, cpu.r[i].y, cpu.r[i].z);
  }
 
  printf("Ready...\n");
  return 0;
}
 
__global__ void repulsion(data gpu) {
  int idx = K_INDEX();
  if(idx < N) {
    gpu.r[idx].x = 1;
    gpu.r[idx].y = 1;
    gpu.r[idx].z = 1;
  }
}
 
#define MULT4(v, s) v.x *= s; v.y *= s; v.z *= s; v.w *= s;
#define ADD4(v1, v2) v1.x += v2.x; v1.y += v2.y; v1.z += v2.z; v1.w += v2.w;
 
__global__ void integration(data gpu) {
  int i = K_INDEX();
  if(i < N) {
    MULT4(gpu.f[i], 0.01);
    MULT4(gpu.v[i], 0.01);
    ADD4(gpu.v[i], gpu.f[i]);
    ADD4(gpu.r[i], gpu.v[i]);
    gpu.f[i] = make_float4(0,0,0,0);
  }
 
}

CMakeLists.txt

cmake_minimum_required(VERSION 2.8)
 
find_package(CUDA)
 
set(src ${CMAKE_CURRENT_LIST_DIR}/../)
 
set(CUDA_NVCC_FLAGS "-arch=sm_20")
 
cuda_compile(INTRO_O ${src}/intro.cu)
 
cuda_add_executable(intro ${INTRO_O})
 
install(TARGETS intro DESTINATION .)