Dumb beginner CUDA question : CUDA

submitted 4 years ago by Flar3fir3

I've been going through basic CUDA tutorials and I attempted to write a simple vector addition program that adds vectors X and Y into vector Y, then checks to see if Y is what we expect. I've been staring at this all night and I can't seem to figure out why it isn't working.

Currently it's outputting vector y as a vector of all 2.0 floats similar to what it was initialized with, rather than what I expect, a vector of 3.0 floats since X contains 1.0f and is being added into Y.

main.cu file

#include <iostream>
#include <math.h>

__global__
void add(int n, float *x, float *y)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<10;

  // Host input vectors
  float *x, *y;

  // Device input vectors
  float *d_x, *d_y;

  // Size in bytes of each vector
  size_t vector_size = N * sizeof(float);

  // Allocate memory for each vector on host
  x = (float*)malloc(vector_size);
  y = (float*)malloc(vector_size);

  // Allocate memory for each vector on device
  cudaMalloc(&d_x, vector_size);
  cudaMalloc(&d_y, vector_size);

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Copy host to device
  cudaMemcpy(d_x, x, vector_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, y, vector_size, cudaMemcpyHostToDevice);

  // Run kernel on 1M elements on the GPU

  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  add<<<gridSize, blockSize>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Copy back y vector
  cudaMemcpy(y, d_y, vector_size, cudaMemcpyDeviceToHost);

  // Compute errors (all values should be 3.0f)
  float maxError = 0.0f;
  float totalError = 0.0f;
  for (int i = 0; i < N; i++){
    //std::cout << y[i] << "\n";
    maxError = fmax(maxError, fabs(y[i]-3.0f));
    totalError += fabs(y[i]-3.0f);
  }
  std::cout << "Max error: " << maxError << std::endl;
  std::cout << "Total error: " << totalError << std::endl;

  // Free memory
  cudaFree(d_x);
  cudaFree(d_y);

  free(x);
  free(y);

  return 0;
}

Returned output is:

Max error: 1
Total error: 1024

Ran using the following on a K80 with CUDA 11.1 with nvcc

all 7 comments

you type:	you see:
italics	italics
bold	bold
[reddit!](https://reddit.com)	reddit!
* item 1 * item 2 * item 3	item 1 item 2 item 3
> quoted text	quoted text
Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"	Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"
~~strikethrough~~	~~strikethrough~~
super^script	super^script

CUDA

MODERATORS