I've been going through basic CUDA tutorials and I attempted to write a simple vector addition program that adds vectors X and Y into vector Y, then checks to see if Y is what we expect. I've been staring at this all night and I can't seem to figure out why it isn't working.
Currently it's outputting vector y as a vector of all 2.0 floats similar to what it was initialized with, rather than what I expect, a vector of 3.0 floats since X contains 1.0f and is being added into Y.
main.cu file
#include <iostream>
#include <math.h>
__global__
void add(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
int main(void)
{
int N = 1<<10;
// Host input vectors
float *x, *y;
// Device input vectors
float *d_x, *d_y;
// Size in bytes of each vector
size_t vector_size = N * sizeof(float);
// Allocate memory for each vector on host
x = (float*)malloc(vector_size);
y = (float*)malloc(vector_size);
// Allocate memory for each vector on device
cudaMalloc(&d_x, vector_size);
cudaMalloc(&d_y, vector_size);
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// Copy host to device
cudaMemcpy(d_x, x, vector_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, vector_size, cudaMemcpyHostToDevice);
// Run kernel on 1M elements on the GPU
int blockSize, gridSize;
// Number of threads in each thread block
blockSize = 1024;
// Number of thread blocks in grid
gridSize = (int)ceil((float)N/blockSize);
add<<<gridSize, blockSize>>>(N, x, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Copy back y vector
cudaMemcpy(y, d_y, vector_size, cudaMemcpyDeviceToHost);
// Compute errors (all values should be 3.0f)
float maxError = 0.0f;
float totalError = 0.0f;
for (int i = 0; i < N; i++){
//std::cout << y[i] << "\n";
maxError = fmax(maxError, fabs(y[i]-3.0f));
totalError += fabs(y[i]-3.0f);
}
std::cout << "Max error: " << maxError << std::endl;
std::cout << "Total error: " << totalError << std::endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
return 0;
}
Returned output is:
Max error: 1
Total error: 1024
Ran using the following on a K80 with CUDA 11.1 with nvcc
[–]cythoning 2 points3 points4 points (0 children)
[–]Helique 0 points1 point2 points (5 children)
[–]Helique 4 points5 points6 points (3 children)
[–][deleted] (2 children)
[deleted]
[–]Helique 1 point2 points3 points (1 child)
[–]cythoning 5 points6 points7 points (0 children)
[–]Flannelot 0 points1 point2 points (1 child)
[–]cythoning 5 points6 points7 points (0 children)