Commit 7c0a471e authored by Mark Ruzindana

Merge branch 'master' of https://github.com/rallenblack/flag_gpu

Merging modified matlab files
parents 748bcaa7 518bb408
@@ -15,82 +15,18 @@
using namespace std;

// CUDA-specific function prototypes
void print_matrix(const cuComplex *A, int nr_rows_A, int nr_cols_A, int nr_sheets_A);
void print_matrix2(const float *A, int nr_rows_A, int nr_cols_A);
void GPU_fill(cuComplex *A, int nr_rows_A, int nr_cols_A);
void GPU_fill2(cuComplex *A, int nr_rows_A, int nr_cols_A);

void beamform();

__global__
void transpose(signed char* data, signed char* tra_data);

__global__
void data_restructure(signed char * data, cuComplex * data_restruc);

__global__
void sti_reduction(cuComplex * data_in, float * data_out);
// Fill the array A(nr_rows_A, nr_cols_A) with deterministic test values on the GPU
void GPU_fill(cuComplex *A, int nr_rows_A, int nr_cols_A) {
    cuComplex *G;
    G = new cuComplex[nr_rows_A*nr_cols_A];
    for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
        G[i].x = (i + 1)%(nr_rows_A*nr_cols_A/(BN_BIN));
        G[i].y = (i + 1)%(nr_rows_A*nr_cols_A/(BN_BIN));
    }
    cudaMemcpy(A,G,nr_rows_A * nr_cols_A * sizeof(cuComplex),cudaMemcpyHostToDevice);
    delete[] G;
}

void GPU_fill2(cuComplex *A, int nr_rows_A, int nr_cols_A) {
    cuComplex *G;
    G = new cuComplex[nr_rows_A*nr_cols_A];
    for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
        G[i].x = i%(nr_rows_A*nr_cols_A/(BN_BIN));
        G[i].y = i%(nr_rows_A*nr_cols_A/(BN_BIN));
    }
    cudaMemcpy(A,G,nr_rows_A * nr_cols_A * sizeof(cuComplex),cudaMemcpyHostToDevice);
    delete[] G;
}
void print_matrix(const cuComplex *A, int nr_rows_A, int nr_cols_A, int nr_sheets_A) {
    for(int i = 0; i < nr_rows_A; ++i){
        for(int j = 0; j < nr_cols_A; ++j){
            for(int k = 0; k < nr_sheets_A; ++k){
                // cout << A[j * nr_rows_A + i].x << "+" << A[j * nr_rows_A + i].y << "i" <<" ";
                printf("%i,%i,%i: %e + %e i\n",i,j,k,A[k*nr_rows_A*nr_cols_A + j * nr_rows_A + i].x, A[k*nr_rows_A*nr_cols_A + j * nr_rows_A + i].y);
            }
        }
        // cout << endl;
    }
    // cout << endl;
    // for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
    //     printf("%i,: %e + %e i\n",i,A[i].x, A[i].y);
    // }
}

void print_matrix2(const float *A, int nr_rows_A, int nr_cols_A) {
    // for(int j = 0; j < nr_cols_A; ++j){
    //     for(int i = 0; i < nr_rows_A; ++i){
    //         //cout << A[j * nr_rows_A + i].x << "+" << A[j * nr_rows_A + i].y << "i" <<" ";
    //         printf("%i,%i: %e\n",i,j,A[j * nr_rows_A + i]);
    //     }
    //     cout << endl;
    // }
    // cout << endl;
    for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
        printf("%i,: %e\n",i,A[i]);
    }
}
// Struct definition for beamformer metadata
typedef struct bf_metadata_struct {
    float offsets[14];
@@ -99,11 +35,16 @@ typedef struct bf_metadata_struct {
    char weight_filename[65];
    long long unsigned int xid;
} bf_metadata;

static bf_metadata my_metadata;

static cuComplex * d_weights = NULL;

void update_weights(char * filename){
printf("In update_weights()...\n");
printf("RTBF: In update_weights()...\n");
char weight_filename[128]; char weight_filename[128];
strcpy(weight_filename, filename); strcpy(weight_filename, filename);
FILE * weights; FILE * weights;
@@ -115,19 +56,24 @@ void update_weights(char * filename){
    bf_weights = (float *)malloc(2*BN_WEIGHTS*sizeof(float));
    weights_dc = (float complex *)malloc(BN_WEIGHTS*sizeof(float complex));
    weights_dc_n = (float complex *)malloc(BN_WEIGHTS*sizeof(float complex));
    // open weight file
    weights = fopen(weight_filename, "r");

    int j;
    if (weights != NULL) {
        fread(bf_weights, sizeof(float), 2*BN_WEIGHTS, weights);
        fread(my_metadata.offsets, sizeof(float), 14, weights);
        fread(my_metadata.cal_filename, sizeof(char), 64, weights);
        fread(my_metadata.algorithm, sizeof(char), 64, weights);
        fread(&(my_metadata.xid), sizeof(long long unsigned int), 1, weights);

        my_metadata.cal_filename[64] = '\0';
        my_metadata.algorithm[64] = '\0';

        // Extract all path information from weight_filename for metadata
        char * short_filename = strrchr(weight_filename, '/');
        if (short_filename != NULL) {
@@ -137,8 +83,6 @@ void update_weights(char * filename){
            strcpy(my_metadata.weight_filename, weight_filename);
        }

        // Convert to complex numbers (do a conjugate at the same time)
        for(j = 0; j < BN_WEIGHTS; j++){
            weights_dc_n[j] = bf_weights[2*j] - bf_weights[(2*j)+1]*I;
@@ -159,14 +103,16 @@ void update_weights(char * filename){
        }
        fclose(weights);
    }
    // Copy weights to device
    cudaMemcpy(d_weights, weights_dc, BN_WEIGHTS*sizeof(cuComplex), cudaMemcpyHostToDevice); //r_weights instead of weights_dc //*BN_TIME

    // free memory
    free(weights_dc);
    free(weights_dc_n);
    free(bf_weights);

    return;
}
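For reference, the fread() sequence above implies the following on-disk layout for the weight file. This is only a sketch inferred from update_weights(); the struct and field names are illustrative, while the sizes come directly from the reads.

/* Hypothetical layout of the weight file read by update_weights() (sketch only). */
typedef struct {
    float weights[2*BN_WEIGHTS];   /* interleaved real/imag beamformer weights */
    float offsets[14];             /* beam offsets copied into the metadata */
    char cal_filename[64];         /* calibration filename, not NUL-terminated on disk */
    char algorithm[64];            /* algorithm name, not NUL-terminated on disk */
    long long unsigned int xid;    /* correlator XID */
} weight_file_layout_t;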
void bf_get_offsets(float * offsets){
@@ -202,20 +148,25 @@ long long unsigned int bf_get_xid(){
    return my_metadata.xid;
}
static cuComplex * d_beamformed = NULL;
static cuComplex * d_data = NULL;
static signed char * d_data1 = NULL; // Device memory for input data
static signed char * d_data2 = NULL;
static float * d_outputs;

static cublasHandle_t handle;

static cuComplex **d_arr_A = NULL;
static cuComplex **d_arr_B = NULL;
static cuComplex **d_arr_C = NULL;

void init_beamformer(){
    // Allocate memory for the weights, data, beamformer output, and sti output.
    cudaMalloc((void **)&d_weights, BN_WEIGHTS*sizeof(cuComplex)); //*BN_TIME

    cudaMalloc((void **)&d_data1, 2*BN_SAMP*sizeof(signed char));

    //cudaMalloc((void **)&d_data2, 2*BN_SAMP*sizeof(signed char));

    cudaMalloc((void **)&d_data, BN_SAMP*sizeof(cuComplex));

    cudaError_t err_malloc = cudaMalloc((void **)&d_beamformed, BN_TBF*sizeof(cuComplex));
@@ -225,13 +176,10 @@ void init_beamformer(){
    cudaMalloc((void **)&d_outputs, BN_POL*(BN_OUTPUTS*sizeof(float)/2));

    /**********************************************************
     * Create a handle for CUBLAS
     **********************************************************/
    cublasCreate(&handle);

    cudaError_t cudaStat;
    int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
@@ -243,8 +191,11 @@ void init_beamformer(){
    nr_rows_C = BN_BEAM;
    nr_cols_C = BN_TIME;

    // Allocate host-side arrays of pointers used by gemmBatched (3 arrays on the CPU).
    const cuComplex **h_arr_A = 0;
    const cuComplex **h_arr_B = 0;
    cuComplex **h_arr_C = 0;
    h_arr_A = (const cuComplex **)malloc(nr_rows_A * nr_cols_A *BN_BIN*sizeof(const cuComplex*));
    h_arr_B = (const cuComplex **)malloc(nr_rows_B * nr_cols_B *BN_BIN*sizeof(const cuComplex*));
    h_arr_C = (cuComplex **)malloc(nr_rows_C * nr_cols_C *BN_BIN*sizeof(cuComplex*));
@@ -256,9 +207,6 @@ void init_beamformer(){
        h_arr_C[i] = d_beamformed + i*nr_rows_C*nr_cols_C;
    }

    // Allocate memory to arrays on device.
    cudaStat = cudaMalloc((void **)&d_arr_A,nr_rows_A * nr_cols_A * BN_BIN * sizeof(cuComplex*));
    assert(!cudaStat);
@@ -275,19 +223,12 @@ void init_beamformer(){
    cudaStat = cudaMemcpy(d_arr_C,h_arr_C,nr_rows_C * nr_cols_C * BN_BIN * sizeof(cuComplex*),cudaMemcpyHostToDevice);
    assert(!cudaStat);

    free(h_arr_A);
    free(h_arr_B);
    free(h_arr_C);

    return;
}
signed char * data_in(char * input_filename){
@@ -347,7 +288,7 @@ void beamform() {
    nr_rows_C = BN_BEAM;

    // Leading dimensions are always the rows of each matrix since the data is stored in a column-wise order.
    int lda=nr_rows_A, ldb=nr_rows_B, ldc=nr_rows_C;
    cuComplex alf;
    cuComplex bet;
@@ -380,6 +321,11 @@ void beamform() {
        ldc, // Leading dimension of each batch or matrix in array C.
        batchCount); // Number of batches in each array.

    if (stat == CUBLAS_STATUS_INVALID_VALUE) {
        printf("RTBF: Invalid CUBLAS values\n");
    } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
        printf("RTBF: Execution failed.\n");
    }

    if(stat != CUBLAS_STATUS_SUCCESS){
        cerr << "cublasCgemmBatched failed" << endl;
@@ -387,16 +333,80 @@ void beamform() {
    }
    assert(!cudaGetLastError());
}

__global__
void transpose(signed char* data, signed char* tra_data) {
    int i = threadIdx.x;
    int c = threadIdx.y;
    int m = blockIdx.x;
    int f = blockIdx.y;
    int t = blockIdx.z;

    //int Nm = gridDim.x; // number of mcnts (packets)
    int Nf = gridDim.y;  // number of f-engines (ROACHES)
    int Nt = gridDim.z;  // time samples per mcnt
    int Ni = blockDim.x; // inputs per f-engine (aka antenna elements per ROACH)
    int Nc = blockDim.y; // bins per mcnt

    int in_idx = i + Ni*c + Nc*Ni*t + Nt*Nc*Ni*f + Nf*Nt*Nc*Ni*m;
    int out_idx = i + Ni*f + Nf*Ni*c + Nc*Nf*Ni*t + Nt*Nc*Nf*Ni*m;

    tra_data[2*out_idx] = data[2*in_idx];
    tra_data[2*out_idx + 1] = data[2*in_idx+1];

    return;
}
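A minimal host-side launch sketch for the transpose kernel, assuming the grid/block mapping that run_beamformer() sets up below (Nm mcnts, Nf f-engines, and Nt time samples in the grid; Ni inputs and Nc channels per block); the buffer names and wrapper function are placeholders.

// Sketch only: d_in/d_out are assumed to be device buffers of 2*Nm*Nf*Nt*Ni*Nc signed chars
// (interleaved real/imag), matching the indexing used by transpose().
void launch_transpose_sketch(signed char * d_in, signed char * d_out) {
    const int Nm = 200, Nf = 8, Nt = 20, Nc = 25, Ni = 8; // same values as in run_beamformer()
    dim3 gridDim_transpose(Nm, Nf, Nt);  // blockIdx: (mcnt, f-engine, time sample)
    dim3 blockDim_transpose(Ni, Nc, 1);  // threadIdx: (input, channel)
    transpose<<<gridDim_transpose, blockDim_transpose>>>(d_in, d_out);
}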
__global__
void data_restructure(signed char * data, cuComplex * data_restruc){
    /*
     * This kernel takes over the transpose thread's job from the hashpipe code by doing
     * the transpose on the GPU: rather than transposing on the CPU and then restructuring
     * the data again here, a single GPU-side restructure is performed, which is faster.
     */
    int i = threadIdx.x;
    int c = threadIdx.y;
    int m = blockIdx.x;
    int f = blockIdx.y;
    int t = blockIdx.z;

    int Nm = gridDim.x;  // number of mcnts (packets)
    int Nf = gridDim.y;  // number of f-engines (ROACHES)
    int Nt = gridDim.z;  // time samples per mcnt
    int Ni = blockDim.x; // inputs per f-engine (aka antenna elements per ROACH)
    int Nc = blockDim.y; // bins per mcnt

    int in_idx = i + Ni*c + Nc*Ni*t + Nt*Nc*Ni*f + Nf*Nt*Nc*Ni*m;
    int out_idx = i + Ni*f + Nf*Ni*t + Nt*Nf*Ni*m + Nm*Nt*Nf*Ni*c;

    data_restruc[out_idx].x = data[2*in_idx]*1.0f;
    data_restruc[out_idx].y = data[2*in_idx + 1]*1.0f;

    return;

    /*
    // Original Code
    int e = threadIdx.x;
    int t = blockIdx.x;
    int f = blockIdx.y;

    // Restructure data so that the frequency bin is the slowest moving index
    data_restruc[f*BN_TIME*BN_ELE_BLOC + t*BN_ELE_BLOC + e].x = data[2*(t*BN_BIN*BN_ELE_BLOC + f*BN_ELE_BLOC + e)]*1.0f;
    data_restruc[f*BN_TIME*BN_ELE_BLOC + t*BN_ELE_BLOC + e].y = data[2*(t*BN_BIN*BN_ELE_BLOC + f*BN_ELE_BLOC + e) + 1]*1.0f;

    return;
    */
}
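As a sanity check on the new indexing, the same arithmetic can be mirrored on the host; the sketch below (illustrative names, not part of the file) reproduces the in_idx/out_idx formulas above and makes explicit that the channel index c becomes the slowest-moving dimension of the restructured output, matching the intent of the original kernel.

// Host-side mirror of the index math in data_restructure(); sketch for verification only.
static inline int restruct_in_idx(int i, int c, int t, int f, int m,
                                  int Ni, int Nc, int Nt, int Nf) {
    return i + Ni*c + Nc*Ni*t + Nt*Nc*Ni*f + Nf*Nt*Nc*Ni*m;
}
static inline int restruct_out_idx(int i, int c, int t, int f, int m,
                                   int Ni, int Nt, int Nf, int Nm) {
    // Output order (fastest to slowest): input i, f-engine f, time t, mcnt m, channel c.
    return i + Ni*f + Nf*Ni*t + Nt*Nf*Ni*m + Nm*Nt*Nf*Ni*c;
}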
__global__
void sti_reduction(cuComplex * data_in, float * data_out) {
@@ -406,7 +416,7 @@ void sti_reduction(cuComplex * data_in, float * data_out) {
    int s = blockIdx.z;

    int h = sample_idx(s*BN_TIME_STI + t,b,f); // Preprocessor macro used for the output of the beamformer. More detail can be seen in the header file. (First set of beams)
    int h1 = sample_idx(s*BN_TIME_STI + t,b+BN_BEAM1,f); // Preprocessor macro used for the output of the beamformer. More detail can be seen in the header file. (Last set of beams)

    // Temporary variables used for updating.
    float beam_power1;
@@ -467,9 +477,13 @@ void sti_reduction(cuComplex * data_in, float * data_out) {
        data_out[output_idx(2,b,s,f)] = reduced_array1[0].x*scale; // XY* real.
        data_out[output_idx(3,b,s,f)] = reduced_array1[0].y*scale; // XY* imaginary.
    }

    return;
}
void run_beamformer(signed char * data_in, float * data_out) {
    cudaError_t err_code;

    // Specify grid and block dimensions
    dim3 dimBlock(BN_STI_BLOC, 1, 1);
    dim3 dimGrid(BN_BIN, BN_BEAM1, BN_STI);
@@ -478,19 +492,43 @@ void run_beamformer(signed char * data_in, float * data_out) {
    dim3 dimBlock_d(BN_ELE_BLOC, 1, 1);
    dim3 dimGrid_d(BN_TIME, BN_BIN, 1);
    int Nm = 200;
    int Nf = 8;
    int Nt = 20;
    int Nc = 25;
    int Ni = 8;
    dim3 gridDim_transpose(Nm, Nf, Nt);
    dim3 blockDim_transpose(Ni, Nc, 1);

    signed char* d_tra_data_in = d_data1;
    //signed char* d_tra_data_out = d_data2;
    //signed char * d_restruct_in = d_data1;
    cuComplex * d_restruct_out = d_data;

    //cudaMemcpy(d_restruct_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
    cudaMemcpy(d_tra_data_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
    err_code = cudaGetLastError();
    if (err_code != cudaSuccess) {
        printf("RTBF: cudaMemcpy Failed: %s\n", cudaGetErrorString(err_code));
    }
    // Transpose the data
    // transpose<<<gridDim_transpose, blockDim_transpose>>>(d_tra_data_in, d_tra_data_out);
    // if (err_code != cudaSuccess) {
    //     printf("RTBF: CUDA Error (transpose): %s\n", cudaGetErrorString(err_code));
    // }

    // Restructure data for cublasCgemmBatched function.
    data_restructure<<<dimGrid_d, dimBlock_d>>>(d_tra_data_in, d_restruct_out);
    //data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_restruct_in, d_restruct_out);
    //data_restructure<<<dimGrid_d, dimBlock_d>>>(d_restruct_in, d_restruct_out);
    err_code = cudaGetLastError();
    if (err_code != cudaSuccess) {
        printf("RTBF: CUDA Error (data_restructure): %s\n", cudaGetErrorString(err_code));
    }
    // Call beamformer function containing cublasCgemmBatched()
    beamform();
    err_code = cudaGetLastError();
    if (err_code != cudaSuccess) {
        printf("CUDA Error (beamform): %s\n", cudaGetErrorString(err_code));
    }
@@ -498,13 +536,9 @@ void run_beamformer(signed char * data_in, float * data_out) {
    cuComplex * d_sti_in = d_beamformed;
    float * d_sti_out = d_outputs;

    // Call STI reduction kernel.
    sti_reduction<<<dimGrid, dimBlock>>>(d_sti_in, d_sti_out);

    err_code = cudaGetLastError();
    if (err_code != cudaSuccess) {
        printf("CUDA Error (sti_reduction): %s\n", cudaGetErrorString(err_code));
@@ -513,8 +547,7 @@ void run_beamformer(signed char * data_in, float * data_out) {
    // Copy output data from device to host.
    cudaMemcpy(data_out, d_sti_out, BN_POL*(BN_OUTPUTS*sizeof(float)/2), cudaMemcpyDeviceToHost);

    return;
}
@@ -532,6 +565,10 @@ void rtbfCleanup() {
        cudaFree(d_data1);
    }
    if (d_data2 != NULL) {
        cudaFree(d_data2);
    }
    if (d_outputs != NULL) {
        cudaFree(d_outputs);
    }
@@ -553,4 +590,5 @@ void rtbfCleanup() {
} }
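For context, a minimal host-side usage sketch of the entry points defined in this file. The call order follows the functions above and the buffer sizes follow the cudaMemcpy calls in run_beamformer(); the wrapper function name and weight-file path are placeholders.

// Sketch only: assumes <stdlib.h> and the beamformer header providing BN_SAMP, BN_POL, BN_OUTPUTS.
int rtbf_usage_sketch(void) {
    signed char * in = (signed char *)malloc(2*BN_SAMP*sizeof(signed char));
    float * out = (float *)malloc(BN_POL*(BN_OUTPUTS*sizeof(float)/2));

    init_beamformer();              // allocate device buffers and the CUBLAS handle
    char wfile[] = "./weights.bin"; // hypothetical weight file path
    update_weights(wfile);          // load weights and metadata, copy weights to the device

    // ... fill `in` with interleaved real/imag samples ...
    run_beamformer(in, out);        // restructure, beamform (cublasCgemmBatched), STI reduction

    rtbfCleanup();                  // release device memory
    free(in);
    free(out);
    return 0;
}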