Commit 93d4b889 authored by Mitch Burnett


This commit was made as I was leaving GBO, in the middle of the July/Aug commissioning. Many of these updates have improved the performance of the pipeline, measured by the frequency of stalled/hanging banks and bad blocks. We tuned core assignments, input block counts, and more; decreased the amount of time spent accessing shared memory; prevented packets from sneaking through at the end of scans in the net thread; and removed the CPU transpose for the beamformer, implementing it on the GPU instead.
parent d0bbd449
@@ -15,82 +15,18 @@
using namespace std;
// CUDA-specific function prototypes
void print_matrix(const cuComplex *A, int nr_rows_A, int nr_cols_A, int nr_sheets_A);
void print_matrix2(const float *A, int nr_rows_A, int nr_cols_A);
void GPU_fill(cuComplex *A, int nr_rows_A, int nr_cols_A);
void beamform();
void GPU_fill2(cuComplex *A, int nr_rows_A, int nr_cols_A);
__global__
void transpose(signed char* data, signed char* tra_data);
__global__
void data_restructure(signed char * data, cuComplex * data_restruc);
void beamform();
__global__
void sti_reduction(cuComplex * data_in, float * data_out);
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on GPU
void GPU_fill(cuComplex *A, int nr_rows_A, int nr_cols_A) {
cuComplex *G;
G = new cuComplex[nr_rows_A*nr_cols_A];
for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
G[i].x = (i + 1)%(nr_rows_A*nr_cols_A/(BN_BIN));
G[i].y = (i + 1)%(nr_rows_A*nr_cols_A/(BN_BIN));
}
cudaMemcpy(A,G,nr_rows_A * nr_cols_A * sizeof(cuComplex),cudaMemcpyHostToDevice);
delete[] G;
}
void GPU_fill2(cuComplex *A, int nr_rows_A, int nr_cols_A) {
cuComplex *G;
G = new cuComplex[nr_rows_A*nr_cols_A];
for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
G[i].x = i%(nr_rows_A*nr_cols_A/(BN_BIN));
G[i].y = i%(nr_rows_A*nr_cols_A/(BN_BIN));
}
cudaMemcpy(A,G,nr_rows_A * nr_cols_A * sizeof(cuComplex),cudaMemcpyHostToDevice);
delete[] G;
}
void print_matrix(const cuComplex *A, int nr_rows_A, int nr_cols_A, int nr_sheets_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
for(int k = 0; k < nr_sheets_A; ++k){
// cout << A[j * nr_rows_A + i].x << "+" << A[j * nr_rows_A + i].y << "i" <<" ";
printf("%i,%i,%i: %e + %e i\n",i,j,k,A[k*nr_rows_A*nr_cols_A + j * nr_rows_A + i].x, A[k*nr_rows_A*nr_cols_A + j * nr_rows_A + i].y);
}
}
// cout << endl;
}
// cout << endl;
// for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
// printf("%i,: %e + %e i\n",i,A[i].x, A[i].y);
// }
}
void print_matrix2(const float *A, int nr_rows_A, int nr_cols_A) {
// for(int j = 0; j < nr_cols_A; ++j){
// for(int i = 0; i < nr_rows_A; ++i){
// //cout << A[j * nr_rows_A + i].x << "+" << A[j * nr_rows_A + i].y << "i" <<" ";
// printf("%i,%i: %e\n",i,j,A[j * nr_rows_A + i]);
// }
// cout << endl;
// }
// cout << endl;
for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
printf("%i,: %e\n",i,A[i]);
}
}
// Struct definition for beamformer metadata
typedef struct bf_metadata_struct {
float offsets[14];
@@ -99,11 +35,16 @@ typedef struct bf_metadata_struct {
char weight_filename[65];
long long unsigned int xid;
} bf_metadata;
static bf_metadata my_metadata;
static cuComplex * d_weights = NULL;
void update_weights(char * filename){
printf("In update_weights()...\n");
printf("RTBF: In update_weights()...\n");
char weight_filename[128];
strcpy(weight_filename, filename);
FILE * weights;
@@ -115,19 +56,24 @@ void update_weights(char * filename){
bf_weights = (float *)malloc(2*BN_WEIGHTS*sizeof(float));
weights_dc = (float complex *)malloc(BN_WEIGHTS*sizeof(float complex *));
weights_dc_n = (float complex *)malloc(BN_WEIGHTS*sizeof(float complex *));
// open weight file
weights = fopen(weight_filename, "r");
int j;
if (weights != NULL) {
fread(bf_weights, sizeof(float), 2*BN_WEIGHTS, weights);
fread(my_metadata.offsets, sizeof(float), 14, weights);
fread(my_metadata.cal_filename, sizeof(char), 64, weights);
my_metadata.cal_filename[64] = '\0';
fread(my_metadata.algorithm, sizeof(char), 64, weights);
my_metadata.algorithm[64] = '\0';
fread(&(my_metadata.xid), sizeof(long long unsigned int), 1, weights);
my_metadata.cal_filename[64] = '\0';
my_metadata.algorithm[64] = '\0';
// Extract all path information from weight_filename for metadata
char * short_filename = strrchr(weight_filename, '/');
if (short_filename != NULL) {
@@ -137,8 +83,6 @@ void update_weights(char * filename){
strcpy(my_metadata.weight_filename, weight_filename);
}
// Convert to complex numbers (do a conjugate at the same time)
for(j = 0; j < BN_WEIGHTS; j++){
weights_dc_n[j] = bf_weights[2*j] - bf_weights[(2*j)+1]*I;
@@ -159,14 +103,16 @@ void update_weights(char * filename){
}
fclose(weights);
}
free(bf_weights);
// Copy weights to device
cudaMemcpy(d_weights, weights_dc, BN_WEIGHTS*sizeof(cuComplex), cudaMemcpyHostToDevice); //r_weights instead of weights_dc //*BN_TIME
// free memory
free(weights_dc);
free(weights_dc_n);
free(bf_weights);
return;
}
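/*
 * For reference, a minimal host-side sketch of the weight-file layout implied by
 * the fread() sequence above: 2*BN_WEIGHTS interleaved re/im floats, 14 offset
 * floats, a 64-char calibration filename, a 64-char algorithm name, and one
 * unsigned long long XID. Everything below (function name, placeholder strings,
 * zeroed payload) is illustrative only and not part of this commit.
 */
#if 0
static int write_example_weight_file(const char *path) {
	FILE *fp = fopen(path, "wb");
	if (fp == NULL) return -1;

	float *w = (float *)calloc(2*BN_WEIGHTS, sizeof(float)); // interleaved re/im weights
	float offsets[14] = {0};
	char cal_filename[64] = "example_cal";  // placeholder
	char algorithm[64]    = "example_algo"; // placeholder
	long long unsigned int xid = 0;

	fwrite(w, sizeof(float), 2*BN_WEIGHTS, fp);
	fwrite(offsets, sizeof(float), 14, fp);
	fwrite(cal_filename, sizeof(char), 64, fp);
	fwrite(algorithm, sizeof(char), 64, fp);
	fwrite(&xid, sizeof(long long unsigned int), 1, fp);

	free(w);
	fclose(fp);
	return 0;
}
#endif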
void bf_get_offsets(float * offsets){
@@ -202,20 +148,25 @@ long long unsigned int bf_get_xid(){
return my_metadata.xid;
}
static cuComplex **d_arr_A = NULL; static cuComplex **d_arr_B = NULL; static cuComplex **d_arr_C = NULL;
static cuComplex * d_beamformed = NULL;
static cuComplex * d_data = NULL;
static signed char * d_data1 = NULL; // Device memory for input data
static signed char * d_data2 = NULL;
static float * d_outputs;
static cublasHandle_t handle;
static cuComplex **d_arr_A = NULL;
static cuComplex **d_arr_B = NULL;
static cuComplex **d_arr_C = NULL;
void init_beamformer(){
// Allocate memory for the weights, data, beamformer output, and sti output.
cudaMalloc((void **)&d_weights, BN_WEIGHTS*sizeof(cuComplex)); //*BN_TIME
cudaMalloc((void **)&d_data1, 2*BN_SAMP*sizeof(signed char));
//cudaMalloc((void **)&d_data2, 2*BN_SAMP*sizeof(signed char));
cudaMalloc((void **)&d_data, BN_SAMP*sizeof(cuComplex));
cudaError_t err_malloc = cudaMalloc((void **)&d_beamformed, BN_TBF*sizeof(cuComplex));
@@ -225,13 +176,10 @@ void init_beamformer(){
cudaMalloc((void **)&d_outputs, BN_POL*(BN_OUTPUTS*sizeof(float)/2));
/**********************************************************
* Create a handle for CUBLAS
**********************************************************/
cublasCreate(&handle);
// This is all memory allocated to arrays that are used by gemmBatched.
// Allocate 3 arrays on CPU
/**********************************************************
* Create a handle for CUBLAS
**********************************************************/
cublasCreate(&handle);
cudaError_t cudaStat;
int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
@@ -243,8 +191,11 @@ void init_beamformer(){
nr_rows_C = BN_BEAM;
nr_cols_C = BN_TIME;
// Allocate memory to host arrays.
const cuComplex **h_arr_A = 0; const cuComplex **h_arr_B = 0; cuComplex **h_arr_C = 0;
// Allocate memory to host arrays - This is all memory allocated to arrays that are used by gemmBatched. Allocate 3 arrays on CPU
const cuComplex **h_arr_A = 0;
const cuComplex **h_arr_B = 0;
cuComplex **h_arr_C = 0;
h_arr_A = (const cuComplex **)malloc(nr_rows_A * nr_cols_A *BN_BIN*sizeof(const cuComplex*));
h_arr_B = (const cuComplex **)malloc(nr_rows_B * nr_cols_B *BN_BIN*sizeof(const cuComplex*));
h_arr_C = (cuComplex **)malloc(nr_rows_C * nr_cols_C *BN_BIN*sizeof(cuComplex*));
@@ -256,9 +207,6 @@ void init_beamformer(){
h_arr_C[i] = d_beamformed + i*nr_rows_C*nr_cols_C;
}
// delete[] d_A;
// delete[] d_B;
// Allocate memory to arrays on device.
cudaStat = cudaMalloc((void **)&d_arr_A,nr_rows_A * nr_cols_A * BN_BIN * sizeof(cuComplex*));
assert(!cudaStat);
@@ -275,19 +223,12 @@ void init_beamformer(){
cudaStat = cudaMemcpy(d_arr_C,h_arr_C,nr_rows_C * nr_cols_C * BN_BIN * sizeof(cuComplex*),cudaMemcpyHostToDevice);
assert(!cudaStat);
}
__global__
void data_restructure(signed char * data, cuComplex * data_restruc){
int e = threadIdx.x;
int t = blockIdx.x;
int f = blockIdx.y;
free(h_arr_A);
free(h_arr_B);
free(h_arr_C);
//Restructure data so that the frequency bin is the slowest moving index
data_restruc[f*BN_TIME*BN_ELE_BLOC + t*BN_ELE_BLOC + e].x = data[2*(t*BN_BIN*BN_ELE_BLOC + f*BN_ELE_BLOC + e)]*1.0f;
data_restruc[f*BN_TIME*BN_ELE_BLOC + t*BN_ELE_BLOC + e].y = data[2*(t*BN_BIN*BN_ELE_BLOC + f*BN_ELE_BLOC + e) + 1]*1.0f;
return;
}
signed char * data_in(char * input_filename){
@@ -347,7 +288,7 @@ void beamform() {
nr_rows_C = BN_BEAM;
// Leading dimensions are always the rows of each matrix since the data is stored in a column-wise order.
int lda=nr_rows_A,ldb=nr_rows_B,ldc=nr_rows_C;
int lda=nr_rows_A, ldb=nr_rows_B, ldc=nr_rows_C;
cuComplex alf;
cuComplex bet;
@@ -380,6 +321,11 @@ void beamform() {
ldc, // Leading dimension of each batch or matrix in array C.
batchCount); // Number of batches in each array.
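// Inferred from the setup in init_beamformer(): each of the batchCount
// (presumably BN_BIN) small products multiplies one bin's weight matrix
// (A, BN_BEAM rows) by that bin's restructured element-by-time data (B) to
// produce a BN_BEAM x BN_TIME slab of d_beamformed; the per-bin base pointers
// come from the d_arr_A/B/C arrays. The A/B dimensions sit in elided hunks,
// so the inner (element) dimension is inferred from the data_restructure
// kernel below.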
if (stat == CUBLAS_STATUS_INVALID_VALUE) {
printf("RTBF: Invalid CUBLAS values\n");
} else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
printf("RTBF: Execution failed.\n");
}
if(stat != CUBLAS_STATUS_SUCCESS){
cerr << "cublasCgemmBatched failed" << endl;
@@ -387,16 +333,80 @@ void beamform() {
}
assert(!cudaGetLastError());
//Free GPU memory
// cudaFree(d_A);
// cudaFree(d_B);
// cudaFree(d_C);
}
__global__
void transpose(signed char* data, signed char* tra_data) {
int i = threadIdx.x;
int c = threadIdx.y;
int m = blockIdx.x;
int f = blockIdx.y;
int t = blockIdx.z;
//int Nm = gridDim.x; // number of mcnts (packets)
int Nf = gridDim.y; // number of f-engines (ROACHES)
int Nt = gridDim.z; // time samples per mcnt
int Ni = blockDim.x; // inputs per f-engine (aka antenna elements per ROACH)
int Nc = blockDim.y; // bins per mcnt
int in_idx = i + Ni*c + Nc*Ni*t + Nt*Nc*Ni*f + Nf*Nt*Nc*Ni*m;
int out_idx = i + Ni*f + Nf*Ni*c + Nc*Nf*Ni*t + Nt*Nc*Nf*Ni*m;
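// Index mapping implied by the strides above: in_idx reads the input as
// [m][f][t][c][i] (mcnt slowest, input fastest), while out_idx writes
// [m][t][c][f][i], pulling the f-engine axis in next to the input axis so
// that all Nf*Ni element samples of a given (mcnt, time, channel) land
// contiguously in the transposed buffer.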
// Destroy the handle
//cublasDestroy(handle);
tra_data[2*out_idx] = data[2*in_idx];
tra_data[2*out_idx + 1] = data[2*in_idx+1];
return;
}
__global__
void data_restructure(signed char * data, cuComplex * data_restruc){
/*
Repurpose the transpose thread in the hashpipe codes by performing the transpose in the GPU.
The motivation was, why transpose then transpose again? Why not just perform one transpose
in the GPU which would be faster anyway.
*/
int i = threadIdx.x;
int c = threadIdx.y;
int m = blockIdx.x;
int f = blockIdx.y;
int t = blockIdx.z;
int Nm = gridDim.x; // number of mcnts (packets)
int Nf = gridDim.y; // number of f-engines (ROACHES)
int Nt = gridDim.z; // time samples per mcnt
int Ni = blockDim.x; // inputs per f-engine (aka antenna elements per ROACH)
int Nc = blockDim.y; // bins per mcnt
int in_idx = i + Ni*c + Nc*Ni*t + Nt*Nc*Ni*f + Nf*Nt*Nc*Ni*m;
int out_idx = i + Ni*f + Nf*Ni*t + Nt*Nf*Ni*m + Nm*Nt*Nf*Ni*c;
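// Layout implied by the strides above: the channel index c is the slowest
// moving, then mcnt, time-within-mcnt, f-engine, and input, so each frequency
// bin becomes one contiguous (elements x time) block -- the per-bin
// column-major matrix that the batched GEMM in beamform() presumably consumes
// through d_arr_B.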
data_restruc[out_idx].x = data[2*in_idx]*1.0f;
data_restruc[out_idx].y = data[2*in_idx + 1]*1.0f;
return;
/*
// Original Code
int e = threadIdx.x;
int t = blockIdx.x;
int f = blockIdx.y;
//Restructure data so that the frequency bin is the slowest moving index
data_restruc[f*BN_TIME*BN_ELE_BLOC + t*BN_ELE_BLOC + e].x = data[2*(t*BN_BIN*BN_ELE_BLOC + f*BN_ELE_BLOC + e)]*1.0f;
data_restruc[f*BN_TIME*BN_ELE_BLOC + t*BN_ELE_BLOC + e].y = data[2*(t*BN_BIN*BN_ELE_BLOC + f*BN_ELE_BLOC + e) + 1]*1.0f;
return;
*/
}
__global__
void sti_reduction(cuComplex * data_in, float * data_out) {
@@ -406,7 +416,7 @@ void sti_reduction(cuComplex * data_in, float * data_out) {
int s = blockIdx.z;
int h = sample_idx(s*BN_TIME_STI + t,b,f); // Preprocessor macro used for the output of the beamformer. More detail can be seen in the header file. (First set of beams)
int h1 = sample_idx(s*BN_TIME_STI + t,b+BN_BEAM1,f); // Preprocessor macro used for the output of the beamformer. More detail can be seen in the header file. (Last set of beams)
// Temporary variables used for updating.
float beam_power1;
@@ -467,9 +477,13 @@ void sti_reduction(cuComplex * data_in, float * data_out) {
data_out[output_idx(2,b,s,f)] = reduced_array1[0].x*scale; // XY* real.
data_out[output_idx(3,b,s,f)] = reduced_array1[0].y*scale; // XY* imaginary.
}
return;
}
void run_beamformer(signed char * data_in, float * data_out) {
cudaError_t err_code;
// Specify grid and block dimensions
dim3 dimBlock(BN_STI_BLOC, 1, 1);
dim3 dimGrid(BN_BIN, BN_BEAM1, BN_STI);
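// These launch dimensions presumably map one thread block per (frequency bin,
// beam, STI window), with BN_STI_BLOC threads reducing over time and each
// block handling both halves of the beam set (b and b+BN_BEAM1, per the
// sample_idx calls in sti_reduction() above).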
@@ -478,19 +492,43 @@ void run_beamformer(signed char * data_in, float * data_out) {
dim3 dimBlock_d(BN_ELE_BLOC, 1, 1);
dim3 dimGrid_d(BN_TIME, BN_BIN, 1);
signed char * d_restruct_in = d_data1;
int Nm = 200;
int Nf = 8;
int Nt = 20;
int Nc = 25;
int Ni = 8;
dim3 gridDim_transpose(Nm, Nf, Nt);
dim3 blockDim_transpose(Ni, Nc, 1);
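// Note: these hard-coded launch dimensions encode the assumed packet layout
// (Nm mcnts per block, Nt time samples and Nc channels per mcnt, Nf ROACHes
// with Ni inputs each) and must agree with what the net thread writes into
// the input block.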
signed char* d_tra_data_in = d_data1;
//signed char* d_tra_data_out = d_data2;
//signed char * d_restruct_in = d_data1;
cuComplex * d_restruct_out = d_data;
cudaMemcpy(d_restruct_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
//cudaMemcpy(d_restruct_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
cudaMemcpy(d_tra_data_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
err_code = cudaGetLastError();
if (err_code != cudaSuccess) {
printf("RTBF: cudaMemcpy Failed: %s\n", cudaGetErrorString(err_code));
}
// Restructure data for cublasCgemmBatched function.
data_restructure<<<dimGrid_d, dimBlock_d>>>(d_restruct_in, d_restruct_out);
// Transpose the data
// transpose<<<gridDim_transpose, blockDim_transpose>>>(d_tra_data_in, d_tra_data_out);
// if (err_code != cudaSuccess) {
// printf("RTBF: CUDA Error (transpose): %s\n", cudaGetErrorString(err_code));
// }
// printf("Starting beamformer\n");
// Restructure data for cublasCgemmBatched function.
data_restructure<<<dimGrid_d, dimBlock_d>>>(d_tra_data_in, d_restruct_out);
//data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_restruct_in, d_restruct_out);
//data_restructure<<<dimGrid_d, dimBlock_d>>>(d_restruct_in, d_restruct_out);
if (err_code != cudaSuccess) {
printf("RTBF: CUDA Error (data_restructure): %s\n", cudaGetErrorString(err_code));
}
// Call beamformer function containing cublasCgemmBatched()
beamform();
cudaError_t err_code = cudaGetLastError();
err_code = cudaGetLastError();
if (err_code != cudaSuccess) {
printf("CUDA Error (beamform): %s\n", cudaGetErrorString(err_code));
}
@@ -498,13 +536,9 @@ void run_beamformer(signed char * data_in, float * data_out) {
cuComplex * d_sti_in = d_beamformed;
float * d_sti_out = d_outputs;
// printf("Starting sti_reduction\n");
// Call STI reduction kernel.
sti_reduction<<<dimGrid, dimBlock>>>(d_sti_in, d_sti_out);
// printf("Finishing sti_reduction\n");
err_code = cudaGetLastError();
if (err_code != cudaSuccess) {
printf("CUDA Error (sti_reduction): %s\n", cudaGetErrorString(err_code));
@@ -513,8 +547,7 @@ void run_beamformer(signed char * data_in, float * data_out) {
// Copy output data from device to host.
cudaMemcpy(data_out, d_sti_out, BN_POL*(BN_OUTPUTS*sizeof(float)/2),cudaMemcpyDeviceToHost);
// cudaFree(d_data);
// cudaFree(d_outputs);
return;
}
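/*
 * Minimal usage sketch of the RTBF interface as it appears in this file
 * (illustrative only; the caller name, "weights.bin", and the raw input block
 * are assumptions -- buffer sizes follow the cudaMemcpy calls above):
 */
#if 0
void example_rtbf_call(signed char *raw_block /* 2*BN_SAMP signed chars */) {
	init_beamformer();              // device buffers + cuBLAS handle
	update_weights("weights.bin");  // load, conjugate, and copy weights to the GPU

	// Output: BN_POL*BN_OUTPUTS/2 floats, matching the device-to-host copy above.
	float *sti_out = (float *)malloc(BN_POL*(BN_OUTPUTS*sizeof(float)/2));

	run_beamformer(raw_block, sti_out);

	free(sti_out);
	rtbfCleanup();                  // free device memory and destroy the handle
}
#endif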
@@ -532,6 +565,10 @@ void rtbfCleanup() {
cudaFree(d_data1);
}
if (d_data2 != NULL) {
cudaFree(d_data2);
}
if (d_outputs != NULL) {
cudaFree(d_outputs);
}
@@ -553,4 +590,5 @@ void rtbfCleanup() {
}
// Free up and release cublas handle
cublasDestroy(handle);
}
@@ -117,7 +117,6 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
int outDataSize = countFFT * g_iNumSubBands * g_iNFFT;
//CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, fftOutPtr, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
//printf("making sure new build...\n");
CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, g_pf2FFTOut_d, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
return iRet;
......
@@ -16,7 +16,7 @@
#define FALSE 0
#define TRUE 1
#define DEBUG 1
//#define DEBUG 1
#define DEF_CUDA_DEVICE 0
......
@@ -41,9 +41,15 @@ flag_x_threads = flag_net_thread.c \
flag_correlator_thread.c \
flag_corsave_thread.c
# CPU Transpose
#flag_b_threads = flag_net_thread.c \
# flag_transpose_thread.c \
# flag_beamform_thread.c \
# flag_beamsave_thread.c
# GPU Transpose
flag_b_threads = flag_net_thread.c \
flag_transpose_thread.c \
flag_beamform_thread.c \
flag_transpose_beamform_thread.c \
flag_beamsave_thread.c
flag_f_threads = flag_net_thread.c \
......
@@ -138,12 +138,12 @@ cmd_t check_cmd(int fifo_fd)
}
else if (strncasecmp(cmd,"STOP",MAX_CMD_LEN)==0)
{
printf("FIFO: A STOP was issued to the hashpipe codes!!!!!!!!!!!!!!!!!!\n");
printf("FIFO: A STOP was issued to the hashpipe codes!!!!!!!!!!!!!!!!!!\n");
return STOP;
}
else if (strncasecmp(cmd,"QUIT",MAX_CMD_LEN)==0)
{
printf("FIFO: A QUIT was issued to the hashpipe codes!!!!!!!!!!!!!!!!!!\n");
printf("FIFO: A QUIT was issued to the hashpipe codes!!!!!!!!!!!!!!!!!!\n");
return QUIT;
}
else
......
@@ -94,17 +94,16 @@ static void * run(hashpipe_thread_args_t * args) {
state cur_state = ACQUIRE;
state next_state = ACQUIRE;
int64_t good_data = 1;
char weight_flag[8];
int weight_flag;
char netstat[17];
char weight_file[17];
// Indicate in shared memory buffer that this thread is ready to start
hashpipe_status_lock_safe(&st);
hputi4(st.buf, "CORREADY", 1);
hputi4(st.buf, "RBFREADY", 1);
hashpipe_status_unlock_safe(&st);
int check_count = 0;
// Main loop for thread
while (run_threads()) {
@@ -117,26 +116,23 @@ static void * run(hashpipe_thread_args_t * args) {
hashpipe_status_lock_safe(&st);
hgetl(st.buf, "CLEANB", &cleanb);
hgets(st.buf, "NETSTAT", 16, netstat);
hgets(st.buf, "WFLAG", 8, weight_flag);
hgetl(st.buf, "WFLAG", &weight_flag);
hashpipe_status_unlock_safe(&st);
if (cleanb == 0 && strcmp(netstat, "CLEANUP") == 0) {
next_state = CLEANUP;
printf("BF: Entering CLEANUP state\n");
break;
}
if (strcmp(weight_flag,"1") == 0){
if(weight_flag) {
hashpipe_status_lock_safe(&st);
hgets(st.buf,"BWEIFILE",16,weight_file);
hashpipe_status_unlock_safe(&st);
sprintf(w_dir, "%s\%s", weightdir, weight_file);
printf("BF: Weight file name: %s\n", w_dir);
printf("RTBF: Weight file name: %s\n", w_dir);
printf("RTB: Initializing beamformer weights...\n");
// update_weights(weight_file);
printf("RTBF: Initializing beamformer weights...\n");
update_weights(w_dir);
printf("RTB: Finished updating weights...\n");
printf("RTBF: Finished updating weights...\n");
// Put metadata into status shared memory
float offsets[BN_BEAM];
char cal_filename[65];
@@ -169,10 +165,7 @@ static void * run(hashpipe_thread_args_t * args) {
hputs(st.buf, "BALGORIT", algorithm);
hputs(st.buf, "BWFILE", weight_filename);
hgeti4(st.buf, "XID", &act_xid);
hashpipe_status_unlock_safe(&st);
hashpipe_status_lock_safe(&st);
hputs(st.buf,"WFLAG","0");
hputl(st.buf,"WFLAG",0);
hashpipe_status_unlock_safe(&st);
}
}
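/*
 * Illustrative sketch (assumed, not from this commit) of how a controlling
 * thread sharing the same status buffer would request a weight reload under
 * the integer WFLAG convention used above; the key names WFLAG/BWEIFILE and
 * the lock/put calls mirror the ones in this thread:
 */
#if 0
static void request_weight_reload(hashpipe_status_t *st, char *weight_file) {
	hashpipe_status_lock_safe(st);
	hputs(st->buf, "BWEIFILE", weight_file); // file name relative to weightdir
	hputl(st->buf, "WFLAG", 1);              // beamform thread resets this to 0
	hashpipe_status_unlock_safe(st);
}
#endif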
@@ -186,71 +179,78 @@ static void * run(hashpipe_thread_args_t * args) {
// If CLEANUP, don't continue processing
if (next_state != CLEANUP) {
// Print out the header information for this block
flag_gpu_input_header_t tmp_header;
memcpy(&tmp_header, &db_in->block[curblock_in].header, sizeof(flag_gpu_input_header_t));
good_data = tmp_header.good_data;
hashpipe_status_lock_safe(&st);
hputi4(st.buf, "BEAMMCNT", tmp_header.mcnt);
hashpipe_status_unlock_safe(&st);
// Wait for output block to become free
while ((rv=flag_gpu_beamformer_output_databuf_wait_free(db_out, curblock_out)) != HASHPIPE_OK) {
if (rv==HASHPIPE_TIMEOUT) {
continue;
} else {