Commit 8efa733c authored by Mark Ruzindana's avatar Mark Ruzindana
Browse files

PFB scalloping fix code modifications and additions.

parent 61e94c60
......@@ -369,7 +369,7 @@ void data_restructure(signed char * data, cuComplex * data_restruc){
in the GPU which would be faster anyway.
*/
/*
int i = threadIdx.x;
int c = threadIdx.y;
......@@ -391,8 +391,9 @@ void data_restructure(signed char * data, cuComplex * data_restruc){
data_restruc[out_idx].y = data[2*in_idx + 1]*1.0f;
return;
*/
/*
// Original Code
int e = threadIdx.x;
int t = blockIdx.x;
......@@ -404,7 +405,7 @@ void data_restructure(signed char * data, cuComplex * data_restruc){
return;
*/
}
......@@ -501,13 +502,13 @@ void run_beamformer(signed char * data_in, float * data_out) {
dim3 gridDim_transpose(Nm, Nf, Nt);
dim3 blockDim_transpose(Ni, Nc, 1);
//signed char* d_tra_data_in = d_data1;
signed char* d_tra_data_in = d_data1;
//signed char* d_tra_data_out = d_data2;
signed char * d_restruct_in = d_data1;
//signed char * d_restruct_in = d_data1;
cuComplex * d_restruct_out = d_data;
cudaMemcpy(d_restruct_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
//cudaMemcpy(d_tra_data_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
//cudaMemcpy(d_restruct_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
cudaMemcpy(d_tra_data_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
err_code = cudaGetLastError();
if (err_code != cudaSuccess) {
printf("RTBF: cudaMemcpy Failed: %s\n", cudaGetErrorString(err_code));
......@@ -520,9 +521,9 @@ void run_beamformer(signed char * data_in, float * data_out) {
// }
// Restructure data for cublasCgemmBatched function.
//data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_tra_data_in, d_restruct_out);
data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_tra_data_in, d_restruct_out);
//data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_restruct_in, d_restruct_out);
data_restructure<<<dimGrid_d, dimBlock_d>>>(d_restruct_in, d_restruct_out);
//data_restructure<<<dimGrid_d, dimBlock_d>>>(d_restruct_in, d_restruct_out);
if (err_code != cudaSuccess) {
printf("RTBF: CUDA Error (data_restructure): %s\n", cudaGetErrorString(err_code));
}
......
......@@ -7,12 +7,12 @@
#include <math.h>
#define BN_ELE 38 // Number of elements/antennas in the array
#define BN_BIN 25 // Number of frequency bins
#define BN_TIME 4000 //40 // Number of decimated time samples
#define BN_BIN 20 // 25 // Number of frequency bins
#define BN_TIME 8000 // 4000 //40 // Number of decimated time samples
#define BN_BEAM 14 // Number of beams we are forming
#define BN_POL 4
#define BN_BEAM1 (BN_BEAM/2) // Number of beams we are forming
#define BN_TIME_STI 40 //40 // Number of decimated time samples per integrated beamformer output
#define BN_TIME_STI 80 //40 // Number of decimated time samples per integrated beamformer output
#define BN_STI (BN_TIME/BN_TIME_STI) // Number of short time integrations
#define BN_STI_BLOC 64
#define BN_ELE_BLOC 64
......
......@@ -7,9 +7,9 @@
#define NF 8 // Number of Fengines
#define NI 8 // Number of inputs per Fengine
#define NA (NF*NI) // Number of total antennas
#define NC 25 // Number of frequency channels
#define NC 20 // 25 // Number of frequency channels
#define NT 20 // Number of time samples per packet/mcnt
#define NM 200 // Number of packets/mcnts per block
#define NM 400 // 200 // Number of packets/mcnts per block
#define pow1 32768 // Next power of 2 >= Nm*Nt
#define nblocks2 (pow1/1024) // Block size for second kernel
#define nblocks1 (pow1/nblocks2) // Block size for first kernel
......
......@@ -10,7 +10,7 @@ import getopt
import math
import numpy
import scipy.signal as sp
import matplotlib.pyplot as plotter
#import matplotlib.pyplot as plotter
# function definitions
def PrintUsage(ProgName):
......@@ -142,16 +142,16 @@ FileCoeff = open("coeff_" \
if ("signedchar" == DataType):
FileCoeff.write(PFBCoeffInt8)
# plot the coefficients
if (Plot):
plotter.plot(PFBCoeffInt8)
#if (Plot):
# plotter.plot(PFBCoeffInt8)
else:
FileCoeff.write(PFBCoeffFloat32)
# plot the coefficients
if (Plot):
plotter.plot(PFBCoeffFloat32)
#if (Plot):
# plotter.plot(PFBCoeffFloat32)
FileCoeff.close()
if (Plot):
plotter.show()
#if (Plot):
# plotter.show()
......@@ -60,6 +60,21 @@ __global__ void PFB_kernel(char2* pc2Data,
return;
}
// Discard channels and perform FFT shift (part of scalloping solution)
__global__ void Discard_Shift_kernel(float2* pf2FFTOut, float2* pf2DiscShift)
{
int pt = threadIdx.x; // N-point FFT index
int sb = blockIdx.x; // Number of elements x coarse channels (time series) index
int st = blockIdx.y; // Windows index (4000/32 = 125 windows)
int i = blockIdx.z; // Chunks of channels to recover
// Both pre-processor macros are defined in kernels.h
pf2DiscShift[fftshift_idx(pt,i,sb,st)].x = pf2FFTOut[recover_idx(pt,i,sb,st)].x;
pf2DiscShift[fftshift_idx(pt,i,sb,st)].y = pf2FFTOut[recover_idx(pt,i,sb,st)].y;
return;
}
// When PFB disabled just perform FFT.
__global__ void CopyDataForFFT(char2 *pc2Data, float2 *pf2FFTIn)
{
......
......@@ -4,8 +4,17 @@
#include <cuda.h>
#include <cufft.h>
#define N_POINTS_FFT 64
#define N_ELEMENTS 64
#define N_FINE_CHANS 5
// Discard channels and perform FFT shift
#define recover_idx(pt,i,sb,st) ((pt+(48*i)) + N_POINTS_FFT*(sb) + N_POINTS_FFT*N_ELEMENTS*N_FINE_CHANS*(st))
#define fftshift_idx(pt,i,sb,st) ((pt+(16*(1-i))) + (N_POINTS_FFT/2)*(sb) + (N_POINTS_FFT/2)*N_ELEMENTS*N_FINE_CHANS*(st))
// stuct of parameters for PFB. Values indicate default values.
#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
//#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
#define DEFAULT_PFB_PARAMS {8000, 64, 8, 20, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
// plot 1 mean to hide the plot of the filter before continuing.
typedef struct {
int samples;
......@@ -24,6 +33,7 @@ typedef struct {
__global__ void PFB_kernel(char2* pc2Data, float2* pf2FFTIn, float* pfPFBCoeff, params pfbParams);
__global__ void Discard_Shift_kernel(float2* pf2FFTOut, float2* pf2DiscShift);
__global__ void map(char* dataIn, char2* dataOut, int channelSelect, params pfbParams);
__global__ void CopyDataForFFT(char2* pc2Data, float2* pf2FFTIn);
__global__ void saveData(char2* dataIn, char2* dataOut);
......
#ifdef __cplusplus
extern "C" {
#include "kernels.h"
}
#endif
__global__ void map(char* dataIn,
char2* dataOut,
int channelSelect,
params pfbParams)
{
// select the channel range
int channelMin = pfbParams.fine_channels*channelSelect;
int absIdx = 2 * blockDim.y*(blockIdx.x*pfbParams.coarse_channels + (channelMin+blockIdx.y)) + 2 * threadIdx.y; // times 2 because we are mapping a sequence of values to char2 array.
int mapIdx = blockDim.y*(blockIdx.x*gridDim.y + blockIdx.y) + threadIdx.y;
dataOut[mapIdx].x = dataIn[absIdx];
dataOut[mapIdx].y = dataIn[absIdx+1];
return;
}
/* prepare data for PFB */
__global__ void PFB_kernel(char2* pc2Data,
float2* pf2FFTIn,
float* pfPFBCoeff,
params pfbParams)
{
int blkIdx = blockIdx.y * gridDim.x + blockIdx.x;
int i = blkIdx*blockDim.x + threadIdx.x;
int absCoeff = (blockIdx.x * blockDim.x) + threadIdx.x;
int iNFFT = (gridDim.x * blockDim.x);
int j = 0;
int iAbsIdx = 0;
int coeffIdx = 0;
float2 f2PFBOut = make_float2(0.0, 0.0);
char2 c2Data = make_char2(0, 0);
for (j = 0; j < pfbParams.taps; ++j)
{
/* calculate the absolute index */
iAbsIdx = (j * iNFFT) + i;
coeffIdx = (j * iNFFT) + absCoeff;
/* get the address of the block */
c2Data = pc2Data[iAbsIdx];
f2PFBOut.x += (float) c2Data.x * pfPFBCoeff[coeffIdx];
f2PFBOut.y += (float) c2Data.y * pfPFBCoeff[coeffIdx];
}
pf2FFTIn[i] = f2PFBOut;
return;
}
// Discard channels and perform FFT shift (part of scalloping solution)
__global__ void Discard_Shift_kernel(float2* pf2FFTOut, float2* pf2DiscShift)
{
int pt = threadIdx.x; // N-point FFT index
int sb = blockIdx.x; // Number of elements x coarse channels (time series) index
int st = blockIdx.y; // Windows index (4000/32 = 125 windows)
int i = blockIdx.z; // Chunks of channels to recover
// Both pre-processor macros are defined in kernels.h
pf2DiscShift[fftshift_idx(pt,i,sb,st)].x = pf2FFTOut[recover_idx(pt,i,sb,st)].x;
pf2DiscShift[fftshift_idx(pt,i,sb,st)].y = pf2FFTOut[recover_idx(pt,i,sb,st)].y;
return;
}
// When PFB disabled just perform FFT.
__global__ void CopyDataForFFT(char2 *pc2Data, float2 *pf2FFTIn)
{
int blkIdx = blockIdx.y * gridDim.x + blockIdx.x;
int i = blkIdx*blockDim.x + threadIdx.x;
pf2FFTIn[i].x = (float) pc2Data[i].x;
pf2FFTIn[i].y = (float) pc2Data[i].y;
return;
}
// prepares for the next PFB.
__global__ void saveData(char2* dataIn, char2* dataOut){
int i = blockIdx.y*(gridDim.x*blockDim.x) + blockIdx.x*blockDim.x + threadIdx.x;
dataOut[i] = dataIn[i];
return;
}
#ifndef __KERNELS_H__
#define __KERNELS_H__
#include <cuda.h>
#include <cufft.h>
#define N_POINTS_FFT 64
#define N_ELEMENTS 64
#define N_FINE_CHANS 5
// Discard channels and perform FFT shift
#define recover_idx(pt,i,sb,st) ((pt+(48*i)) + N_POINTS_FFT*(sb) + N_POINTS_FFT*N_ELEMENTS*N_FINE_CHANS*(st))
#define fftshift_idx(pt,i,sb,st) ((pt+(16*(1-i))) + (N_POINTS_FFT/2)*(sb) + (N_POINTS_FFT/2)*N_ELEMENTS*N_FINE_CHANS*(st))
// stuct of parameters for PFB. Values indicate default values.
//#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
#define DEFAULT_PFB_PARAMS {8000, 64, 8, 20, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
// plot 1 mean to hide the plot of the filter before continuing.
typedef struct {
int samples;
int nfft;
int taps;
int coarse_channels;
int fine_channels;
int elements;
int subbands;
int select;
char* window;
char* dataType;
char* coeffPath;
int plot;
} params;
__global__ void PFB_kernel(char2* pc2Data, float2* pf2FFTIn, float* pfPFBCoeff, params pfbParams);
__global__ void Discard_Shift_kernel(float2* pf2FFTOut, float2* pf2DiscShift);
__global__ void map(char* dataIn, char2* dataOut, int channelSelect, params pfbParams);
__global__ void CopyDataForFFT(char2* pc2Data, float2* pf2FFTIn);
__global__ void saveData(char2* dataIn, char2* dataOut);
#endif
#ifdef __cplusplus
extern "C" {
#include "kernels.h"
}
#endif
__global__ void map(char* dataIn,
char2* dataOut,
int channelSelect,
params pfbParams)
{
// select the channel range
int channelMin = pfbParams.fine_channels*channelSelect;
int absIdx = 2 * blockDim.y*(blockIdx.x*pfbParams.coarse_channels + (channelMin+blockIdx.y)) + 2 * threadIdx.y; // times 2 because we are mapping a sequence of values to char2 array.
int mapIdx = blockDim.y*(blockIdx.x*gridDim.y + blockIdx.y) + threadIdx.y;
dataOut[mapIdx].x = dataIn[absIdx];
dataOut[mapIdx].y = dataIn[absIdx+1];
return;
}
/* prepare data for PFB */
__global__ void PFB_kernel(char2* pc2Data,
float2* pf2FFTIn,
float* pfPFBCoeff,
params pfbParams)
{
int blkIdx = blockIdx.y * gridDim.x + blockIdx.x;
int i = blkIdx*blockDim.x + threadIdx.x;
int absCoeff = (blockIdx.x * blockDim.x) + threadIdx.x;
int iNFFT = (gridDim.x * blockDim.x);
int j = 0;
int iAbsIdx = 0;
int coeffIdx = 0;
float2 f2PFBOut = make_float2(0.0, 0.0);
char2 c2Data = make_char2(0, 0);
for (j = 0; j < pfbParams.taps; ++j)
{
/* calculate the absolute index */
iAbsIdx = (j * iNFFT) + i;
coeffIdx = (j * iNFFT) + absCoeff;
/* get the address of the block */
c2Data = pc2Data[iAbsIdx];
f2PFBOut.x += (float) c2Data.x * pfPFBCoeff[coeffIdx];
f2PFBOut.y += (float) c2Data.y * pfPFBCoeff[coeffIdx];
}
pf2FFTIn[i] = f2PFBOut;
return;
}
// When PFB disabled just perform FFT.
__global__ void CopyDataForFFT(char2 *pc2Data, float2 *pf2FFTIn)
{
int blkIdx = blockIdx.y * gridDim.x + blockIdx.x;
int i = blkIdx*blockDim.x + threadIdx.x;
pf2FFTIn[i].x = (float) pc2Data[i].x;
pf2FFTIn[i].y = (float) pc2Data[i].y;
return;
}
// prepares for the next PFB.
__global__ void saveData(char2* dataIn, char2* dataOut){
int i = blockIdx.y*(gridDim.x*blockDim.x) + blockIdx.x*blockDim.x + threadIdx.x;
dataOut[i] = dataIn[i];
return;
}
#ifndef __KERNELS_H__
#define __KERNELS_H__
#include <cuda.h>
#include <cufft.h>
// stuct of parameters for PFB. Values indicate default values.
#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
// plot 1 mean to hide the plot of the filter before continuing.
typedef struct {
int samples;
int nfft;
int taps;
int coarse_channels;
int fine_channels;
int elements;
int subbands;
int select;
char* window;
char* dataType;
char* coeffPath;
int plot;
} params;
__global__ void PFB_kernel(char2* pc2Data, float2* pf2FFTIn, float* pfPFBCoeff, params pfbParams);
__global__ void map(char* dataIn, char2* dataOut, int channelSelect, params pfbParams);
__global__ void CopyDataForFFT(char2* pc2Data, float2* pf2FFTIn);
__global__ void saveData(char2* dataIn, char2* dataOut);
#endif
......@@ -13,6 +13,7 @@ char2* g_pc2DataRead_d = NULL;
float2* g_pf2FFTIn_d = NULL;
float2* g_pf2FFTOut_d = NULL;
float2* g_pf2DiscShift_d = NULL;
float *g_pfPFBCoeff = NULL;
float *g_pfPFBCoeff_d = NULL;
......@@ -85,6 +86,10 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
while(!g_IsProcDone) {
//FFT
iRet = doFFT();
// New Code ///////////////////////////////////
Discard_Shift_kernel<<<g_dimGPFB, g_dimBPFB>>>(g_pf2FFTOut_d, g_pf2DiscShift_d);
/////////////////////////////////////////////////
if(iRet != EXIT_SUCCESS) {
(void) fprintf(stderr, "ERROR: FFT failed\n");
cleanUp();
......@@ -96,6 +101,7 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
// step input and output buffers.
g_pf2FFTIn_d += g_iNumSubBands * g_iNFFT;
g_pf2FFTOut_d += g_iNumSubBands * g_iNFFT;
g_pf2DiscShift_d += g_iNumSubBands * (g_iNFFT/2);
lProcData += g_iNumSubBands * g_iNFFT;
if(lProcData >= ltotData - NUM_TAPS*g_iNumSubBands*g_iNFFT){ // >= process 117 ffts leaving 256 time samples, > process 118 ffts leaving 224 time samples.
......@@ -114,10 +120,14 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
//wind back in/out ptrs - should put in another pointer as a process read ptr instead of updating the global ptr.
g_pf2FFTOut_d = g_pf2FFTOut_d - countFFT*g_iNumSubBands*g_iNFFT;
g_pf2FFTIn_d = g_pf2FFTIn_d -countFFT*g_iNumSubBands*g_iNFFT;
g_pf2DiscShift_d = g_pf2DiscShift_d - countFFT*g_iNumSubBands*(g_iNFFT/2);
int outDataSize = countFFT * g_iNumSubBands * g_iNFFT;
//int outDataSize = countFFT * g_iNumSubBands * g_iNFFT;
// Modified variable outDataSize 1/2 of g_iNFFT due to the discard of half the channels //////
int outDataSize = countFFT * g_iNumSubBands * (g_iNFFT/2);
//CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, fftOutPtr, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, g_pf2FFTOut_d, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, g_pf2DiscShift_d, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
////////////////////////////////////////////////////////////////////////////////////////////
return iRet;
......@@ -302,11 +312,20 @@ int initPFB(int iCudaDevice, params pfbParams){
//int sizeDataBlock_in = g_iNumSubBands * g_iNFFT * sizeof(float2);
int sizeDataBlock_in = pfbParams.samples*g_iNumSubBands * sizeof(float2);
int sizeTotalDataBlock_out = pfbParams.samples*g_iNumSubBands * sizeof(float2); // output fft array same size as output data for convinence the full size is not used. In the pfb function the output data will be the fft counter times block amount in the fft.
// New variables ///////////////////////////////////
int g_iNwindows = 125; // with 8000 time samples, but with 4032 samples, Nwindows = 63
int sizeTotalDataBlock_disc = (g_iNFFT/2)*g_iNwindows*g_iNumSubBands * sizeof(float2); // Not sure about the value of this yet.
//////////////////////////////////////////////////
CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2FFTIn_d, sizeDataBlock_in));
CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2FFTOut_d, sizeTotalDataBlock_out)); // goal will be to update the output ptr each time it fires.
// New Code ////////////////////////////////////////////////////////
CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2DiscShift_d, sizeTotalDataBlock_disc)); // Discard and shift array
////////////////////////////////////////////////////////////////////
CUDASafeCallWithCleanUp(cudaMemset((void *) g_pf2FFTIn_d, 0, sizeDataBlock_in));
CUDASafeCallWithCleanUp(cudaMemset((void *) g_pf2FFTOut_d, 0, sizeTotalDataBlock_out));
// New Code ////////////////////////////////////////////////////////////
CUDASafeCallWithCleanUp(cudaMemset((void *) g_pf2DiscShift_d, 0, sizeTotalDataBlock_disc));
////////////////////////////////////////////////////////////////////////
// set kernel parameters
(void) fprintf(stdout, "\tSetting kernel parameters...\n");
......@@ -320,8 +339,11 @@ int initPFB(int iCudaDevice, params pfbParams){
g_dimGPFB.x = (g_iNumSubBands * g_iNFFT) / g_dimBPFB.x;
g_dimGCopy.x = (g_iNumSubBands * g_iNFFT) / g_dimBCopy.x;
g_dimGPFB.y = 125;
g_dimGCopy.y = 125;
g_dimGPFB.y = 125; // with 8000 time samples, but with 4032 samples, g_dimBPFB.y = 63
g_dimGCopy.y = 125; // same as g_dimGPFB.y
// g_dimBPFB.y = 63; // No. of windows given 64 point FFTs and 4000 time samples per block. Since 4000/64=62.5, I need to see whether 32 samples should be discarded.
// g_dimGPFB.y = 63; // No. of windows given 64 point FFTs and 4000 time samples per block
// map kernel params
mapGSize.x = pfbParams.samples;
......@@ -444,6 +466,10 @@ void cleanUp() {
(void) cudaFree(g_pf2FFTOut_d);
g_pf2FFTOut_d = NULL;
}
if (g_pf2DiscShift_d != NULL) {
(void) cudaFree(g_pf2DiscShift_d);
g_pf2DiscShift_d = NULL;
}
free(g_pfPFBCoeff);
(void) cudaFree(g_pfPFBCoeff_d);
......
......@@ -21,12 +21,12 @@
#define DEF_CUDA_DEVICE 0
#define DEF_SIZE_READ 262144 // data block size. should this be set dynamically once I get the data?
#define DEF_LEN_SPEC 32 // Transform size
#define DEF_LEN_SPEC 64 // Transform size (previously 32)
#define NUM_TAPS 8 // PFB Decimation factor
#define DEF_NUM_CHANNELS 25 // System spec for total number of channels
#define DEF_NUM_CHANNELS 20 // System spec for total number of channels (previously 25)
#define PFB_CHANNELS 5 // Number of coarse channels through PFB
#define DEF_NUM_ELEMENTS 64 // System spec for number of elements
#define SAMPLES 4000// Time samples.
#define SAMPLES 8000// Time samples. Used to be 4000 with 64 points
#define PFB_OUTPUT_BLOCK_SIZE (SAMPLES+3*32)*PFB_CHANNELS*DEF_NUM_ELEMENTS*2 // (3*DEF_LEN_SPEC is to add more samples on the end to make it look like 128 pfb windows had been processed for the pfb correlator)
......@@ -43,6 +43,10 @@
#define FILE_COEFF_DATATYPE "float"
#define FILE_COEFF_SUFFIX ".dat"
// // Discard channels and perform FFT shift
// #define recover_idx(pt,i,sb,st) ((pt+(47*i)) + DEF_LEN_SPEC*(sb) + DEF_LEN_SPEC*DEF_NUM_ELEMENTS*PFB_CHANNELS*(st))
// #define fftshift_idx(pt,i,sb,st) ((pt+(17*(1-i))) + DEF_LEN_SPEC*(sb) + DEF_LEN_SPEC*DEF_NUM_ELEMENTS*PFB_CHANNELS*(st))
#define USEC2SEC 1e-6
typedef unsigned char BYTE;
......
#ifdef __cplusplus
extern "C" {
#include "pfb.h"
}
#endif
// data ptrs
char2* g_pc2InBuf = NULL;
char2* g_pc2InBufRead = NULL;
char2* g_pc2Data_d = NULL;
char2* g_pc2DataRead_d = NULL;
float2* g_pf2FFTIn_d = NULL;
float2* g_pf2FFTOut_d = NULL;
float2* g_pf2DiscShift_d = NULL;
float *g_pfPFBCoeff = NULL;
float *g_pfPFBCoeff_d = NULL;
char* g_pcInputData_d = NULL;
// pfb params
int g_iNFFT = DEF_LEN_SPEC;
int g_iNTaps = NUM_TAPS;
int g_iNumSubBands = PFB_CHANNELS * DEF_NUM_ELEMENTS;
// process flags
int g_IsDataReadDone = FALSE;
int g_IsProcDone = FALSE;
// size vars
int g_iSizeFile = 0;
int g_iReadCount = 0;
int g_iSizeRead = DEF_SIZE_READ;
int g_iFileCoeff = 0;
char g_acFileCoeff[256] = {0};
// GPU params
dim3 g_dimBPFB(1, 1, 1);
dim3 g_dimGPFB(1, 1);
dim3 g_dimBCopy(1, 1, 1);
dim3 g_dimGCopy(1, 1);
dim3 mapGSize(1,1,1);
dim3 mapBSize(1,1,1);
dim3 saveGSize(1, 1, 1 ); // (5, 256, 1)
dim3 saveBSize(1, 1, 1); // (64, 1, 1)
cufftHandle g_stPlan = {0};
int g_iMaxThreadsPerBlock = 0;
int g_iMaxPhysThreads = 0;
int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
g_IsProcDone = FALSE;
int iRet = EXIT_SUCCESS;
long lProcData = 0;
long ltotData = pfbParams.fine_channels*pfbParams.elements*(pfbParams.samples + pfbParams.nfft*pfbParams.taps);
int start = pfbParams.fine_channels*pfbParams.elements*pfbParams.nfft*pfbParams.taps;
int countFFT = 0;
int cpySize = pfbParams.fine_channels*pfbParams.elements*pfbParams.samples*(2*sizeof(char));
// copy data to device