Commit 8efa733c authored by Mark Ruzindana's avatar Mark Ruzindana
Browse files

PFB scalloping fix code modifications and additions.

parent 61e94c60
...@@ -369,7 +369,7 @@ void data_restructure(signed char * data, cuComplex * data_restruc){ ...@@ -369,7 +369,7 @@ void data_restructure(signed char * data, cuComplex * data_restruc){
in the GPU which would be faster anyway. in the GPU which would be faster anyway.
*/ */
/*
int i = threadIdx.x; int i = threadIdx.x;
int c = threadIdx.y; int c = threadIdx.y;
...@@ -391,8 +391,9 @@ void data_restructure(signed char * data, cuComplex * data_restruc){ ...@@ -391,8 +391,9 @@ void data_restructure(signed char * data, cuComplex * data_restruc){
data_restruc[out_idx].y = data[2*in_idx + 1]*1.0f; data_restruc[out_idx].y = data[2*in_idx + 1]*1.0f;
return; return;
*/
/*
// Original Code // Original Code
int e = threadIdx.x; int e = threadIdx.x;
int t = blockIdx.x; int t = blockIdx.x;
...@@ -404,7 +405,7 @@ void data_restructure(signed char * data, cuComplex * data_restruc){ ...@@ -404,7 +405,7 @@ void data_restructure(signed char * data, cuComplex * data_restruc){
return; return;
*/
} }
...@@ -501,13 +502,13 @@ void run_beamformer(signed char * data_in, float * data_out) { ...@@ -501,13 +502,13 @@ void run_beamformer(signed char * data_in, float * data_out) {
dim3 gridDim_transpose(Nm, Nf, Nt); dim3 gridDim_transpose(Nm, Nf, Nt);
dim3 blockDim_transpose(Ni, Nc, 1); dim3 blockDim_transpose(Ni, Nc, 1);
//signed char* d_tra_data_in = d_data1; signed char* d_tra_data_in = d_data1;
//signed char* d_tra_data_out = d_data2; //signed char* d_tra_data_out = d_data2;
signed char * d_restruct_in = d_data1; //signed char * d_restruct_in = d_data1;
cuComplex * d_restruct_out = d_data; cuComplex * d_restruct_out = d_data;
cudaMemcpy(d_restruct_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice); //cudaMemcpy(d_restruct_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
//cudaMemcpy(d_tra_data_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice); cudaMemcpy(d_tra_data_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
err_code = cudaGetLastError(); err_code = cudaGetLastError();
if (err_code != cudaSuccess) { if (err_code != cudaSuccess) {
printf("RTBF: cudaMemcpy Failed: %s\n", cudaGetErrorString(err_code)); printf("RTBF: cudaMemcpy Failed: %s\n", cudaGetErrorString(err_code));
...@@ -520,9 +521,9 @@ void run_beamformer(signed char * data_in, float * data_out) { ...@@ -520,9 +521,9 @@ void run_beamformer(signed char * data_in, float * data_out) {
// } // }
// Restructure data for cublasCgemmBatched function. // Restructure data for cublasCgemmBatched function.
//data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_tra_data_in, d_restruct_out); data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_tra_data_in, d_restruct_out);
//data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_restruct_in, d_restruct_out); //data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_restruct_in, d_restruct_out);
data_restructure<<<dimGrid_d, dimBlock_d>>>(d_restruct_in, d_restruct_out); //data_restructure<<<dimGrid_d, dimBlock_d>>>(d_restruct_in, d_restruct_out);
if (err_code != cudaSuccess) { if (err_code != cudaSuccess) {
printf("RTBF: CUDA Error (data_restructure): %s\n", cudaGetErrorString(err_code)); printf("RTBF: CUDA Error (data_restructure): %s\n", cudaGetErrorString(err_code));
} }
......
...@@ -7,12 +7,12 @@ ...@@ -7,12 +7,12 @@
#include <math.h> #include <math.h>
#define BN_ELE 38 // Number of elements/antennas in the array #define BN_ELE 38 // Number of elements/antennas in the array
#define BN_BIN 25 // Number of frequency bins #define BN_BIN 20 // 25 // Number of frequency bins
#define BN_TIME 4000 //40 // Number of decimated time samples #define BN_TIME 8000 // 4000 //40 // Number of decimated time samples
#define BN_BEAM 14 // Number of beams we are forming #define BN_BEAM 14 // Number of beams we are forming
#define BN_POL 4 #define BN_POL 4
#define BN_BEAM1 (BN_BEAM/2) // Number of beams we are forming #define BN_BEAM1 (BN_BEAM/2) // Number of beams we are forming
#define BN_TIME_STI 40 //40 // Number of decimated time samples per integrated beamformer output #define BN_TIME_STI 80 //40 // Number of decimated time samples per integrated beamformer output
#define BN_STI (BN_TIME/BN_TIME_STI) // Number of short time integrations #define BN_STI (BN_TIME/BN_TIME_STI) // Number of short time integrations
#define BN_STI_BLOC 64 #define BN_STI_BLOC 64
#define BN_ELE_BLOC 64 #define BN_ELE_BLOC 64
......
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
#define NF 8 // Number of Fengines #define NF 8 // Number of Fengines
#define NI 8 // Number of inputs per Fengine #define NI 8 // Number of inputs per Fengine
#define NA (NF*NI) // Number of total antennas #define NA (NF*NI) // Number of total antennas
#define NC 25 // Number of frequency channels #define NC 20 // 25 // Number of frequency channels
#define NT 20 // Number of time samples per packet/mcnt #define NT 20 // Number of time samples per packet/mcnt
#define NM 200 // Number of packets/mcnts per block #define NM 400 // 200 // Number of packets/mcnts per block
#define pow1 32768 // Next power of 2 >= Nm*Nt #define pow1 32768 // Next power of 2 >= Nm*Nt
#define nblocks2 (pow1/1024) // Block size for second kernel #define nblocks2 (pow1/1024) // Block size for second kernel
#define nblocks1 (pow1/nblocks2) // Block size for first kernel #define nblocks1 (pow1/nblocks2) // Block size for first kernel
......
...@@ -10,7 +10,7 @@ import getopt ...@@ -10,7 +10,7 @@ import getopt
import math import math
import numpy import numpy
import scipy.signal as sp import scipy.signal as sp
import matplotlib.pyplot as plotter #import matplotlib.pyplot as plotter
# function definitions # function definitions
def PrintUsage(ProgName): def PrintUsage(ProgName):
...@@ -142,16 +142,16 @@ FileCoeff = open("coeff_" \ ...@@ -142,16 +142,16 @@ FileCoeff = open("coeff_" \
if ("signedchar" == DataType): if ("signedchar" == DataType):
FileCoeff.write(PFBCoeffInt8) FileCoeff.write(PFBCoeffInt8)
# plot the coefficients # plot the coefficients
if (Plot): #if (Plot):
plotter.plot(PFBCoeffInt8) # plotter.plot(PFBCoeffInt8)
else: else:
FileCoeff.write(PFBCoeffFloat32) FileCoeff.write(PFBCoeffFloat32)
# plot the coefficients # plot the coefficients
if (Plot): #if (Plot):
plotter.plot(PFBCoeffFloat32) # plotter.plot(PFBCoeffFloat32)
FileCoeff.close() FileCoeff.close()
if (Plot): #if (Plot):
plotter.show() # plotter.show()
...@@ -60,6 +60,21 @@ __global__ void PFB_kernel(char2* pc2Data, ...@@ -60,6 +60,21 @@ __global__ void PFB_kernel(char2* pc2Data,
return; return;
} }
// Discard channels and perform FFT shift (part of scalloping solution)
__global__ void Discard_Shift_kernel(float2* pf2FFTOut, float2* pf2DiscShift)
{
int pt = threadIdx.x; // N-point FFT index
int sb = blockIdx.x; // Number of elements x coarse channels (time series) index
int st = blockIdx.y; // Windows index (4000/32 = 125 windows)
int i = blockIdx.z; // Chunks of channels to recover
// Both pre-processor macros are defined in kernels.h
pf2DiscShift[fftshift_idx(pt,i,sb,st)].x = pf2FFTOut[recover_idx(pt,i,sb,st)].x;
pf2DiscShift[fftshift_idx(pt,i,sb,st)].y = pf2FFTOut[recover_idx(pt,i,sb,st)].y;
return;
}
// When PFB disabled just perform FFT. // When PFB disabled just perform FFT.
__global__ void CopyDataForFFT(char2 *pc2Data, float2 *pf2FFTIn) __global__ void CopyDataForFFT(char2 *pc2Data, float2 *pf2FFTIn)
{ {
......
...@@ -4,8 +4,17 @@ ...@@ -4,8 +4,17 @@
#include <cuda.h> #include <cuda.h>
#include <cufft.h> #include <cufft.h>
#define N_POINTS_FFT 64
#define N_ELEMENTS 64
#define N_FINE_CHANS 5
// Discard channels and perform FFT shift
#define recover_idx(pt,i,sb,st) ((pt+(48*i)) + N_POINTS_FFT*(sb) + N_POINTS_FFT*N_ELEMENTS*N_FINE_CHANS*(st))
#define fftshift_idx(pt,i,sb,st) ((pt+(16*(1-i))) + (N_POINTS_FFT/2)*(sb) + (N_POINTS_FFT/2)*N_ELEMENTS*N_FINE_CHANS*(st))
// stuct of parameters for PFB. Values indicate default values. // stuct of parameters for PFB. Values indicate default values.
#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1}; //#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
#define DEFAULT_PFB_PARAMS {8000, 64, 8, 20, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
// plot 1 mean to hide the plot of the filter before continuing. // plot 1 mean to hide the plot of the filter before continuing.
typedef struct { typedef struct {
int samples; int samples;
...@@ -24,6 +33,7 @@ typedef struct { ...@@ -24,6 +33,7 @@ typedef struct {
__global__ void PFB_kernel(char2* pc2Data, float2* pf2FFTIn, float* pfPFBCoeff, params pfbParams); __global__ void PFB_kernel(char2* pc2Data, float2* pf2FFTIn, float* pfPFBCoeff, params pfbParams);
__global__ void Discard_Shift_kernel(float2* pf2FFTOut, float2* pf2DiscShift);
__global__ void map(char* dataIn, char2* dataOut, int channelSelect, params pfbParams); __global__ void map(char* dataIn, char2* dataOut, int channelSelect, params pfbParams);
__global__ void CopyDataForFFT(char2* pc2Data, float2* pf2FFTIn); __global__ void CopyDataForFFT(char2* pc2Data, float2* pf2FFTIn);
__global__ void saveData(char2* dataIn, char2* dataOut); __global__ void saveData(char2* dataIn, char2* dataOut);
......
#ifdef __cplusplus
extern "C" {
#include "kernels.h"
}
#endif
__global__ void map(char* dataIn,
char2* dataOut,
int channelSelect,
params pfbParams)
{
// select the channel range
int channelMin = pfbParams.fine_channels*channelSelect;
int absIdx = 2 * blockDim.y*(blockIdx.x*pfbParams.coarse_channels + (channelMin+blockIdx.y)) + 2 * threadIdx.y; // times 2 because we are mapping a sequence of values to char2 array.
int mapIdx = blockDim.y*(blockIdx.x*gridDim.y + blockIdx.y) + threadIdx.y;
dataOut[mapIdx].x = dataIn[absIdx];
dataOut[mapIdx].y = dataIn[absIdx+1];
return;
}
/* prepare data for PFB */
__global__ void PFB_kernel(char2* pc2Data,
float2* pf2FFTIn,
float* pfPFBCoeff,
params pfbParams)
{
int blkIdx = blockIdx.y * gridDim.x + blockIdx.x;
int i = blkIdx*blockDim.x + threadIdx.x;
int absCoeff = (blockIdx.x * blockDim.x) + threadIdx.x;
int iNFFT = (gridDim.x * blockDim.x);
int j = 0;
int iAbsIdx = 0;
int coeffIdx = 0;
float2 f2PFBOut = make_float2(0.0, 0.0);
char2 c2Data = make_char2(0, 0);
for (j = 0; j < pfbParams.taps; ++j)
{
/* calculate the absolute index */
iAbsIdx = (j * iNFFT) + i;
coeffIdx = (j * iNFFT) + absCoeff;
/* get the address of the block */
c2Data = pc2Data[iAbsIdx];
f2PFBOut.x += (float) c2Data.x * pfPFBCoeff[coeffIdx];
f2PFBOut.y += (float) c2Data.y * pfPFBCoeff[coeffIdx];
}
pf2FFTIn[i] = f2PFBOut;
return;
}
// Discard channels and perform FFT shift (part of scalloping solution)
__global__ void Discard_Shift_kernel(float2* pf2FFTOut, float2* pf2DiscShift)
{
int pt = threadIdx.x; // N-point FFT index
int sb = blockIdx.x; // Number of elements x coarse channels (time series) index
int st = blockIdx.y; // Windows index (4000/32 = 125 windows)
int i = blockIdx.z; // Chunks of channels to recover
// Both pre-processor macros are defined in kernels.h
pf2DiscShift[fftshift_idx(pt,i,sb,st)].x = pf2FFTOut[recover_idx(pt,i,sb,st)].x;
pf2DiscShift[fftshift_idx(pt,i,sb,st)].y = pf2FFTOut[recover_idx(pt,i,sb,st)].y;
return;
}
// When PFB disabled just perform FFT.
__global__ void CopyDataForFFT(char2 *pc2Data, float2 *pf2FFTIn)
{
int blkIdx = blockIdx.y * gridDim.x + blockIdx.x;
int i = blkIdx*blockDim.x + threadIdx.x;
pf2FFTIn[i].x = (float) pc2Data[i].x;
pf2FFTIn[i].y = (float) pc2Data[i].y;
return;
}
// prepares for the next PFB.
__global__ void saveData(char2* dataIn, char2* dataOut){
int i = blockIdx.y*(gridDim.x*blockDim.x) + blockIdx.x*blockDim.x + threadIdx.x;
dataOut[i] = dataIn[i];
return;
}
#ifndef __KERNELS_H__
#define __KERNELS_H__
#include <cuda.h>
#include <cufft.h>
#define N_POINTS_FFT 64
#define N_ELEMENTS 64
#define N_FINE_CHANS 5
// Discard channels and perform FFT shift
#define recover_idx(pt,i,sb,st) ((pt+(48*i)) + N_POINTS_FFT*(sb) + N_POINTS_FFT*N_ELEMENTS*N_FINE_CHANS*(st))
#define fftshift_idx(pt,i,sb,st) ((pt+(16*(1-i))) + (N_POINTS_FFT/2)*(sb) + (N_POINTS_FFT/2)*N_ELEMENTS*N_FINE_CHANS*(st))
// stuct of parameters for PFB. Values indicate default values.
//#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
#define DEFAULT_PFB_PARAMS {8000, 64, 8, 20, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
// plot 1 mean to hide the plot of the filter before continuing.
typedef struct {
int samples;
int nfft;
int taps;
int coarse_channels;
int fine_channels;
int elements;
int subbands;
int select;
char* window;
char* dataType;
char* coeffPath;
int plot;
} params;
__global__ void PFB_kernel(char2* pc2Data, float2* pf2FFTIn, float* pfPFBCoeff, params pfbParams);
__global__ void Discard_Shift_kernel(float2* pf2FFTOut, float2* pf2DiscShift);
__global__ void map(char* dataIn, char2* dataOut, int channelSelect, params pfbParams);
__global__ void CopyDataForFFT(char2* pc2Data, float2* pf2FFTIn);
__global__ void saveData(char2* dataIn, char2* dataOut);
#endif
#ifdef __cplusplus
extern "C" {
#include "kernels.h"
}
#endif
__global__ void map(char* dataIn,
char2* dataOut,
int channelSelect,
params pfbParams)
{
// select the channel range
int channelMin = pfbParams.fine_channels*channelSelect;
int absIdx = 2 * blockDim.y*(blockIdx.x*pfbParams.coarse_channels + (channelMin+blockIdx.y)) + 2 * threadIdx.y; // times 2 because we are mapping a sequence of values to char2 array.
int mapIdx = blockDim.y*(blockIdx.x*gridDim.y + blockIdx.y) + threadIdx.y;
dataOut[mapIdx].x = dataIn[absIdx];
dataOut[mapIdx].y = dataIn[absIdx+1];
return;
}
/* prepare data for PFB */
__global__ void PFB_kernel(char2* pc2Data,
float2* pf2FFTIn,
float* pfPFBCoeff,
params pfbParams)
{
int blkIdx = blockIdx.y * gridDim.x + blockIdx.x;
int i = blkIdx*blockDim.x + threadIdx.x;
int absCoeff = (blockIdx.x * blockDim.x) + threadIdx.x;
int iNFFT = (gridDim.x * blockDim.x);
int j = 0;
int iAbsIdx = 0;
int coeffIdx = 0;
float2 f2PFBOut = make_float2(0.0, 0.0);
char2 c2Data = make_char2(0, 0);
for (j = 0; j < pfbParams.taps; ++j)
{
/* calculate the absolute index */
iAbsIdx = (j * iNFFT) + i;
coeffIdx = (j * iNFFT) + absCoeff;
/* get the address of the block */
c2Data = pc2Data[iAbsIdx];
f2PFBOut.x += (float) c2Data.x * pfPFBCoeff[coeffIdx];
f2PFBOut.y += (float) c2Data.y * pfPFBCoeff[coeffIdx];
}
pf2FFTIn[i] = f2PFBOut;
return;
}
// When PFB disabled just perform FFT.
__global__ void CopyDataForFFT(char2 *pc2Data, float2 *pf2FFTIn)
{
int blkIdx = blockIdx.y * gridDim.x + blockIdx.x;
int i = blkIdx*blockDim.x + threadIdx.x;
pf2FFTIn[i].x = (float) pc2Data[i].x;
pf2FFTIn[i].y = (float) pc2Data[i].y;
return;
}
// prepares for the next PFB.
__global__ void saveData(char2* dataIn, char2* dataOut){
int i = blockIdx.y*(gridDim.x*blockDim.x) + blockIdx.x*blockDim.x + threadIdx.x;
dataOut[i] = dataIn[i];
return;
}
#ifndef __KERNELS_H__
#define __KERNELS_H__
#include <cuda.h>
#include <cufft.h>
// stuct of parameters for PFB. Values indicate default values.
#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", (char*)"\0", 1};
// plot 1 mean to hide the plot of the filter before continuing.
typedef struct {
int samples;
int nfft;
int taps;
int coarse_channels;
int fine_channels;
int elements;
int subbands;
int select;
char* window;
char* dataType;
char* coeffPath;
int plot;
} params;
__global__ void PFB_kernel(char2* pc2Data, float2* pf2FFTIn, float* pfPFBCoeff, params pfbParams);
__global__ void map(char* dataIn, char2* dataOut, int channelSelect, params pfbParams);
__global__ void CopyDataForFFT(char2* pc2Data, float2* pf2FFTIn);
__global__ void saveData(char2* dataIn, char2* dataOut);
#endif
...@@ -13,6 +13,7 @@ char2* g_pc2DataRead_d = NULL; ...@@ -13,6 +13,7 @@ char2* g_pc2DataRead_d = NULL;
float2* g_pf2FFTIn_d = NULL; float2* g_pf2FFTIn_d = NULL;
float2* g_pf2FFTOut_d = NULL; float2* g_pf2FFTOut_d = NULL;
float2* g_pf2DiscShift_d = NULL;
float *g_pfPFBCoeff = NULL; float *g_pfPFBCoeff = NULL;
float *g_pfPFBCoeff_d = NULL; float *g_pfPFBCoeff_d = NULL;
...@@ -85,6 +86,10 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) { ...@@ -85,6 +86,10 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
while(!g_IsProcDone) { while(!g_IsProcDone) {
//FFT //FFT
iRet = doFFT(); iRet = doFFT();
// New Code ///////////////////////////////////
Discard_Shift_kernel<<<g_dimGPFB, g_dimBPFB>>>(g_pf2FFTOut_d, g_pf2DiscShift_d);
/////////////////////////////////////////////////
if(iRet != EXIT_SUCCESS) { if(iRet != EXIT_SUCCESS) {
(void) fprintf(stderr, "ERROR: FFT failed\n"); (void) fprintf(stderr, "ERROR: FFT failed\n");
cleanUp(); cleanUp();
...@@ -96,6 +101,7 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) { ...@@ -96,6 +101,7 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
// step input and output buffers. // step input and output buffers.
g_pf2FFTIn_d += g_iNumSubBands * g_iNFFT; g_pf2FFTIn_d += g_iNumSubBands * g_iNFFT;
g_pf2FFTOut_d += g_iNumSubBands * g_iNFFT; g_pf2FFTOut_d += g_iNumSubBands * g_iNFFT;
g_pf2DiscShift_d += g_iNumSubBands * (g_iNFFT/2);
lProcData += g_iNumSubBands * g_iNFFT; lProcData += g_iNumSubBands * g_iNFFT;
if(lProcData >= ltotData - NUM_TAPS*g_iNumSubBands*g_iNFFT){ // >= process 117 ffts leaving 256 time samples, > process 118 ffts leaving 224 time samples. if(lProcData >= ltotData - NUM_TAPS*g_iNumSubBands*g_iNFFT){ // >= process 117 ffts leaving 256 time samples, > process 118 ffts leaving 224 time samples.
...@@ -114,10 +120,14 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) { ...@@ -114,10 +120,14 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
//wind back in/out ptrs - should put in another pointer as a process read ptr instead of updating the global ptr. //wind back in/out ptrs - should put in another pointer as a process read ptr instead of updating the global ptr.
g_pf2FFTOut_d = g_pf2FFTOut_d - countFFT*g_iNumSubBands*g_iNFFT; g_pf2FFTOut_d = g_pf2FFTOut_d - countFFT*g_iNumSubBands*g_iNFFT;
g_pf2FFTIn_d = g_pf2FFTIn_d -countFFT*g_iNumSubBands*g_iNFFT; g_pf2FFTIn_d = g_pf2FFTIn_d -countFFT*g_iNumSubBands*g_iNFFT;
g_pf2DiscShift_d = g_pf2DiscShift_d - countFFT*g_iNumSubBands*(g_iNFFT/2);
int outDataSize = countFFT * g_iNumSubBands * g_iNFFT; //int outDataSize = countFFT * g_iNumSubBands * g_iNFFT;
// Modified variable outDataSize 1/2 of g_iNFFT due to the discard of half the channels //////
int outDataSize = countFFT * g_iNumSubBands * (g_iNFFT/2);
//CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, fftOutPtr, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost)); //CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, fftOutPtr, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, g_pf2FFTOut_d, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost)); CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, g_pf2DiscShift_d, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
////////////////////////////////////////////////////////////////////////////////////////////
return iRet; return iRet;
...@@ -302,11 +312,20 @@ int initPFB(int iCudaDevice, params pfbParams){ ...@@ -302,11 +312,20 @@ int initPFB(int iCudaDevice, params pfbParams){
//int sizeDataBlock_in = g_iNumSubBands * g_iNFFT * sizeof(float2); //int sizeDataBlock_in = g_iNumSubBands * g_iNFFT * sizeof(float2);
int sizeDataBlock_in = pfbParams.samples*g_iNumSubBands * sizeof(float2); int sizeDataBlock_in = pfbParams.samples*g_iNumSubBands * sizeof(float2);
int sizeTotalDataBlock_out = pfbParams.samples*g_iNumSubBands * sizeof(float2); // output fft array same size as output data for convinence the full size is not used. In the pfb function the output data will be the fft counter times block amount in the fft. int sizeTotalDataBlock_out = pfbParams.samples*g_iNumSubBands * sizeof(float2); // output fft array same size as output data for convinence the full size is not used. In the pfb function the output data will be the fft counter times block amount in the fft.
// New variables ///////////////////////////////////
int g_iNwindows = 125; // with 8000 time samples, but with 4032 samples, Nwindows = 63
int sizeTotalDataBlock_disc = (g_iNFFT/2)*g_iNwindows*g_iNumSubBands * sizeof(float2); // Not sure about the value of this yet.
//////////////////////////////////////////////////
CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2FFTIn_d, sizeDataBlock_in)); CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2FFTIn_d, sizeDataBlock_in));
CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2FFTOut_d, sizeTotalDataBlock_out)); // goal will be to update the output ptr each time it fires. CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2FFTOut_d, sizeTotalDataBlock_out)); // goal will be to update the output ptr each time it fires.
// New Code ////////////////////////////////////////////////////////
CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2DiscShift_d, sizeTotalDataBlock_disc)); // Discard and shift array
////////////////////////////////////////////////////////////////////
CUDASafeCallWithCleanUp(cudaMemset((void *) g_pf2FFTIn_d, 0, sizeDataBlock_in)); CUDASafeCallWithCleanUp(cudaMemset((void *) g_pf2FFTIn_d, 0, sizeDataBlock_in));
CUDASafeCallWithCleanUp(cudaMemset((void *) g_pf2FFTOut_d, 0, sizeTotalDataBlock_out)); CUDASafeCallWithCleanUp(cudaMemset((void *) g_pf2FFTOut_d, 0, sizeTotalDataBlock_out));
// New Code ////////////////////////////////////////////////////////////
CUDASafeCallWithCleanUp(cudaMemset((void *) g_pf2DiscShift_d, 0, sizeTotalDataBlock_disc));
////////////////////////////////////////////////////////////////////////
// set kernel parameters // set kernel parameters
(void) fprintf(stdout, "\tSetting kernel parameters...\n"); (void) fprintf(stdout, "\tSetting kernel parameters...\n");
...@@ -320,8 +339,11 @@ int initPFB(int iCudaDevice, params pfbParams){ ...@@ -320,8 +339,11 @@ int initPFB(int iCudaDevice, params pfbParams){
g_dimGPFB.x = (g_iNumSubBands * g_iNFFT) / g_dimBPFB.x; g_dimGPFB.x = (g_iNumSubBands * g_iNFFT) / g_dimBPFB.x;
g_dimGCopy.x = (g_iNumSubBands * g_iNFFT) / g_dimBCopy.x; g_dimGCopy.x = (g_iNumSubBands * g_iNFFT) / g_dimBCopy.x;
g_dimGPFB.y = 125; g_dimGPFB.y = 125; // with 8000 time samples, but with 4032 samples, g_dimBPFB.y = 63
g_dimGCopy.y = 125; g_dimGCopy.y = 125; // same as g_dimGPFB.y
// g_dimBPFB.y = 63; // No. of windows given 64 point FFTs and 4000 time samples per block. Since 4000/64=62.5, I need to see whether 32 samples should be discarded.
// g_dimGPFB.y = 63; // No. of windows given 64 point FFTs and 4000 time samples per block
// map kernel params // map kernel params
mapGSize.x = pfbParams.samples; mapGSize.x = pfbParams.samples;
...@@ -444,6 +466,10 @@ void cleanUp() { ...@@ -444,6 +466,10 @@ void cleanUp() {
(void) cudaFree(g_pf2FFTOut_d); (void) cudaFree(g_pf2FFTOut_d);