Merge pull request #1 from mitchburnett/master

Integration of pfb codes

Merge pull request #1 from mitchburnett/master
Integration of pfb codes
34fca3c8 · Mitch Burnett · GitHub · 8a5cfafa · 529276b0 · 34fca3c8
Commit 34fca3c8 authored 8 years ago by Mitch Burnett Committed by GitHub 8 years ago
Hide whitespace changes
Inline Side-by-side

Showing

with 1695 additions and 7 deletions
+1695 -7
--- a/install
+++ b/install
@@ -9,12 +9,13 @@ else
 fi

 # Change this to match your destination directory
-prefix=/users/$USER/fresh_hash
+prefix=~/fresh_hash

 # Set library source directories
 HASHPIPE_SRC=$FLAG_DIR/lib/hashpipe/src
 XGPU_SRC=$FLAG_DIR/lib/xGPU/src
 BEAM_SRC=$FLAG_DIR/lib/beamformer/src
+PFB_SRC=$FLAG_DIR/lib/pfb/src
 POWER_SRC=$FLAG_DIR/lib/flagPower/src

 # Compile the hashpipe library
@@ -59,13 +60,23 @@ make install prefix=$prefix
 echo "========================================"
 echo 

+# Compile the pfb library
+echo "Installing flagpfb"
+echo "========================================"
+echo "Navigating to $PFB_SRC"
+cd $PFB_SRC
+make
+make install prefix=$prefix
+echo "========================================"
+echo
+
 # Compile the flag HASHPIPE plugins
 echo "Installing the FLAG plugins"
 echo "========================================"
 echo "Navigating to $FLAG_DIR/src"
 cd $FLAG_DIR/src
 autoreconf -is
-./configure --prefix=$prefix --with-hashpipe=$prefix --with-xgpu=$prefix --with-xgpufrb=$prefix --with-xgpupfb=$prefix --with-flagbeamformer=$prefix --with-flagpower=$prefix --silent
+./configure --prefix=$prefix --with-hashpipe=$prefix --with-xgpu=$prefix --with-xgpufrb=$prefix --with-xgpupfb=$prefix --with-flagpfb=$prefix --with-flagbeamformer=$prefix --with-flagpower=$prefix --silent
 make
 make install
 echo "========================================"

--- a/lib/pfb/python/flag_gen_coeff.py
+++ b/lib/pfb/python/flag_gen_coeff.py
+#!/usr/bin/python
+
+# flag_gen_coeff.py
+#   Generate PFB filter coefficients for FLAG. The
+#   filter coefficients array contains duplicates for optimised reading
+#   from the GPU.
+
+import sys
+import getopt
+import math
+import numpy
+import scipy.signal as sp
+import matplotlib.pyplot as plotter
+
+# function definitions
+def PrintUsage(ProgName):
+    "Prints usage information."
+    print "Usage: " + ProgName + " [options]"
+    print "    -h  --help                 Display this usage information"
+    print "    -n  --nfft <value>         Number of points in FFT"
+    print "    -t  --taps <value>         Number of taps in PFB"
+    print "    -w  --window <value>       Window to apply i.e \"cheb-win\", default: rect."
+    print "    -b  --sub-bands <value>    Number of sub-bands in data"
+    print "    -d  --data-type <value>    Data type - \"float\" or "          \
+          + "\"signedchar\""
+    print "    -p  --no-plot              Do not plot coefficients"
+    print "    Window types:"
+    print "       rect"
+    print "       hanning"
+    print "       cheb-win"
+    return
+
+def genCoeff(window, M):
+
+    # CHEBWIN
+    if window == "cheb-win":
+        PFBCoeff = sp.chebwin(M, at=-30)
+
+    # HANNING WINDOW
+    elif window == "hanning":
+        X = numpy.array([(float(i) / NFFT) - (float(NTaps) / 2) for i in range(M)])
+        PFBCoeff = numpy.sinc(X) * numpy.hanning(M)
+
+    else:
+        PFBCoeff = numpy.ones(M)
+    return PFBCoeff
+
+# default values
+NFFT = 32768                # number of points in FFT
+NTaps = 8                   # number of taps in PFB
+Window = "rect"             # rectangular window
+NSubBands = 1               # number of sub-bands in data
+DataType = "signedchar"     # data type - "float" or "signedchar"
+Plot = True                 # plot flag
+
+# get the command line arguments
+ProgName = sys.argv[0]
+OptsShort = "hn:t:w:b:d:p"
+OptsLong = ["help", "nfft=", "taps=","window=", "sub-bands=", "data-type=", "no-plot"]
+
+# check if the minimum expected number of arguments has been passed
+# to the program
+if (1 == len(sys.argv)):
+    sys.stderr.write("ERROR: No arguments passed to the program!\n")
+    PrintUsage(ProgName)
+    sys.exit(1)
+
+# get the arguments using the getopt module
+try:
+    (Opts, Args) = getopt.getopt(sys.argv[1:], OptsShort, OptsLong)
+except getopt.GetoptError, ErrMsg:
+    # print usage information and exit
+    sys.stderr.write("ERROR: " + str(ErrMsg) + "!\n")
+    PrintUsage(ProgName)
+    sys.exit(1)
+
+# parse the arguments
+for o, a in Opts:
+    if o in ("-h", "--help"):
+        PrintUsage(ProgName)
+        sys.exit()
+    elif o in ("-n", "--nfft"):
+        NFFT = int(a)
+    elif o in ("-t", "--taps"):
+        NTaps = int(a)
+    elif o in ("-w", "--window"):
+        Window = a
+    elif o in ("-b", "--sub-bands"):
+        NSubBands = int(a)
+    elif o in ("-d", "--data-type"):
+        DataType = a
+    elif o in ("-p", "--no-plot"):
+        Plot = False
+    else:
+        PrintUsage(ProgName)
+        sys.exit(1)
+
+M = NTaps * NFFT
+PFBCoeff = genCoeff(Window, M)
+
+# create conversion map
+if ("signedchar" == DataType):
+    Map = numpy.zeros(256, numpy.float32)
+    for i in range(0, 128):
+        Map[i] = float(i) / 128
+    for i in range(128, 256):
+        Map[i] = - (float(256 -i) / 128)
+
+# 32-bit (float) coefficients
+PFBCoeffFloat32 = numpy.zeros(M * NSubBands, numpy.float32)
+# 8-bit (signedchar) coefficients
+if ("signedchar" == DataType):
+    PFBCoeffInt8 = numpy.zeros(M * NSubBands, numpy.int8)
+k = 0
+for i in range(len(PFBCoeff)):
+    Coeff = float(PFBCoeff[i])
+    if ("signedchar" == DataType):
+        for j in range(256):
+            # if (math.fabs(Coeff - Map[j]) <= (0.0078125 / 2)):
+            if (math.fabs(Coeff - Map[j]) <= 0.0078125):
+                for m in range(NSubBands):
+                    PFBCoeffInt8[k + m] = j
+                break
+    elif ("float" == DataType):
+        for m in range(NSubBands):
+            PFBCoeffFloat32[k + m] = Coeff
+    else:
+        # print usage information and exit
+        sys.stderr.write("ERROR: Invalid data type!\n")
+        PrintUsage(ProgName)
+        sys.exit(1)
+    k = k + NSubBands
+
+
+# write the coefficients to disk and also plot it
+FileCoeff = open("coeff_"                                                     \
+                  + DataType + "_"                                            \
+                  + str(NTaps) + "_"                                          \
+                  + str(NFFT) + "_"                                           \
+                  + str(NSubBands) + ".dat",                                  \
+                 "wb")
+if ("signedchar" == DataType):
+    FileCoeff.write(PFBCoeffInt8)
+    # plot the coefficients
+    if (Plot):
+        plotter.plot(PFBCoeffInt8)
+else:
+    FileCoeff.write(PFBCoeffFloat32)
+    # plot the coefficients
+    if (Plot):
+        plotter.plot(PFBCoeffFloat32)
+
+FileCoeff.close()
+
+if (Plot):
+    plotter.show()
+
--- a/lib/pfb/src/Makefile
+++ b/lib/pfb/src/Makefile
+-include Makefile.local
+
+prefix ?= /usr/local
+libdir ?= $(prefix)/lib
+includedir ?= $(prefix)/include
+bindir ?= $(prefix)/bin
+CUDA_DIR ?= $(prefix)/cuda
+
+#CUDA_DIR ?= /usr/local/cuda
+#prefix ?= tmp
+#libdir ?= $(prefix)/lib
+#includedir ?= $(prefix)/include
+#bindir ?= $(prefix)/bin
+#CUDA_DIR ?= /Developer/NVIDIA/CUDA-6.5
+
+INCLUDES = -I$(CUDA_DIR)/include
+CUDA_LIBDIR = -L$(CUDA_DIR)/lib
+#INCLUDES = -I/Developer/NVIDIA/CUDA-6.5/include
+#CUDA_LIBDIR = -L/Developer/NVIDIA/CUDA-6.5/lib
+
+CLIBDIR = -L/usr/local/lib
+CU_LIB  = -lcufft -lcudart
+C_LIB = -lpython2.7 -ldl -lutil
+
+LINKER = -Xlinker -export-dynamic
+
+ifneq ($(strip $(OSTYPE)),osx)
+CUDA_LIBDIR ?= $(CUDA_DIR)/lib64
+else
+CUDA_LIBDIR ?= $(CUDA_DIR)/lib
+endif
+
+NVCC = $(CUDA_DIR)/bin/nvcc
+
+all: libflagpfb.so
+
+libflagpfb.so: pfb.o kernels.o
+	$(NVCC) $(INCLUDS) -Xcompiler -fPIC -Xcompiler -D_REENTRANT -O3 pfb.o kernels.o -o libflagpfb.so $(CUDA_LIBDIR) $(CU_LIB) $(CLIBDIR) $(C_LIB) $(LINKER) --shared
+
+pfb.o: kernels.cu kernels.h
+	$(NVCC) $(INCLUDES) $(CUDA_LIBDIR) $(CU_LIB) -c -o pfb.o -Xcompiler -fPIC -Xcompiler -D_REENTRANT -O3 pfb.cu
+
+kernels.o: kernels.cu kernels.h
+	$(NVCC) $(INCLUDES) $(CUDA_LIBDIR) $(CU_LIB) -c -o kernels.o kernels.cu -Xcompiler -fPIC -Xcompiler -D_REENTRANT -O3
+
+clean:
+	rm -f *.o
+	rm -f *.so
+
+install: kernels.o pfb.o libflagpfb.so
+	mkdir -p $(includedir)
+	cp pfb.h $(includedir)
+	cp kernels.h $(includedir)
+	mkdir -p $(libdir)
+	cp libflagpfb.so $(libdir)
+	#mkdir -p $(bindir)
+	#cp cublas_main $(bindir)
--- a/lib/pfb/src/coeff_float_8_32_320.dat
+++ b/lib/pfb/src/coeff_float_8_32_320.dat
--- a/lib/pfb/src/kernels.cu
+++ b/lib/pfb/src/kernels.cu
+#ifdef __cplusplus
+extern "C" {
+#include "kernels.h"
+}
+#endif
+
+__global__ void map(char* dataIn,
+                    char2* dataOut,
+                    int channelSelect,
+                    params pfbParams) 
+{
+
+    // select the channel range
+    int channelMin = pfbParams.fine_channels*channelSelect;
+    
+    int absIdx = 2 * blockDim.y*(blockIdx.x*pfbParams.coarse_channels + (channelMin+blockIdx.y)) + 2 * threadIdx.y;  // times 2 because we are mapping a sequence of values to char2 array.
+    int mapIdx = blockDim.y*(blockIdx.x*gridDim.y + blockIdx.y) + threadIdx.y;
+
+    dataOut[mapIdx].x = dataIn[absIdx];
+    dataOut[mapIdx].y = dataIn[absIdx+1];
+    return;
+}
+
+/* prepare data for PFB */
+__global__ void PFB_kernel(char2* pc2Data,
+                      float2* pf2FFTIn,
+                      float* pfPFBCoeff,
+                      params pfbParams)
+{
+    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int iNFFT = (gridDim.x * blockDim.x);
+    int j = 0;
+    int iAbsIdx = 0;
+    float2 f2PFBOut = make_float2(0.0, 0.0);
+    char2 c2Data = make_char2(0, 0);
+
+    for (j = 0; j < pfbParams.taps; ++j)
+    {
+        /* calculate the absolute index */
+        iAbsIdx = (j * iNFFT) + i;
+
+        /* get the address of the block */
+        c2Data = pc2Data[iAbsIdx];
+        
+        f2PFBOut.x += (float) c2Data.x * pfPFBCoeff[iAbsIdx];
+        f2PFBOut.y += (float) c2Data.y * pfPFBCoeff[iAbsIdx];
+    }
+
+    pf2FFTIn[i] = f2PFBOut;
+
+    return;
+}
+
+__global__ void CopyDataForFFT(char2 *pc2Data, float2 *pf2FFTIn)
+{
+    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    pf2FFTIn[i].x = (float) pc2Data[i].x;
+    pf2FFTIn[i].y = (float) pc2Data[i].y;
+
+    return;
+}
+
+// prepares for the next PFB.
+__global__ void saveData(char2* dataIn, char2* dataOut){
+	int i = blockIdx.y*(gridDim.x*blockDim.x) + blockIdx.x*blockDim.x + threadIdx.x;
+
+	dataOut[i] = dataIn[i];
+
+	return;
+}
\ No newline at end of file
--- a/lib/pfb/src/kernels.h
+++ b/lib/pfb/src/kernels.h
+#ifndef __KERNELS_H__
+#define __KERNELS_H__
+
+#include <cuda.h>
+#include <cufft.h>
+
+// stuct of parameters for PFB. Values indicate default values.
+#define DEFAULT_PFB_PARAMS {4000, 32, 8, 25, 5, 64, 320, 0, (char*)"hanning\0", (char*)"float\0", 1};
+// plot 1 mean to hide the plot of the filter before continuing.
+typedef struct {
+	int samples;
+	int nfft;
+	int taps;
+	int coarse_channels;
+	int fine_channels;
+	int elements;
+	int subbands;
+	int select;
+	char* window;
+	char* dataType;
+	int plot;
+} params;
+
+
+__global__ void PFB_kernel(char2* pc2Data, float2* pf2FFTIn, float* pfPFBCoeff, params pfbParams);
+__global__ void map(char* dataIn, char2* dataOut, int channelSelect, params pfbParams);
+__global__ void CopyDataForFFT(char2* pc2Data, float2* pf2FFTIn);
+__global__ void saveData(char2* dataIn, char2* dataOut);
+
+#endif
\ No newline at end of file
--- a/lib/pfb/src/pfb.cu
+++ b/lib/pfb/src/pfb.cu
+#ifdef __cplusplus
+extern "C" {
+#include "pfb.h"
+}
+#endif
+
+// data ptrs
+char2* g_pc2InBuf = NULL;
+char2* g_pc2InBufRead = NULL;
+
+char2* g_pc2Data_d = NULL;
+char2* g_pc2DataRead_d = NULL;
+
+float2* g_pf2FFTIn_d = NULL;
+float2* g_pf2FFTOut_d = NULL;
+
+float *g_pfPFBCoeff = NULL;
+float *g_pfPFBCoeff_d = NULL;
+char* g_pcInputData_d = NULL;
+
+// pfb params
+int g_iNFFT = DEF_LEN_SPEC;
+int g_iNTaps = NUM_TAPS;
+int g_iNumSubBands = PFB_CHANNELS * DEF_NUM_ELEMENTS;
+
+// process flags
+int g_IsDataReadDone = FALSE;
+int g_IsProcDone = FALSE;
+
+// size vars
+int g_iSizeFile = 0;
+int g_iReadCount = 0;
+int g_iSizeRead = DEF_SIZE_READ;
+int g_iFileCoeff = 0;
+char g_acFileCoeff[256] = {0};
+
+// GPU params
+dim3 g_dimBPFB(1, 1, 1);
+dim3 g_dimGPFB(1, 1);
+dim3 g_dimBCopy(1, 1, 1);
+dim3 g_dimGCopy(1, 1);
+dim3 mapGSize(1,1,1);
+dim3 mapBSize(1,1,1);
+dim3 saveGSize(1, 1, 1 ); // (5, 256, 1)
+dim3 saveBSize(1, 1, 1); // (64, 1, 1)
+
+cufftHandle g_stPlan = {0};
+int g_iMaxThreadsPerBlock = 0;
+int g_iMaxPhysThreads = 0;
+
+int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
+
+	//process variables
+	int iRet = EXIT_SUCCESS;
+	int countPFB = 0; // count number of times pfb fires.
+	int countCpyFFT = 0;
+	int countFFT = 0; // count number of FFT's computed.
+	long lProcData = 0; // count how much data processed
+	long ltotData = pfbParams.samples * pfbParams.fine_channels * pfbParams.elements + pfbParams.fine_channels*pfbParams.elements*pfbParams.nfft*pfbParams.taps; // total amount of data to proc (includes the padding for the saved filter state.)
+	int start = pfbParams.fine_channels*pfbParams.elements*(pfbParams.nfft*pfbParams.taps); // starting point to copy over the map data.
+	// copy data to device
+	CUDASafeCallWithCleanUp(cudaMemcpy(g_pcInputData_d, inputData_h, g_iSizeRead, cudaMemcpyHostToDevice)); //g_iSizeRead = samples*coarse_channels*elements*(2*sizeof(char));
+
+	// map - extract channel data from full data stream and load into buffer.
+	map<<<mapGSize, mapBSize>>>(g_pcInputData_d, &g_pc2Data_d[start], pfbParams.select, pfbParams);
+	CUDASafeCallWithCleanUp(cudaGetLastError());
+	CUDASafeCallWithCleanUp(cudaThreadSynchronize());
+
+	// Begin PFB
+	g_pc2DataRead_d = g_pc2Data_d; // p_pc2Data_d contains all the data. DataRead will update with each pass through the PFB.
+	int pfb_on = 1; // Enable pfb flag. Extendable.
+	while(!g_IsProcDone){
+
+		if(pfb_on) {
+			//PFB
+			PFB_kernel<<<g_dimGPFB, g_dimBPFB>>>(g_pc2DataRead_d, g_pf2FFTIn_d, g_pfPFBCoeff_d, pfbParams);
+			CUDASafeCallWithCleanUp(cudaGetLastError());
+			CUDASafeCallWithCleanUp(cudaThreadSynchronize());
+
+			//update data read pointer
+			g_pc2DataRead_d += g_iNumSubBands * g_iNFFT;
+			++countPFB;
+		} else {
+			CopyDataForFFT<<<g_dimGPFB, g_dimBPFB>>>(g_pc2DataRead_d, g_pf2FFTIn_d);
+
+			g_pc2DataRead_d += g_iNumSubBands * g_iNFFT;
+			++countCpyFFT;
+		}
+
+		//FFT
+		iRet = doFFT();
+		if(iRet != EXIT_SUCCESS) {
+			(void) fprintf(stderr, "ERROR: FFT failed\n");
+			cleanUp();
+			return EXIT_FAILURE;
+		}
+		CUDASafeCallWithCleanUp(cudaGetLastError());
+		++countFFT;
+
+		//update output fft pointer.
+		g_pf2FFTOut_d += g_iNumSubBands * g_iNFFT;
+
+		//update proc data
+		lProcData += g_iNumSubBands * g_iNFFT;
+		if(lProcData >= ltotData - NUM_TAPS*g_iNumSubBands*g_iNFFT){ // >= process 117 ffts leaving 256 time samples, > process 118 ffts leaving 224 time samples.
+			(void) fprintf(stdout, "\nINFO: Processed finished!\n");
+			(void) fprintf(stdout, "\tCounters--PFB:%d FFT:%d\n",countPFB, countFFT);
+			(void) fprintf(stdout, "\tData process by the numbers:\n \t\tProcessed:%ld (Samples) \n \t\tTo Process:%ld (Samples)\n\n",lProcData, ltotData);
+			g_IsProcDone = TRUE;
+
+			// prepare next filter
+			saveData<<<saveGSize, saveBSize>>>(g_pc2DataRead_d, g_pc2Data_d);
+			CUDASafeCallWithCleanUp(cudaGetLastError());
+
+			// copy back to host.
+			//wind back out ptr - should put in another pointer as a process read ptr.
+			int outDataSize = countFFT * g_iNumSubBands * g_iNFFT * sizeof(cufftComplex);
+			g_pf2FFTOut_d = g_pf2FFTOut_d - countFFT*g_iNumSubBands*g_iNFFT;
+			//fprintf(stdout, "Copyting back: %d\n", outDataSize);
+			CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, g_pf2FFTOut_d, outDataSize, cudaMemcpyDeviceToHost));
+		}
+
+	}
+
+	return iRet;
+}
+
+// return true or false upon successful setup.
+int initPFB(int iCudaDevice, params pfbParams){
+
+	int iRet = EXIT_SUCCESS;
+
+	// set pfb params from input parameters.
+	pfbParams.subbands = pfbParams.elements*pfbParams.fine_channels;
+
+	g_iNFFT = pfbParams.nfft;
+	g_iNTaps = pfbParams.taps;
+	g_iNumSubBands = pfbParams.subbands; // equal to elements*fine_channels. (The fine channels are the channels processed.)
+
+	g_iSizeRead = pfbParams.samples*pfbParams.coarse_channels*pfbParams.elements*(2*sizeof(char));
+
+	int iDevCount = 0;
+	cudaDeviceProp stDevProp = {0};
+	cufftResult iCUFFTRet = CUFFT_SUCCESS;
+
+	int i = 0;
+
+	//Register signal handlers?
+
+	/********************************************/
+	/* Look for eligable Cuda Device and select */
+	/********************************************/
+	(void) fprintf(stdout, "Querying CUDA devices.\n");
+
+	(void) cudaGetDeviceCount(&iDevCount);
+	if (0 == iDevCount) {
+		(void) fprintf(stderr, "ERROR: No CUDA-capable device found!\n");
+		return EXIT_FAILURE;
+	}
+	// Look for requested device (if applicable)
+	if (iCudaDevice >= iDevCount) {
+		(void) fprintf(stderr,
+					   "ERROR: Requested device %d no found in present %d device list.\n",
+					   iCudaDevice,
+					   iDevCount);
+		return EXIT_FAILURE;
+	}
+	// Query devices and setup selected device.
+	for(i = 0; i < iDevCount; i++) {
+		CUDASafeCallWithCleanUp(cudaGetDeviceProperties(&stDevProp, i));
+		printf("\tDevice %d: %s, Compute Capability %d.%d, %d physical threads %s\n",
+				i,
+				stDevProp.name, stDevProp.major, stDevProp.minor,
+				stDevProp.multiProcessorCount * stDevProp.maxThreadsPerMultiProcessor,
+				(iCudaDevice == i) ? "<<SELECTED>>" : "");
+	}
+	CUDASafeCallWithCleanUp(cudaSetDevice(iCudaDevice));
+
+	// Setup block and thread paramters
+	CUDASafeCallWithCleanUp(cudaGetDeviceProperties(&stDevProp, 0));
+	g_iMaxThreadsPerBlock = stDevProp.maxThreadsPerBlock;
+	g_iMaxPhysThreads = stDevProp.multiProcessorCount * stDevProp.maxThreadsPerMultiProcessor;
+
+	// Check if valid operation lengths. i.e. The input buffer is long enough (should this be done here or elsewhere?)
+
+	// Set malloc size - lTotCUDAMalloc is used only to calculate the total amount of memory not used for the allocation.
+	size_t cudaMem_total, cudaMem_available;
+	size_t lTotCUDAMalloc = 0;
+	cudaMemGetInfo(&cudaMem_available, &cudaMem_total);
+	lTotCUDAMalloc += g_iSizeRead; // size   data
+	lTotCUDAMalloc += (g_iNumSubBands * g_iNFFT * sizeof(float(2))); // size of FFT input array This should be different since our data is unsigned char?
+	lTotCUDAMalloc += (g_iNumSubBands * pfbParams.samples * sizeof(float(2))); // size of FFT output array
+	lTotCUDAMalloc += (g_iNumSubBands * g_iNFFT * sizeof(float)); 	// size of PFB Coefficients
+	// Check CUDA device can handle the memory request
+	if(lTotCUDAMalloc > stDevProp.totalGlobalMem) {
+		(void) fprintf(stderr,
+						"ERROR: Total memory requested on GPU is %g MB of %g possible MB (Total Global Memory: %g MB).\n"
+						"\t**** Memory breakdown *****\n"
+						"\tInput data buffer:\t%g MB\n"
+						"\tFFT in array:\t%g MB\n"
+						"\tFFT out array:\t%g MB\n"
+						"\tPFB Coefficients: %f KB\n",
+						((float) lTotCUDAMalloc) / (1024*1024),
+						((float) cudaMem_available) / (1024*1024), //stDevProp.totalGlobalMem
+						((float) cudaMem_total) / (1024*1024),
+						((float) g_iSizeRead) / (1024 * 1024),
+						((float) g_iNumSubBands * g_iNFFT * sizeof(float2)) / (1024 * 1024),
+						((float) g_iNumSubBands * pfbParams.samples * sizeof(float2)) / (1024 * 1024),
+						((float) g_iNumSubBands * g_iNFFT * sizeof(float)));
+		return EXIT_FAILURE;
+	}
+	
+	// print memory usage report.
+	(void) fprintf(stdout,
+					"INFO: Total memory requested on GPU is %g MB of %g possible MB (Total Global Memory: %g MB).\n"
+					"\t**** Memory breakdown ****\n"
+					"\tInput data buffer:\t%g MB\n"
+					"\tFFT in array:\t%g MB\n"
+					"\tFFT out array:\t%g MB\n"
+					"\tPFB Coefficients: %f KB\n",
+					((float) lTotCUDAMalloc) / (1024*1024),
+					((float) cudaMem_available) / (1024*1024), //stDevProp.totalGlobalMem
+					((float) cudaMem_total) / (1024*1024),
+					((float) g_iSizeRead) / (1024 * 1024),
+					((float) g_iNumSubBands * g_iNFFT * sizeof(float2)) / (1024 * 1024),
+					((float) g_iNumSubBands * pfbParams.samples * sizeof(float2)) / (1024 * 1024),
+					((float) g_iNumSubBands * g_iNFFT * sizeof(float)));
+
+	/*************************/
+	/* Load PFB coefficients */
+	/*************************/
+	(void) fprintf(stdout, "\nSetting up PFB filter coefficients...\n");
+	int sizePFB = g_iNumSubBands * g_iNTaps * g_iNFFT * sizeof(float);
+
+	// Allocate memory for PFB coefficients to be read in
+	g_pfPFBCoeff = (float *) malloc(sizePFB); // allocate the memory needed for the size of one pfb pass through
+	if(NULL == g_pfPFBCoeff) {
+		(void) fprintf(stderr, "ERROR: Memory allocation for the PFB coefficients failed. %s\n",
+								strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	char relPath[256] = "/home/mburnett/GitHub/flag_gpu/lib/pfb/src/";
+	// Read filter coefficients from file
+	(void) fprintf(stdout, "\tReading in coefficients...\n");
+	(void) sprintf(g_acFileCoeff,
+				   "%s%s_%s_%d_%d_%d%s",
+				   relPath,
+				   FILE_COEFF_PREFIX,
+				   FILE_COEFF_DATATYPE,
+				   g_iNTaps,
+				   g_iNFFT,
+				   g_iNumSubBands,
+				   FILE_COEFF_SUFFIX);
+
+	g_iFileCoeff = open(g_acFileCoeff, O_RDONLY);
+	if(g_iFileCoeff < EXIT_SUCCESS) {
+		(void) fprintf(stderr, "ERROR: Failed to open coefficient file %s. %s\n",
+					  			g_acFileCoeff,
+					  			strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	iRet = read(g_iFileCoeff, g_pfPFBCoeff, sizePFB);
+	if(iRet != sizePFB) {
+		(void) fprintf(stderr, "ERROR: Failed reading filter coefficients. %s\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+	
+	(void) close(g_iFileCoeff);
+
+	/********************************************/
+	/* Allocate memory and setup on CUDA device */
+	/********************************************/
+	(void) fprintf(stdout, "\nSetting up CUDA device.\n");
+
+	//malloc map array and copy data to device
+	(void) fprintf(stdout, "\tAllocating memory for MAP...\n");
+	// creates a size that is paddedd in the front to store the filter state. Worth one 256 (nfft*taps) time sample amount of data
+	int sizeMap = pfbParams.samples * pfbParams.fine_channels * pfbParams.elements * (2*sizeof(char)) + pfbParams.fine_channels*pfbParams.elements*pfbParams.nfft*pfbParams.taps * (2*sizeof(char));
+	CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pcInputData_d, g_iSizeRead));
+	CUDASafeCallWithCleanUp(cudaMemset((void *)   g_pcInputData_d, 0, g_iSizeRead));
+	CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pc2Data_d, sizeMap));
+	CUDASafeCallWithCleanUp(cudaMemset((void *)   g_pc2Data_d, 0, sizeMap));
+
+	// allocate memory for pfb coefficients on GPU
+	(void) fprintf(stdout, "\tAllocating memory for PFB...\n");
+	CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pfPFBCoeff_d, sizePFB));
+
+	// copy coeff to device
+	(void) fprintf(stdout, "\tCopying filter coefficients...\n");
+	CUDASafeCallWithCleanUp(cudaMemcpy(g_pfPFBCoeff_d, g_pfPFBCoeff, sizePFB, cudaMemcpyHostToDevice));
+
+	// allocate memory for FFT in and out arrays
+	(void) fprintf(stdout, "\tAllocate memory for FFT arrays...\n");
+	int sizeDataBlock_in = g_iNumSubBands * g_iNFFT * sizeof(float2);
+	int sizeTotalDataBlock_out = pfbParams.samples*g_iNumSubBands * sizeof(float2); // output fft array same size as output data for convinence the full size is not used. In the pfb function the output data will be the fft counter times block amount in the fft.
+	CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2FFTIn_d, sizeDataBlock_in));
+	CUDASafeCallWithCleanUp(cudaMalloc((void **) &g_pf2FFTOut_d, sizeTotalDataBlock_out)); // goal will be to update the output ptr each time it fires.
+
+	// set kernel parameters
+	(void) fprintf(stdout, "\tSetting kernel parameters...\n");
+	if(g_iNFFT < g_iMaxThreadsPerBlock) {
+		g_dimBPFB.x  = g_iNFFT;
+		g_dimBCopy.x = g_iNFFT;
+	} else {
+		g_dimBPFB.x  = g_iMaxThreadsPerBlock;
+		g_dimBCopy.x = g_iMaxThreadsPerBlock;
+	}
+	g_dimGPFB.x  = (g_iNumSubBands * g_iNFFT) / g_dimBPFB.x;
+	g_dimGCopy.x = (g_iNumSubBands * g_iNFFT) / g_dimBCopy.x;
+
+	// map kernel params	
+	mapGSize.x = pfbParams.samples;
+	mapGSize.y = pfbParams.fine_channels;
+	mapGSize.z = 1;
+
+	mapBSize.x = 1;
+	mapBSize.y = pfbParams.elements;
+	mapBSize.z = 1;
+
+	// copy kernel params
+	saveGSize.x = pfbParams.fine_channels;
+	saveGSize.y = pfbParams.nfft*pfbParams.taps;
+	saveGSize.z = 1;
+
+	saveBSize.x = pfbParams.elements;
+	saveBSize.y = 1;
+	saveBSize.z = 1;
+
+	(void) fprintf(stdout, "\t\tPFB Kernel Parmaters are:\n\t\tgridDim(%d,%d,%d) blockDim(%d,%d,%d)\n\n",
+							g_dimGPFB.x, g_dimGPFB.y, g_dimGPFB.z,
+							g_dimBPFB.x, g_dimBPFB.y, g_dimBPFB.z);
+
+	(void) fprintf(stdout, "\t\tMAP Kernel Parmaters are:\n\t\tgridDim(%d,%d,%d) blockDim(%d,%d,%d)\n\n",
+							mapGSize.x, mapGSize.y, mapGSize.z,
+							mapBSize.x, mapBSize.y, mapBSize.z);
+
+	(void) fprintf(stdout, "\t\tSave Kernel Parmaters are:\n\t\tgridDim(%d,%d,%d) blockDim(%d,%d,%d)\n",
+							saveGSize.x, saveGSize.y, saveGSize.z,
+							saveBSize.x, saveBSize.y, saveBSize.z);
+
+	// create a CUFFT plan
+	(void) fprintf(stdout, "\tCreating cuFFT plan...\n");
+	iCUFFTRet = cufftPlanMany(&g_stPlan,
+							  FFTPLAN_RANK,
+							  &g_iNFFT,
+							  &g_iNFFT,
+							  FFTPLAN_ISTRIDE,
+							  FFTPLAN_IDIST,
+							  &g_iNFFT,
+							  FFTPLAN_OSTRIDE,
+							  FFTPLAN_ODIST,
+							  CUFFT_C2C,
+							  FFTPLAN_BATCH);
+	if(iCUFFTRet != CUFFT_SUCCESS) {
+		(void) fprintf(stderr, "ERROR: Plan creation failed!\n");
+		return EXIT_FAILURE;
+	}
+
+	fprintf(stdout, "\nDevice for PFB successfully initialized!\n");
+	return EXIT_SUCCESS;
+}
+
+// make a call to execute a ptyhon program.
+void genCoeff(char* procName, params pfbParams) {	
+	system("python --version");	
+	FILE* file;
+	char fname[256] = {"/home/mburnett/GitHub/flag_gpu/lib/pfb/python/flag_gen_coeff.py"};
+
+	int argCount = 11;
+	char* arguments[32] = {}; // come back and create a dynamic structure, i.e definetly do not need 32, always 10 or 11.
+	int i = 0;
+	for(i = 0; i < 32; i++) {
+		arguments[i] = (char*) malloc(256*sizeof(char*));
+	}
+
+	arguments[0] = procName;
+
+	arguments[1] = (char*) "-n\0"; // (char*) acknowledges that I am assigning a const literal to a mutable and removes compile warnings for now.
+	sprintf(arguments[2], "%d", pfbParams.nfft);
+
+	arguments[3] = (char*) "-t\0";
+	sprintf(arguments[4], "%d", pfbParams.taps);
+
+	arguments[5] = (char*) "-b\0";
+	sprintf(arguments[6], "%d", pfbParams.subbands);
+
+	arguments[7] = (char*) "-w\0";
+	sprintf(arguments[8], "%s", pfbParams.window);
+
+	arguments[9] = (char*) "-d\0";
+	sprintf(arguments[10], "%s", pfbParams.dataType);
+
+	if(pfbParams.plot) {
+		arguments[11] = (char*) "-p\0";
+		argCount++;
+	}
+
+	for(i = 0; i < argCount; i++){
+		fprintf(stdout, " %s", arguments[i]); // Add a gen coeff output for feedback.c
+	}
+	fprintf(stdout, "\n");
+
+	// initalize and run python script
+	Py_SetProgramName(procName);
+	Py_Initialize();
+	PySys_SetArgv(argCount, arguments);
+	file = fopen(fname, "r");
+	PyRun_SimpleFile(file, fname);
+	Py_Finalize();
+
+	return;
+}
+
+int doFFT() {
+    cufftResult iCUFFTRet = CUFFT_SUCCESS;
+
+    /* execute plan */
+    iCUFFTRet = cufftExecC2C(g_stPlan,
+                             (cufftComplex*) g_pf2FFTIn_d,
+                             (cufftComplex*) g_pf2FFTOut_d,
+                             CUFFT_FORWARD);
+    if (iCUFFTRet != CUFFT_SUCCESS)
+    {
+        (void) fprintf(stderr, "ERROR! FFT failed!\n");
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
+
+int resetDevice() {
+	cudaError_t cuErr = cudaDeviceReset();
+	if (cuErr != cudaSuccess) {
+		fprintf(stderr, "Device Reset Failed.\n");
+
+		return EXIT_FAILURE;
+	}
+	return EXIT_SUCCESS;
+}
+
+void cleanUp() {
+/* free resources */
+    if (g_pc2InBuf != NULL) {
+        free(g_pc2InBuf);
+        g_pc2InBuf = NULL;
+    }
+    if (g_pc2Data_d != NULL) {
+        (void) cudaFree(g_pc2Data_d);
+        g_pc2Data_d = NULL;
+    }
+    if (g_pf2FFTIn_d != NULL) {
+        (void) cudaFree(g_pf2FFTIn_d);
+        g_pf2FFTIn_d = NULL;
+    }
+    if (g_pf2FFTOut_d != NULL) {
+        (void) cudaFree(g_pf2FFTOut_d);
+        g_pf2FFTOut_d = NULL;
+    }
+
+    free(g_pfPFBCoeff);
+    (void) cudaFree(g_pfPFBCoeff_d);
+
+    /* destroy plan */
+    /* TODO: check for plan */
+    (void) cufftDestroy(g_stPlan);
+
+    return;
+}
+
+void __CUDASafeCallWithCleanUp(cudaError_t iRet, const char* pcFile, const int iLine, void (*pcleanUp)(void)) {
+    if (iRet != cudaSuccess)
+    {
+        (void) fprintf(stderr,
+                       "ERROR: File <%s>, Line %d: %s\n",
+                       pcFile,
+                       iLine,
+                       cudaGetErrorString(iRet));
+        /* free resources */
+        (*pcleanUp)();
+        exit(EXIT_FAILURE);
+    }
+
+    return;
+}
--- a/lib/pfb/src/pfb.h
+++ b/lib/pfb/src/pfb.h
+#ifndef __PFB_H__
+#define __PFB_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/types.h>  /* for open()  */
+#include <sys/stat.h>	/* for open()  */
+#include <fcntl.h>		/* for open()  */
+
+#include <python2.7/Python.h> /* for executing coeff gen file */
+
+#include "kernels.h"
+
+#define FALSE 					0
+#define TRUE  					1
+#define DEBUG					1
+
+#define DEF_CUDA_DEVICE			0
+
+#define DEF_SIZE_READ			262144	// data block size. should this be set dynamically once I get the data?
+#define DEF_LEN_SPEC			32	// Transform size
+#define NUM_TAPS				8	// PFB Decimation factor
+#define DEF_NUM_CHANNELS		25  // System spec for total number of channels
+#define PFB_CHANNELS			5	// Number of coarse channels through PFB
+#define DEF_NUM_ELEMENTS		64  // System spec for number of elements
+#define SAMPLES					4000// Time samples.
+
+#define PFB_OUTPUT_BLOCK_SIZE	SAMPLES*PFB_CHANNELS*2
+
+// FFT Plan configuration
+#define FFTPLAN_RANK 			1				 // dimension of the transform
+#define FFTPLAN_ISTRIDE			(g_iNumSubBands) // The distance between two successive input time elements. - (polarization*numsubbands).
+#define FFTPLAN_OSTRIDE			(g_iNumSubBands) // Similar to ostride to maintain data structure
+#define FFTPLAN_IDIST			1				 // The distance between the first elements of two consecutive batches in the input data. Each FFT operation is a 'batch'. Each subband is a time series and we need a FFT for each subband. Since we have interleaved samples the distance between consecutive batches is 1 sample.
+#define FFTPLAN_ODIST			1				 // Simailar to odist to maintian data structure
+#define FFTPLAN_BATCH			(g_iNumSubBands) // The total number of FFTs to perform per call to DoFFT().
+
+// coeff file configuration
+#define FILE_COEFF_PREFIX		"coeff"
+#define FILE_COEFF_DATATYPE		"float"
+#define FILE_COEFF_SUFFIX		".dat"
+
+#define USEC2SEC            1e-6
+
+typedef unsigned char BYTE;
+
+#define CUDASafeCallWithCleanUp(iRet) __CUDASafeCallWithCleanUp(iRet, __FILE__, __LINE__, &cleanUp)
+void __CUDASafeCallWithCleanUp(cudaError_t iRet, const char* pcFile, const int iLine, void (*pcleanUp)(void));
+
+void genCoeff(char* procName, params pfbParams);
+int initPFB(int iCudaDevice, params pfbParams);
+int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams);
+int doFFT();
+int resetDevice(void);
+void cleanUp(void);
+
+#endif
\ No newline at end of file
--- a/lib/pfb/utils/main.c
+++ b/lib/pfb/utils/main.c
+#include "main.h"
+
+char* g_inputData = NULL;
+float2* g_outputData = NULL;
+
+params pfbParams = DEFAULT_PFB_PARAMS;
+
+int main(int argc, char *argv[]) {
+
+	int ret = EXIT_SUCCESS;
+
+	/*********************************** PARSE INPUT *****************************************/
+
+	/* valid short and long options */
+	const char* const pcOptsShort = ":hn:t:w:k:c:f:e:b:d:s:p";
+	const struct option stOptsLong[] = {
+		{ "help",		0, NULL,	'h' },   
+		{ "nfft", 		1, NULL,	'n' },
+		{ "taps",		1, NULL,	't' },
+		{ "window",		1, NULL,	'w' },
+		{ "samples",	1, NULL,	'k' },
+		{ "coarse", 	1, NULL,	'c' },
+		{ "fine",		1, NULL, 	'f' },
+		{ "elements",	1, NULL,	'e' },
+		{ "nsub",		1, NULL,	'b' },
+		{ "datatype",	1, NULL,	'd' },
+		{ "select",		1, NULL,	's' },
+		{ "plot",		0, NULL,	'p' },
+		{ NULL,			0, NULL, 	0	}
+	};
+
+	const char* progName = argv[0];
+
+	int errFlag = 0;
+
+	/* parse input */
+	int opt = 0; //
+	int prevInd = 0; // used to track optind to manual check missing arguments.
+	do {
+		/* 
+			Getopt will load the next option if the argument is missing, getopt's ':' error check
+			really only works on the last option. This assumes that no argument has a '-' in it.
+		*/
+		prevInd = optind;
+		opt = getopt_long(argc, argv, pcOptsShort, stOptsLong, NULL);
+
+		if(optind == prevInd + 2 && (*optarg == '-' || *optarg == '.')) { // assumes arguments cannot start with '-' or '.'. Also, if optarg is null this causes a seg fault and the first logical comparisson catches the null case. The parans for the or helps not cause the fault.
+			optopt = opt; // update getopt's optopt variable to contain the violating variable. 
+			opt = ':'; // trigger the error character.
+			--optind; // decrement optind since it was incremented incorrectly.
+		}
+
+		switch(opt)
+		{
+			case 'h':
+				printUsage(progName);
+				return EXIT_SUCCESS;
+
+			case 'n':
+				pfbParams.nfft = (int) atoi(optarg);
+				break;
+
+			case 't':
+				pfbParams.taps = (int) atoi(optarg);
+				break;
+
+			case 'w':
+				pfbParams.window = optarg;
+				break;
+
+			case 'k':
+				pfbParams.samples = (int) atoi(optarg);
+				break;
+
+			case 'c':
+				pfbParams.coarse_channels = (int) atoi(optarg);
+				break;
+
+			case 'e':
+				pfbParams.elements = (int) atoi(optarg);
+				break;
+
+			case 'f':
+				pfbParams.fine_channels = (int) atoi(optarg);
+				break;
+
+			case 'b':
+				pfbParams.subbands =  (int) atoi(optarg);
+				break;
+
+			case 'd':
+				pfbParams.dataType = optarg;
+				break;
+
+			case 's':
+				pfbParams.select = (int) atoi(optarg);
+				// check valid select range.
+				if(pfbParams.select < 0 || pfbParams.select > 4) {
+					(void) fprintf(stderr, "ERROR: Channel select range [0, 4]\n");
+					errFlag++;
+				}
+				break;
+
+			case 'p':
+				pfbParams.plot = 0;
+				break;
+
+			case ':':
+				(void) fprintf(stderr, "-%c option requires a parameter.\n", optopt);
+				errFlag++;
+				break;
+
+			case '?':
+				(void) fprintf(stderr, "Unrecognized option -%c.\n", optopt);
+				errFlag++;
+				break;
+
+			case -1: /* done with options */
+				break;
+
+			default: /* unexpected */
+				assert(0);
+		}
+	} while (opt != -1);
+
+	if(errFlag) {
+		printUsage(progName);
+		return EXIT_FAILURE;
+	}
+
+	// no data file presented
+	int genFlag = 0;
+	if(argc <= optind) {
+		(void) fprintf(stderr, "ERROR: Missing data file.\n");
+		genFlag = 1;
+	}
+	
+	// init input data array
+	int readSize = pfbParams.samples * pfbParams.coarse_channels * pfbParams.elements * (2*sizeof(char));
+	g_inputData = (char*) malloc(readSize);
+	memset(g_inputData, 0, readSize);
+
+	// Determine wether to get data from a file or generate it. 
+	if(!genFlag) {
+		// get data filename
+		char filename[256] = {0};
+		(void) strncpy(filename, argv[optind], 256);
+		filename[255] = '\0';
+		ret = loadData(filename, g_inputData, readSize);
+		if (ret == EXIT_FAILURE) {
+			return EXIT_FAILURE;
+		}
+	} else {
+		// generate data
+		//generate freq array for data
+		int i = 0;
+		float fs = 303.0; // KHz - a default sample rate.
+		int channelBandgap = 10.0;		// KHz jumps
+		float* freq = (float *) malloc(pfbParams.coarse_channels*sizeof(float));
+		for(i = 0; i <= pfbParams.coarse_channels; i++) {
+			freq[i] = channelBandgap * i + 5.0;
+		}
+		genData(g_inputData, freq, fs, pfbParams.samples, pfbParams.coarse_channels, pfbParams.elements);
+	}		
+
+	/****************************** SETUP PFB ******************************/
+
+	/* init cuda device */
+	int iCudaDevice = DEF_CUDA_DEVICE;
+
+	// create coeff and write to a file that is read in initPFB.
+	pfbParams.subbands = pfbParams.elements*pfbParams.fine_channels;
+	genCoeff(argc, argv, pfbParams);
+
+	// init the device, loads coeff
+	ret = initPFB(iCudaDevice, pfbParams);
+
+	// malloc data arrays
+	int outputSize = pfbParams.samples * pfbParams.fine_channels * pfbParams.elements * (2*sizeof(float)); // need to convince myself of this output data size.
+
+	g_outputData = (float2*) malloc(outputSize);
+	memset(g_outputData, 0, outputSize);
+
+
+	// run the pfb function
+	clock_t start, end;
+	start = clock();
+	ret = runPFB(g_inputData, g_outputData, pfbParams);
+	end = clock();
+	double timeTaken = 0;
+	timeTaken = ((double) (end - start))/CLOCKS_PER_SEC;
+
+	(void) printf("Time taken (barring Init()): %gs\n", timeTaken);
+
+	if (ret == EXIT_FAILURE) {
+		(void) fprintf(stderr, "ERROR: runPFB failed!\n");
+		free(g_inputData);
+		free(g_outputData);
+		return EXIT_FAILURE;
+	}
+
+	// process return from pfb - write to file
+	int file = 0;
+	
+	char outfile[256] = "output/outfile.dat\0";
+	file = open(outfile,
+					O_CREAT | O_TRUNC | O_WRONLY,
+					S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+	if(file < EXIT_SUCCESS) {
+		(void) fprintf(stderr, "ERROR: writing outfile failed\n");
+		free(g_inputData);
+		free(g_outputData);
+		return EXIT_FAILURE;
+	}
+
+	// clean up and exit
+	cleanUp();
+	ret = resetDevice();
+
+	(void) write(file, g_outputData, outputSize);
+	(void) close(file);
+
+	free(g_inputData);
+	free(g_outputData);
+
+	return EXIT_SUCCESS;
+}
\ No newline at end of file
--- a/lib/pfb/utils/main.h
+++ b/lib/pfb/utils/main.h
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string.h>		/* for strncopy(), memcpy(), strerror()*/
+#include <sys/types.h>  /* for open()  */
+#include <sys/stat.h>	/* for open()  */
+#include <sys/time.h>
+#include <fcntl.h>		/* for open()  */
+#include <unistd.h>		/* for close() */
+#include <errno.h>		/* for errno   */  
+
+#include <getopt.h>		/* for option parsing */
+
+#include "pfb.h"
+#include "helper.h"
+#include "tools/tools.h"
\ No newline at end of file
--- a/lib/pfb/utils/tools/filter_response_gen.c
+++ b/lib/pfb/utils/tools/filter_response_gen.c
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <float.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#define LEN_GENSTRING 256
+#define SCALE_FACTOR  127
+#define F_S		      256.0 // MHz
+#define N			  256  // Time samples
+#define MAX_FREQ      256
+#define CHANNELS	  25    // Freq Channels
+#define NUM_EL		  64    // Antenna Elements
+
+void printUsage(const char* progName) {
+	(void) printf("Usage: %s [options] <data-file>\n", progName);
+    (void) printf("    -h  --help                           ");
+    (void) printf("Display this usage information\n");
+    (void) printf("    -s  --samples <value>                ");
+    (void) printf("Number of samples in the data\n");
+    (void) printf("    -f  --fs <value>                     ");
+    (void) printf("Max freq\n");
+    (void) printf("    -w  --band <value>                   ");
+    (void) printf("Sample rate\n");
+    (void) printf("    -c  --channels <value>               ");
+    (void) printf("Number of channels in data\n");
+    (void) printf("    -e  --elemnets <value>               ");
+    (void) printf("Number of elements in data\n");
+	return;
+}
+
+int main(int argc, char *argv[]) {
+
+	int samples = N;
+	int fs = F_S;
+	int coarseChannels = CHANNELS;
+	int elements = NUM_EL;
+	int genFreq = MAX_FREQ;
+
+	/* valid short and long options */
+	const char* const pcOptsShort = ":hs:f:w:c:e:";
+	const struct option stOptsLong[] = {
+		{ "help",		0, NULL,	'h' },
+		{ "samples",	1, NULL,	's' },
+		{ "fs",			1, NULL,	'f' },
+		{ "band",		1, NULL,	'w' },
+		{ "channels",	1, NULL,	'c' },
+		{ "elements",	1, NULL,	'e' },
+		{ NULL,			0, NULL, 	0	}
+	};
+
+	const char* progName = argv[0];
+
+	int errFlag = 0;
+
+	/* parse input */
+	int opt = 0; //
+	int prevInd = 0; // used to track optind to manual check missing arguments.
+	do {
+		/* 
+			Getopt will load the next option if the argument is missing, getopt's ':' error check
+			really only works on the last option. This assumes that no argument has a '-' in it.
+		*/
+		prevInd = optind;
+		opt = getopt_long(argc, argv, pcOptsShort, stOptsLong, NULL);
+
+		if(optind == prevInd + 2 && (*optarg == '-' || *optarg == '.')) { // assumes arguments cannot start with '-' or '.'. Also, if optarg is null this causes a seg fault and the first logical comparisson catches the null case. The parans for the or helps not cause the fault.
+			optopt = opt; // update getopt's optopt variable to contain the violating variable. 
+			opt = ':'; // trigger the error character.
+			--optind; // decrement optind since it was incremented incorrectly.
+		}
+
+		switch(opt)
+		{
+			case 'h':
+				printUsage(progName);
+				return EXIT_SUCCESS;
+
+			case 's':
+				samples = (int) atoi(optarg);
+				break;
+
+			case 'c':
+				coarseChannels = (int) atoi(optarg);
+				break;
+
+			case 'e':
+				elements = (int) atoi(optarg);
+				break;
+
+			case 'f':
+				fs = (int) atoi(optarg);
+				break;
+
+			case 'w':
+				genFreq = (int) atoi(optarg);
+				break;
+
+			case ':':
+				(void) fprintf(stderr, "-%c option requires a parameter.\n", optopt);
+				errFlag++;
+				break;
+
+			case '?':
+				(void) fprintf(stderr, "Unrecognized option -%c.\n", optopt);
+				errFlag++;
+				break;
+
+			case -1: /* done with options */
+				break;
+
+			default: /* unexpected */
+				assert(0);
+		}
+	} while (opt != -1);
+
+	if(errFlag) {
+		printUsage(progName);
+		return EXIT_FAILURE;
+	}
+
+	// no data file presented
+	if(argc <= optind) {
+		(void) fprintf(stderr, "ERROR: Missing output data filename.\n");
+		return EXIT_FAILURE;
+	}
+
+	int iFile = 0;
+	char acDataFilename[LEN_GENSTRING] = {0};
+	(void) strncpy(acDataFilename, argv[optind], LEN_GENSTRING);
+	acDataFilename[LEN_GENSTRING-1] = '\0'; //NUll terminator at end of filename string.
+
+	iFile = open(acDataFilename,
+					O_CREAT | O_TRUNC | O_WRONLY,
+					S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+	if(EXIT_FAILURE == iFile) {
+		(void) fprintf(stderr, "ERROR: Failed to open output file. %s\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	int i = 0;
+	//const int genFreq = 256;
+    //Generate freq array.
+    float* freq = NULL;
+    freq = (float*) malloc(genFreq*sizeof(float));
+    //float freq[genFreq] = {};
+    for(i = 1; i <= genFreq; i++) {
+        freq[i-1] = i*1.0;
+        //fprintf(stdout, "freq: %f\n", freq[i-1]);
+    }
+    fprintf(stdout, "i: %d\n", i);
+    int f = 0;
+	int n = 0;
+	int c = 0;
+	int e = 0;
+ 
+	signed char cDataRe = 0;
+	signed char cDataIm = 0;
+	signed char* toWrite = (signed char *) malloc(samples*coarseChannels*elements*(2*sizeof(signed char)));
+
+	fprintf(stdout,
+			"INFO: Generating samples...\n"
+			"\tSamples:\t %d\n"
+			"\tSample rate:\t %d\n"
+			"\tMax Range:\t %d\n"
+			"\tChannels:\t %d\n"
+			"\tElements:\t %d\n",
+			samples, fs, genFreq, coarseChannels, elements);
+
+	for(f = 0; f < genFreq; f++) {
+		for(n = 0; n < samples; n++) {
+
+				cDataRe = SCALE_FACTOR * (0.1 * cos(2*M_PI * freq[f] * n / fs));
+				cDataIm = SCALE_FACTOR * (0.1 * sin(2*M_PI * freq[f] * n / fs));
+
+			for(c = 0; c < coarseChannels; c++) {
+				for(e = 0; e < 2*elements; e++) {
+					
+					int idx = e + c * (2 * elements) + n * (coarseChannels * 2*elements);
+					if( !(e%2) ) {
+					//create interleaved samples for real and Im 
+					toWrite[idx] = cDataRe;
+					} else {
+					toWrite[idx] = cDataIm;
+					}
+				}
+			}
+		}
+		(void) write(iFile, toWrite, samples*coarseChannels*elements*(2*sizeof(signed char)));
+	}
+
+	(void) close(iFile);
+
+	return EXIT_SUCCESS;
+}
\ No newline at end of file
--- a/lib/pfb/utils/tools/genTool
+++ b/lib/pfb/utils/tools/genTool
--- a/lib/pfb/utils/tools/helper.c
+++ b/lib/pfb/utils/tools/helper.c
+#include "helper.h"
+
+// File containing helper functions for main
+
+void printUsage(const char* progName) {
+	(void) printf("Usage: %s [options] <data-file>\n", progName);
+    (void) printf("    -h  --help                           ");
+    (void) printf("Display this usage information\n");
+    (void) printf("    -b  --nsub                           ");
+    (void) printf("Number of sub-bands in the data\n");
+    (void) printf("    -n  --nfft <value>                   ");
+    (void) printf("Number of points in FFT\n");
+    (void) printf("    -w  --window <string>                ");
+    (void) printf("Filter window type, hanning, cheb-win\n");
+	(void) printf("    -k  --samples <value>                ");
+    (void) printf("Number of time samples processed\n");
+    (void) printf("    -c  --coarse <value>                 ");
+    (void) printf("Number of coarse channels in data\n");
+    (void) printf("    -f  --fine <value>                   ");
+    (void) printf("Number of channels selected to process\n");
+    (void) printf("    -e  --elements <value>               ");
+    (void) printf("Number of elements in data\n");
+    (void) printf("    -d  --datatype <string>              ");
+    (void) printf("Filter coefficient data type, float or int\n");
+    (void) printf("    -s  --select <value>                 ");
+    (void) printf("Where in channels to begin selecting fine\n");
+	return;
+}
+
+
+int loadData(char* f, char* inputData, int size) {
+	int ret = EXIT_SUCCESS;
+	int file =  0;
+
+	//int readSize = SAMPLES * DEF_NUM_CHANNELS * DEF_NUM_ELEMENTS * (2*sizeof(char));
+	//inputData = (char*) malloc(readSize);
+	if(NULL == inputData) {
+		(void) fprintf(stderr, "ERROR: Memory allocation failed! %s.\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	file = open(f, O_RDONLY);
+	if (file < EXIT_SUCCESS) {
+		(void) fprintf(stderr, "ERROR: failed to open data file. %s\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	ret = read(file, inputData, size);
+	if (ret < EXIT_SUCCESS) {
+		(void) fprintf(stderr, "ERROR: failed to read data file. %s\n", strerror(errno));
+		(void) close(file);
+		return EXIT_FAILURE;
+	}
+
+	(void) close(file);
+	return EXIT_SUCCESS;
+
+}
\ No newline at end of file
--- a/lib/pfb/utils/tools/helper.h
+++ b/lib/pfb/utils/tools/helper.h
+#ifndef __HELPER_H
+#define __HELPER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Had to copy these over form pfb.h since they are part of a load memory helper function most likely not needed in full pfb implementation
+#include <string.h>		/* for strncopy(), memcpy(), strerror()*/
+#include <sys/types.h>  /* for open()  */
+#include <sys/stat.h>	/* for open()  */
+#include <fcntl.h>		/* for open()  */
+#include <unistd.h>		/* for close() */
+#include <errno.h>		/* for errno   */
+
+void printUsage(const char* progName);
+int loadData(char* f, char* inputData, int size);
+
+
+#endif
\ No newline at end of file
--- a/lib/pfb/utils/tools/response_phase.c
+++ b/lib/pfb/utils/tools/response_phase.c
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <float.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#define LEN_GENSTRING 256
+#define SCALE_FACTOR  127
+#define F_S		      256.0 // MHz
+#define N			  256  // Time samples
+#define MAX_FREQ      256
+#define CHANNELS	  25    // Freq Channels
+#define NUM_EL		  64    // Antenna Elements
+
+void printUsage(const char* progName) {
+	(void) printf("Usage: %s [options] <data-file>\n", progName);
+    (void) printf("    -h  --help                           ");
+    (void) printf("Display this usage information\n");
+    (void) printf("    -s  --samples <value>                ");
+    (void) printf("Number of samples in the data\n");
+    (void) printf("    -f  --fs <value>                     ");
+    (void) printf("Max freq\n");
+    (void) printf("    -w  --band <value>                   ");
+    (void) printf("Sample rate\n");
+    (void) printf("    -c  --channels <value>               ");
+    (void) printf("Number of channels in data\n");
+    (void) printf("    -e  --elemnets <value>               ");
+    (void) printf("Number of elements in data\n");
+	return;
+}
+
+int main(int argc, char *argv[]) {
+
+	int samples = N;
+	int fs = F_S;
+	int coarseChannels = CHANNELS;
+	int elements = NUM_EL;
+	int genFreq = MAX_FREQ;
+
+	/* valid short and long options */
+	const char* const pcOptsShort = ":hs:f:w:c:e:";
+	const struct option stOptsLong[] = {
+		{ "help",		0, NULL,	'h' },
+		{ "samples",	1, NULL,	's' },
+		{ "fs",			1, NULL,	'f' },
+		{ "band",		1, NULL,	'w' },
+		{ "channels",	1, NULL,	'c' },
+		{ "elements",	1, NULL,	'e' },
+		{ NULL,			0, NULL, 	0	}
+	};
+
+	const char* progName = argv[0];
+
+	int errFlag = 0;
+
+	/* parse input */
+	int opt = 0; //
+	int prevInd = 0; // used to track optind to manual check missing arguments.
+	do {
+		/* 
+			Getopt will load the next option if the argument is missing, getopt's ':' error check
+			really only works on the last option. This assumes that no argument has a '-' in it.
+		*/
+		prevInd = optind;
+		opt = getopt_long(argc, argv, pcOptsShort, stOptsLong, NULL);
+
+		if(optind == prevInd + 2 && (*optarg == '-' || *optarg == '.')) { // assumes arguments cannot start with '-' or '.'. Also, if optarg is null this causes a seg fault and the first logical comparisson catches the null case. The parans for the or helps not cause the fault.
+			optopt = opt; // update getopt's optopt variable to contain the violating variable. 
+			opt = ':'; // trigger the error character.
+			--optind; // decrement optind since it was incremented incorrectly.
+		}
+
+		switch(opt)
+		{
+			case 'h':
+				printUsage(progName);
+				return EXIT_SUCCESS;
+
+			case 's':
+				samples = (int) atoi(optarg);
+				break;
+
+			case 'c':
+				coarseChannels = (int) atoi(optarg);
+				break;
+
+			case 'e':
+				elements = (int) atoi(optarg);
+				break;
+
+			case 'f':
+				fs = (int) atoi(optarg);
+				break;
+
+			case 'w':
+				genFreq = (int) atoi(optarg);
+				break;
+
+			case ':':
+				(void) fprintf(stderr, "-%c option requires a parameter.\n", optopt);
+				errFlag++;
+				break;
+
+			case '?':
+				(void) fprintf(stderr, "Unrecognized option -%c.\n", optopt);
+				errFlag++;
+				break;
+
+			case -1: /* done with options */
+				break;
+
+			default: /* unexpected */
+				assert(0);
+		}
+	} while (opt != -1);
+
+	if(errFlag) {
+		printUsage(progName);
+		return EXIT_FAILURE;
+	}
+
+	int i = 0;
+    //Generate freq array.
+    float* freq = NULL;
+    freq = (float*) malloc(genFreq*sizeof(float));
+    for(i = 1; i <= genFreq; i++) {
+        freq[i-1] = i*1.0;
+    }
+
+    int p = 0; // phase iterations
+    int f = 0;
+	int n = 0;
+	int c = 0;
+	int e = 0;
+ 
+	signed char cDataRe = 0;
+	signed char cDataIm = 0;
+	signed char* toWrite = (signed char *) malloc(samples*coarseChannels*elements*(2*sizeof(signed char)));
+
+	fprintf(stdout,
+			"INFO: Generating samples...\n"
+			"\tSamples:\t %d\n"
+			"\tSample rate:\t %d\n"
+			"\tChannels:\t %d\n"
+			"\tElements:\t %d\n",
+			samples, fs, coarseChannels, elements);
+
+	for(p = 0; p < 6; p++){
+		int iFile = 0;
+		char acDataFilename[LEN_GENSTRING] = "phase_data/file_%d.dat";
+		sprintf(acDataFilename, acDataFilename, p);
+		fprintf(stdout, "%s\n",acDataFilename);
+		acDataFilename[LEN_GENSTRING-1] = '\0'; //NUll terminator at end of filename string.
+
+		iFile = open(acDataFilename,
+						O_CREAT | O_TRUNC | O_WRONLY,
+						S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+		if(EXIT_FAILURE == iFile) {
+			(void) fprintf(stderr, "ERROR: Failed to open output file. %s\n", strerror(errno));
+			return EXIT_FAILURE;
+		}
+		for(f = 0; f < genFreq; f++) {
+			for(n = 0; n < samples; n++) {
+
+					cDataRe = SCALE_FACTOR * (0.99 * cos(2*M_PI * freq[f] * n / fs + 2*M_PI/(double)(p+1)));
+					cDataIm = SCALE_FACTOR * (0.99 * sin(2*M_PI * freq[f] * n / fs + 2*M_PI/(double)(p+1)));
+
+				for(c = 0; c < coarseChannels; c++) {
+					for(e = 0; e < 2*elements; e++) {
+						
+						int idx = e + c * (2 * elements) + n * (coarseChannels * 2*elements);
+						if( !(e%2) ) {
+						//create interleaved samples for real and Im 
+						toWrite[idx] = cDataRe;
+						} else {
+						toWrite[idx] = cDataIm;
+						}
+					}
+				}
+			}
+			(void) write(iFile, toWrite, samples*coarseChannels*elements*(2*sizeof(signed char)));
+		}
+		(void) close(iFile);
+	}
+
+	return EXIT_SUCCESS;
+}
\ No newline at end of file
--- a/lib/pfb/utils/tools/tools.c
+++ b/lib/pfb/utils/tools/tools.c
+#include "tools.h"
+
+
+void genData(char* data, float* freq, float fs, int samples, int channels, int elements){
+
+
+	fprintf(stdout,
+			"INFO: Generating samples...\n"
+			"\tSamples:\t %d\n"
+			"\tSample rate:\t %f\n"
+			"\tChannels:\t %d\n"
+			"\tElements:\t %d\n",
+			samples, fs, channels, elements);
+
+
+	int n = 0;
+	int f = 0;
+	int e = 0;
+ 
+	signed char dataRe = 0;
+	signed char dataIm = 0;
+	//int size = elements*channels*(2*sizeof(char)); // 2 for complex data.
+	for(n = 0; n < samples; n++) {
+		for(f = 0; f < channels; f++) {
+
+			//if(f==5){ // only insert one tone
+
+				//use the same sample for all elements
+				dataRe = SCALE_FACTOR * (.1 * cos(2*M_PI * freq[f] * n / fs));
+				dataIm = SCALE_FACTOR * (.1 * sin(2*M_PI * freq[f] * n / fs));
+				for(e = 0; e < 2*elements; e++) {
+					
+					int idx = e + f * (2 * elements) + n * channels * (2*elements);
+					if( !(e%2) ) {
+					//create interleaved samples for real and Im 
+					data[idx] = dataRe;
+					} else {
+					data[idx] = dataIm;
+					}
+				}
+			//} else {
+			//	cDataReX = 0;
+			//	cDataImY = 0;
+			//}
+		}
+	}
+	return;
+}
\ No newline at end of file
--- a/lib/pfb/utils/tools/tools.h
+++ b/lib/pfb/utils/tools/tools.h
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include <float.h>
+
+#define SCALE_FACTOR 127
+
+void genData(char* data, float* freq, float fs, int samples, int channels, int elements);
\ No newline at end of file
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -4,10 +4,13 @@ AM_CPPFLAGS     =
 AM_CPPFLAGS += -I"@HASHPIPE_INCDIR@"
 AM_CPPFLAGS += -I"@XGPU_INCDIR@"
 AM_CPPFLAGS += -I"@FLAGBEAM_INCDIR@"
+AM_CPPFLAGS += -I"@FLAGPFB_INCDIR@"
 AM_CPPFLAGS += -I"@FLAGPOW_INCDIR@"
 AM_CPPFLAGS += -I"@XGPU_FRB_INCDIR@"
 AM_CPPFLAGS += -I"@XGPU_PFB_INCDIR@"

+AM_CPPFLAGS += "@CUDA_INCDIR@"
+
 # AM_CFLAGS is used for all C compiles
 AM_CFLAGS = -fPIC -O3 -Wall -Werror -fno-strict-aliasing -mavx
 # AM_CFLAGS = -ggdb -fPIC -O3 -Wall -Werror -fno-strict-aliasing
@@ -43,6 +46,11 @@ flag_b_threads = flag_net_thread.c \
                 flag_beamform_thread.c \
                 flag_beamsave_thread.c

+flag_f_threads = flag_net_thread.c \
+                 flag_transpose_thread.c \
+                 flag_pfb_thread.c \
+                 flag_pfbsave_thread.c
+
 flag_bx_threads = flag_net_thread.c \
                  flag_transpose_thread.c \
                  flag_dual_thread.c \
@@ -53,14 +61,14 @@ flag_x_frb_threads = flag_net_thread.c \
                     flag_frb_correlator_thread.c \
                     flag_frb_corsave_thread.c

-# TODO: Add the pfb thread
 flag_fx_threads = flag_net_thread.c \
                     flag_transpose_thread.c \
+                     flag_pfb_thread.c \
                     flag_pfb_correlator_thread.c \
                     flag_pfb_corsave_thread.c

 # This lists all of the plugins that will be created
-lib_LTLIBRARIES = flag_x.la flag_b.la flag_x_frb.la flag_bx.la flag_fx.la
+lib_LTLIBRARIES = flag_x.la flag_b.la flag_f.la flag_x_frb.la flag_bx.la flag_fx.la

 # flag_x.la sources and libraries
 flag_x_la_SOURCES  = $(flag_databuf) $(flag_x_threads) $(fifo_codes)
@@ -77,6 +85,14 @@ flag_b_la_LDFLAGS += -L"@XGPU_LIBDIR@" -Wl,-rpath,"@XGPU_LIBDIR@"
 flag_b_la_LDFLAGS += -L"@HASHPIPE_LIBDIR@" -Wl,-rpath,"@HASHPIPE_LIBDIR@"
 flag_b_la_LDFLAGS += -L"@FLAGBEAM_LIBDIR@" -Wl, -rpath, "@FLAGBEAM_LIBDIR@"

+# flab_f.la sources and libraries
+flag_f_la_SOURCES = $(flag_databuf) $(flag_f_threads) $(fifo_codes)
+flag_f_la_LIBADD  = -lrt -ldl -lutil -L/usr/local/lib -lpython2.7 -lxgpu -lflagpfb -lcufft -lcudart -L/usr/local/cuda/lib64 -lcfitsio
+flag_f_la_LDFLAGS = -avoid-version -module -shared -export-dynamic --enable-shared
+flag_f_la_LDFLAGS += -L"@XGPU_LIBDIR@" -Wl, -rpath, "@XGPU_LIBDIR@"
+flag_f_la_LDFLAGS += -L"@HASHPIPE_LIBDIR@" -Wl, -rpath,"@HASHPIPE_LIBDIR@"
+flag_f_la_LDFLAGS += -L"@FLAGPFB_LIBDIR@" -Wl, -rpath,"@FLAGPFB_LIBDIR@"
+
 # flag_bx.la sources and libraries
 flag_bx_la_SOURCES = $(flag_databuf) $(flag_bx_threads) $(fifo_codes)
 flag_bx_la_LIBADD   = -lrt -lxgpu -lflagbeamformer -lcublas -L/usr/local/cuda/lib64 -lcfitsio
@@ -94,11 +110,11 @@ flag_x_frb_la_LDFLAGS += -L"@HASHPIPE_LIBDIR@" -Wl,-rpath,"@HASHPIPE_LIBDIR@"

 # flag_fx.la sources and libraries
 flag_fx_la_SOURCES = $(flag_databuf) $(flag_fx_threads) $(fifo_codes)
-flag_fx_la_LIBADD   = -lrt -lxgpupfb -lcublas -L/usr/local/cuda/lib64 -lcfitsio
+flag_fx_la_LIBADD   = -lrt -lxgpupfb -lcublas -L/usr/local/cuda/lib64 -lcfitsio -lflagpfb -lcufft -lcudart
 flag_fx_la_LDFLAGS  = -avoid-version -module -shared -export-dynamic --enable-shared
 flag_fx_la_LDFLAGS += -L"@XGPU_PFB_LIBDIR@" -Wl,-rpath,"@XGPU_PFB_LIBDIR@"
 flag_fx_la_LDFLAGS += -L"@HASHPIPE_LIBDIR@" -Wl,-rpath,"@HASHPIPE_LIBDIR@"
-# TODO: Add the PFB library flags
+flag_fx_la_LDFLAGS += -L"@FLAGPFB_LIBDIR@" -Wl, -rpath,"@FLAGPFB_LIBDIR@"


 # Installed scripts

--- a/src/configure.ac
+++ b/src/configure.ac
@@ -28,6 +28,10 @@ AX_CHECK_XGPU_FRB_INFO
 AX_CHECK_XGPU_PFB_INFO
 AX_CHECK_FLAGBEAMFORM
 AX_CHECK_FLAGPOW
+AX_CHECK_FLAGPFB
+
+# Check for CUDA
+AX_CHECK_CUDA

 # Checks for libraries.
 AC_CHECK_LIB([pthread], [pthread_create])
@@ -35,7 +39,7 @@ AC_CHECK_LIB([rt], [clock_gettime])
 AC_CHECK_LIB([z], [crc32])

 # Checks for header files.
-AC_CHECK_HEADERS([netdb.h stdint.h stdlib.h string.h sys/socket.h sys/time.h unistd.h zlib.h])
+AC_CHECK_HEADERS([cuda.h netdb.h stdint.h stdlib.h string.h sys/socket.h sys/time.h unistd.h zlib.h])

 # Checks for typedefs, structures, and compiler characteristics.
 AC_C_INLINE

--- a/src/flag_databuf.c
+++ b/src/flag_databuf.c
@@ -177,6 +177,29 @@ int flag_gpu_beamformer_output_databuf_set_filled(flag_gpu_beamformer_output_dat
    return hashpipe_databuf_set_filled((hashpipe_databuf_t *)d, block_id);
 }

+hashpipe_databuf_t * flag_gpu_pfb_output_databuf_create(int instance_id, int databuf_id) {
+    size_t header_size = sizeof(hashpipe_databuf_t) + sizeof(hashpipe_databuf_cache_alignment);
+    size_t block_size  = sizeof(flag_gpu_pfb_output_block_t);
+    int    n_block     = N_GPU_OUT_BLOCKS;
+    return hashpipe_databuf_create(
+        instance_id, databuf_id, header_size, block_size, n_block);
+}
+
+int flag_gpu_pfb_output_databuf_wait_free(flag_gpu_pfb_output_databuf_t* d, int block_id) {
+    return hashpipe_databuf_wait_free((hashpipe_databuf_t *)d, block_id);
+}
+
+int flag_gpu_pfb_output_databuf_wait_filled(flag_gpu_pfb_output_databuf_t* d, int block_id) {
+    return hashpipe_databuf_wait_filled((hashpipe_databuf_t *)d, block_id);
+}
+
+int flag_gpu_pfb_output_databuf_set_free(flag_gpu_pfb_output_databuf_t* d, int block_id) {
+    return hashpipe_databuf_set_free((hashpipe_databuf_t *)d, block_id);
+}
+
+int flag_gpu_pfb_output_databuf_set_filled(flag_gpu_pfb_output_databuf_t* d, int block_id) {
+    return hashpipe_databuf_set_filled((hashpipe_databuf_t *)d, block_id);
+}

 hashpipe_databuf_t * flag_gpu_power_output_databuf_create(int instance_id, int databuf_id) {
    size_t header_size = sizeof(hashpipe_databuf_t) + sizeof(hashpipe_databuf_cache_alignment);