Commit 66e66ca4 authored by Mitch Burnett's avatar Mitch Burnett
Browse files

create a correlator extension to the rtbf

right now the correlator is implemented as an 'experimental extension'
where the functionality is provided by leveraging the infrastructure
provided by the rtbf. In most cases the correlator operates similar to
the `BEAM_OP_RAW` mode of the beamformer with the exception of the
output size and the inputs to a call to `cublasCgemmbatched()`.

`init_correlator()` is functionally similar to `init_beamformer` except
only relevant to `BEAM_OP_CORR` where the cublas context is constructed
to prepare the data dimensions for the call to `correlate()`.

The `run_correlator()` function is very similar also to
`run_beamformer()` and looks again like when running `BEAM_OP_RAW`
because it matches the requirements of copy onto device -> transpose ->
matrix kernel (correlation instead of beamform) -> copy off teh device.
The copy of device is conditional however. `run_correlator()` could
really be operated inside `run_beamformer()` but to not take a penalty
in a hit on the if statment that is not needed in the beamformer
(because a copy off is always required) it would be better for now to
just have a seperate method.

The `corr_demo.c` is also functionally modeled the same as `rtbf_demo.c`
except to just demonstrate the functions of correlation and conditionaly
integration dumps.
parent b442f5be
No related merge requests found
Showing with 807 additions and 1 deletion
+807 -1
......@@ -32,6 +32,7 @@ rtbf_srcs = ['./src/cublas_beamformer.cu']
rtbf_info_src = ['./src/rtbf_info.c']
demo_rtbf_src = ['./src/rtbf_demo.c']
demo_corr_src = ['./src/corr_demo.c']
rtbf_nantenna = get_option('RTBF_NANTENNA')
rtbf_nbin = get_option('RTBF_NBIN')
......@@ -86,6 +87,12 @@ if not shared_only
dependencies: [cuda_dep, m_dep, plot_deps],
install: true)
executable(
'democorr', demo_corr_src,
link_with: rtbf_lib,
dependencies: [cuda_dep, m_dep, plot_deps],
install: true)
executable(
'rawmode', './src/unit_beam_op_raw.c',
link_with: rtbf_lib,
......
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include "cublas_beamformer.h"
#include "cpgplot.h"
#define ELAPSED_MS(start,stop) \
((((int64_t)stop.tv_sec-start.tv_sec)*1000*1000*1000+(stop.tv_nsec-start.tv_nsec))/1e6)
//#define RUN_UNIT_TEST
#ifdef RUN_UNIT_TEST
// TODO figure out how to get these into the program without tons of args
// unit test definition
#define INPUT_PER_FID 4 // inputs per fid
#define NUM_FIDS 2 // number of fids
#define TIME_PER_PKT 10 // time per network packet
#define MCNT_PER_BLK 40 // mcounts per block
#else
// xb system definition
#define INPUT_PER_FID 12 // inputs per fid
#define NUM_FIDS 12 // number of fids
#define TIME_PER_PKT 14 // time per network packet
#define MCNT_PER_BLK 336 // mcounts per block
#endif
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
/*
* Multiplicative LCG for generating uniform(0.0, 1.0) random numbers
* x_n = 7^5*x_(n-1)mod(2^31 - 1)
* With x seeded to 1 the 10000th x value should be 1043618065
* From R. Jain, "The Art of Computer Systems Performance Analysis,"
* John Wiley & Sons, 1991. (Page 443, Figure 26.2)
*/
double rand_val(int seed) {
const long a = 16807; // Multiplier
const long m = 2147483647; // Modulus
const long q = 127773; // m div a
const long r = 2836; // m mod a
static long x; // Random int value
long x_div_q; // x divided by q
long x_mod_q; // x modulo q
long x_new; // New x value
// Set the seed if argument is non-zero and then return zero
if (seed > 0) {
x = seed;
return(0.0);
}
// RNG using integer arithmetic
x_div_q = x / q;
x_mod_q = x % q;
x_new = (a * x_mod_q) - (r * x_div_q);
if (x_new > 0)
x = x_new;
else
x = x_new + m;
// Return a random value between 0.0 and 1.0
return((double) x / m);
}
/*
* Function to generate normally distributed random variable using the Box-Muller method
* params:
* double mean, double standard deviation
* returns:
* normally distributed random variable
*/
double norm(double mean, double std_dev) {
double u, r, theta; // Variables for Box-Muller method
double x; // Normal(0, 1) rv
double norm_rv; // The adjusted normal rv
//Generate u
u = 0.0;
while (u == 0.0)
u = rand_val(0);
// Compute r
r = sqrt(-2.0 * log(u));
// Generate theta
theta = 0.0;
while (theta == 0.0)
theta = 2.0 * M_PI * rand_val(0);
// Generate x value
x = r * cos(theta);
// TODO: extend for complex random numbers (compute sin value)
// Adjust x value for specified mean and variance
norm_rv = (x * std_dev) + mean;
// Return the normally distributed RV value
return(norm_rv);
}
float maxf(float*a, int len) {
float max;
for (int m=0; m<len; m++) {
max = MAX(a[m], max);
}
return max;
}
void rtbfFillNetDataBlockBoxMullerRand(complex16_t *h, RTBFContext *context) {
int Nm = context->mcnt_per_data_block;
int Nf = context->nfengines;
int Nt = context->time_per_mcnt;
int Nc = context->freq_per_xid;
int Ni = context->input_per_fid;
// fill network ordered block of array data
for (int m=0; m<Nm; m++) { // mcnt
for (int f=0; f<Nf; f++) { // fid
for (int t=0; t<Nt; t++) { // time
for (int c=0; c<Nc; c++) { // frequency
for (int i=0; i<Ni; i++) { // elements per fid
int midx = m*(Ni*Nc*Nt*Nf); // number of complex items per mcount
int fidx = f*(Ni*Nc*Nt); // number of complex items per fid
int eidx = i + c*Ni + t*(Nc*Ni) + fidx + midx;
//printf("%u, %u, %u, %u, %u (%u)\n", m, f, t, c, i, eidx);
h[eidx].re = roundf(norm(0,1));
h[eidx].im = roundf(norm(0,1));
}
}
}
}
}
return;
}
void rtbf_OP_COR_FillDataBlock(complex16_t *array_input_data, RTBFContext *context) {
static int blockcounter = -1;
int Nm = context->mcnt_per_data_block;
int Nf = context->nfengines;
int Nt = context->time_per_mcnt;
int Nc = context->freq_per_xid;
int Ni = context->input_per_fid;
blockcounter++;
// fill network ordered block of array data
for (int m=0; m<Nm; m++) { // mcnt
for (int f=0; f<Nf; f++) { // fid
for (int t=0; t<Nt; t++) { // time
for (int c=0; c<Nc; c++) { // frequency
for (int i=0; i<Ni; i++) { // elements per fid
int midx = m*(Ni*Nc*Nt*Nf); // number of complex items per mcount
int fidx = f*(Ni*Nc*Nt); // number of complex items per fid
int eidx = i + c*Ni + t*(Nc*Ni) + fidx + midx;
//printf("%u, %u, %u, %u, %u (%u)\n", m, f, t, c, i, eidx);
if (i==0) {
array_input_data[eidx].re = 1;
array_input_data[eidx].im = 0;
} else {
array_input_data[eidx].re = 0;
array_input_data[eidx].im = 0;
}
//array_input_data[eidx].re = 1.0 + blockcounter;
//array_input_data[eidx].im = 0.0;
}
}
}
}
}
}
// TODO a lot of similarity with rtbf_demo.c could keep seperate or add a few more
// if statements to rtbf_demo.c
int main(int argc, char* argv[]) {
int opt;
int verbose = 0;
int mode = 4;
int blocks = 1;
int dumpOn = 1; // dump on each integration
RTBFContext context;
while ((opt = getopt(argc, argv, "m:b:c:w:vh")) != -1) {
switch (opt) {
case 'm':
// Set operational mode
mode = strtoul(optarg, NULL, 0);
if(mode < 0) {
fprintf(stderr, "mode must be zero, or greater\n");
return 1;
}
break;
case 'b':
// Set number of blocks to use
blocks = strtoul(optarg, NULL, 0);
if (blocks < 1) {
fprintf(stderr, "blocks must be positive\n");
return 1;
}
break;
case 'c':
// Set integration count interval
dumpOn = strtoul(optarg, NULL, 0);
if (dumpOn < 1) {
fprintf(stderr, "integration count must be positive\n");
return 1;
}
break;
case 'v':
// be verbose in checking output
verbose = 1;
break;
default:
fprintf(stderr,
"Usage: %s [options]\n"
"Options:\n"
" -m operational mode correlator mode (default 4)\n"
" operational mode values are:\n"
" 4 (integrated correlator dumps)\n"
" -b number of blocks the number of rtbf data blocks\n"
" to generate and calls made to\n"
" the rtbf\n"
" -c integration count the number data blocks to integrate\n"
" in one integration dump interval (1)\n"
" -v be more verbose in the output\n"
" for RAW mode (0) this will plot\n"
" but in other modes is a dump of\n"
" the output data. Not recommended\n"
" to use except on small library\n"
" configurations\n"
" -h Show this message\n",
argv[0]);
exit(EXIT_FAILURE);
}
}
RTBFInfo rtbf_info;
rtbfInfo(&rtbf_info);
int Ni = INPUT_PER_FID;
int Nf = NUM_FIDS;
int Nt = TIME_PER_PKT;
int Nm = MCNT_PER_BLK;
int Ne = rtbf_info.narray_elements; // elements in array
int Nc = rtbf_info.nbin; // frequency channels (bins)
int NT = rtbf_info.ntime; // time per block
long long unsigned int S = rtbf_info.array_len; // number of complex elements in a block size
// validate test xb-engine system parameters with library compiled parameters
if (Ni*Nf != Ne) {
printf("Element configuration error:\n"
" Testing with %u elements in the array. RTBF is compiled to only support %u\n",
Ni*Nf, Ne);
return -1;
}
if (Nm*Nt != NT) {
printf("Time configuration error:\n"
" Testing with %u time samples per GPU copy. RTBF is compiled to only support %u\n",
Nm*Nt, NT);
return -1;
}
if (Ne*Nc*NT != S) {
printf("Input data block size configuration error:\n"
" Testing with a total of %u complex entries in a GPU data block. RTBF is\n"
" compiled to only support %llu\n",
Ne*Nc*NT, S);
return -1;
}
complex16_t *d_cx_ptr = (complex16_t*) malloc(blocks*rtbf_info.array_len*sizeof(complex16_t));
void *rtbf_out_ptr = NULL;
size_t output_len;
switch (mode) {
case BEAM_OP_COR:
rtbf_out_ptr = (float*) malloc(blocks * (Ne*Ne*Nc) * 2*sizeof(float));
output_len = blocks * (Ne*Ne*Nc) * 2*sizeof(float);
break;
default:
printf("error initializing correlator extension of rtbf, unrecognized operational mode\n");
return -1;
}
// Initialize beamformer
context.operational_mode = mode;
context.ibuf_array_h = d_cx_ptr;
context.ibuf_array_len = blocks*rtbf_info.array_len*sizeof(complex16_t);
context.obuf_beamformed_h = rtbf_out_ptr;
context.obuf_beamformed_len = output_len;
context.mcnt_per_data_block = Nm;
context.nfengines = Nf;
context.time_per_mcnt = Nt;
context.freq_per_xid = Nc;
context.input_per_fid = Ni;
int rtbf_status = RTBF_OK;
rtbf_status = init_correlator(&context);
if (rtbf_status != RTBF_OK) {
printf("rtbf failure with code: %d\n", rtbf_status);
free(d_cx_ptr);
free(rtbf_out_ptr);
rtbfCleanup(&context);
return 0;
}
// Run beamformer (really the correlator)
struct timespec start, stop;
double total, per_call, max_bw, gbps;
complex16_t *update_d_cx_ptr = NULL;
void *update_b_out_ptr = NULL;
int doDump = 0;
rand_val(1); // init seed on RNG
for (int b=1; b<=blocks; b++) {
// prepare block offsets
context.input_offset = (b-1)*(rtbf_info.array_len);
context.output_offset = (b-1)*(Ne*Ne*Nc);
update_d_cx_ptr = d_cx_ptr + context.input_offset;
update_b_out_ptr = ((float*) rtbf_out_ptr) + 2*context.output_offset;
// fill data block
rtbfFillNetDataBlockBoxMullerRand(update_d_cx_ptr, &context);
// determine if this is a dump period
doDump = ((b % dumpOn) == 0 || (b==blocks)) ? 1 : 0;
// correlate!
clock_gettime(CLOCK_MONOTONIC, &start);
run_correlator(&context, doDump);
clock_gettime(CLOCK_MONOTONIC, &stop);
total += ELAPSED_MS(start, stop);
printf("run_correlator() time = %f ms\n", (float) ELAPSED_MS(start,stop));
// check
if (doDump && verbose && mode==BEAM_OP_COR) {
cpgopen("/xwin"); // docs say cpgopen should be used over beg now
cpgask(1);
// used to select the beam/channel
int chan_index = 0;
// for getting, setting, and using cursor info
float cx, cy;
char pressed;
// custom color table (see PGCTAB doc for how to define the table)
float L[7] = {0.00, 0.18, 0.36, 0.55, 0.72, 0.87, 1.00};
float R[7] = {0.10, 0.34, 0.59, 0.89, 1.00, 1.00, 1.00};
float G[7] = {0.00, 0.00, 0.20, 0.20, 0.43, 0.72, 1.00};
float B[7] = {0.23, 0.60, 0.47, 0.28, 0.00, 0.00, 0.76};
// doc said contra=1.0 bright=0.5 are nominal, but bright=0.3 looked best in this case
float contra = 1.0;
float bright = 0.3;
cpgctab(L, R, G, B, 7, contra, bright);
// setup env and labels before plotting
cpgenv(0.5, Ne+0.5, Ne+0.5, 0.5, 0, 1);
float tr[6]= {0.00, 1.00, 0.00, 0.00, 0.00, 1.00};
// TODO reproduce label for correlator case
//cpglab("theta", "dB arb. units", "beam patterns");
// array data
#define zabs(z) (sqrt(z.x*z.x+z.y*z.y))
float *bin_matrix_power = (float *) malloc((Ne*Ne) *sizeof(float));
float *pgmat = (float *) malloc((Ne*Ne) *sizeof(float));
// the approach here is to loop accepting input, and redraw the line instead
// of starting a new window (e.g., call to cpgenv)
printf("press 0-8, or x to end\n");
do {
// ask for cursor position (could have used cpgcurs, but this method
// provides the "mode" to put a vertical line through the cursor)
cpgcurs(&cx, &cy, &pressed);
//printf("cx=%f, cy=%f, pressed=%c\n", cx, cy, pressed);
if (pressed >= '0' && pressed <= '8') {
chan_index = atoi(&pressed);
}
// take pressed input
// copy the selected frequency bin into `bin_matrix_power`
for (int i=0; i<Ne; i++) {
for (int j=0; j<Ne; j++) {
int idx = j+i*Ne + chan_index*(Ne*Ne);
int oidx = j+i*Ne; // bin_matrix is only one channel long
float *dout = (float *) update_b_out_ptr; // cast from void* to float*
complex float y = CMPLXF(dout[2*idx], dout[2*idx+1]);
bin_matrix_power[oidx] = 10*log10((creal(conj(y)*y)));
}
}
// get max value for scaling image color
float max = maxf(bin_matrix_power, Ne*Ne);
max = (max==0)? 1 : max;
printf("max: %f\n", max);
// TODO does this approach work for cpimag???
// draw by redrawing the line, this was similar to what was done to animate
// from pgdemo11.f and my example analyzer codes
//
// TODO: pixels on edge are erased, need to do something like pgdemo11.f (and
// other demos) that slightly pad the edge so that the axis and labels are
// not erased
cpgbbuf();
cpgsci(0);
cpgimag(pgmat, Ne, Ne, 1, Ne, 1, Ne, 0.0, max, tr);
cpgsci(1);
for (int p=0; p<Ne*Ne; p++) {
pgmat[p] = bin_matrix_power[p];
}
cpgsci(1);
cpgimag(pgmat, Ne, Ne, 1, Ne, 1, Ne, 0.0, max, tr);
cpgebuf();
} while (pressed!='x');
free(pgmat);
free(bin_matrix_power);
cpgend();
}
// clear integration buffer on dump
if (doDump) {
doDump = 0;
rtbfClearCorrelatorIntegrationBuffer(&context);
}
}
printf("data_block_size=%lld, Rx_size=%d\n", rtbf_info.array_len, Ne*Ne*Nc);
double count = blocks;
per_call = total/count;
max_bw = rtbf_info.ntime*rtbf_info.nbin/per_call/1000; // MHz
// using the full ibuf length, the number of blocks needs to be divided out as
// this is what is transfered to the gpu in a single call to the rtbf
//gbps = ((float)(8 * (context.ibuf_array_len/count) * count)) / total / 1e6; // Gbps
gbps = ((float)(8 * (rtbf_info.array_len * sizeof(complex16_t) + Ne*Ne*Nc*2*sizeof(float))* count)) / total / 1e6; // Gbps
printf("Elapsed time %.6f ms total, %.6f ms/call average\n",
total, per_call);
printf("Theoretical BW_max %.3f MHz, throughput %.3f Gbps\n",
max_bw, gbps);
// cleanup and exit
free(d_cx_ptr);
free(rtbf_out_ptr);
rtbfCleanup(&context);
return 0;
}
......@@ -275,6 +275,235 @@ int update_weights(RTBFContext* context, float* weights) {
return rtbf_status;
}
// initialize the correlator
int init_correlator(RTBFContext* context) {
switch(context->operational_mode) {
case BEAM_OP_COR:
break;
default:
printf("error initializing correlator component of rtbf, unrecognized operational mode\n");
return RTBF_INVALID_INIT;
}
RTBFInternalContext *internal = (RTBFInternalContext *) malloc(sizeof(RTBFInternalContext));
if (internal == NULL) {
return RTBF_OUTOFMEM_ERROR;
}
context->internal = internal;
long long unsigned int array_len = compiletime_info.array_len;
long long unsigned int weight_len = compiletime_info.weight_len;
long long unsigned int beamformed_len = compiletime_info.beamformed_len;
long long unsigned int beam_power_len = compiletime_info.beam_power_len;
long long unsigned int sti_stokes_len = compiletime_info.sti_stokes_len;
long long unsigned int Rx_matrix_len = compiletime_info.narray_elements *
compiletime_info.narray_elements *
compiletime_info.nbin;
// register input and output buffers
// TODO if extension code works make common function for reuse
// TODO make sure no "beamformer" spefic code (i.e., make sure OK to apply to an Rx
//
// Note that in the context of the correlator that `obuf_len` would more
// properly be considered `matrix_len`
if (context->ibuf_array_h && context->obuf_beamformed_h) {
printf("registering user allocated memory with CUDA\n");
// Register caller-allocated host memory with CUDA.
// Round address down to nearest page_size boundary
uintptr_t ptr_ibuf = (uintptr_t)context->ibuf_array_h;
uintptr_t ptr_aligned_ibuf = ptr_ibuf - (ptr_ibuf % page_size);
uintptr_t ptr_obuf = (uintptr_t)context->obuf_beamformed_h;
uintptr_t ptr_aligned_obuf = ptr_obuf - (ptr_obuf % page_size);
// Compute length starting with compile time requirement
size_t ibuf_len = context->ibuf_array_len;
size_t obuf_len = context->obuf_beamformed_len;
// TODO Verify that lengths are at least
// TODO would have different lengths to verify in context of a Rx
// "compiletime_info.array_len*sizeof(complex16_t)"
// and as long as the selected operational mode, for raw beam op this would
// be
// "compiletime_info.beamformed_len *2*sizeof(float)"
// and for sti mode this is
// "compiletime_info.sti_stokes_len *2*sizeof(float"
// note, that the 2 is still used in stokes output len so that right now the
// user can pass in the right stokes output len and still work.
// Add in any rounding that was done to the input pointer
ibuf_len += (ptr_ibuf - ptr_aligned_ibuf);
// Round length up to next multiple of page size
ibuf_len = (ibuf_len+page_size-1) / page_size * page_size;
obuf_len += (ptr_obuf - ptr_aligned_obuf);
// Round length up to next multiple of page size
obuf_len = (obuf_len+page_size-1) / page_size * page_size;
printf("ibuf aligned context->ibuf_array_h = 0x%lx\n", ptr_aligned_ibuf);
printf("length = %lx\n", ibuf_len);
printf("obuf aligned context->obuf_beamformed_h = 0x%lx\n", ptr_aligned_obuf);
printf("length = %lx\n", obuf_len);
if ( ptr_aligned_ibuf <= (ptr_aligned_obuf+obuf_len-1) &&
ptr_aligned_obuf <= (ptr_aligned_ibuf+ibuf_len-1)) {
// clamping the individual memory regions results in an overlap
// and the regions are two close to separate. Clamp instead the whole
// region
uintptr_t ptr_aligned_min;
size_t length;
printf("there is a memory collision!!\n");
if (ptr_aligned_ibuf < ptr_aligned_obuf) {
ptr_aligned_min = ptr_aligned_ibuf;
length = (ptr_aligned_obuf+obuf_len) - ptr_aligned_min;
} else {
ptr_aligned_min = ptr_aligned_obuf;
length = (ptr_aligned_ibuf+ibuf_len) - ptr_aligned_min;
}
printf("aligned buf min = %lx\n", ptr_aligned_min);
printf("length = %lx\n", length);
cudaHostRegister((void *) ptr_aligned_min, length, 0);
internal->unregister_ibuf_array_h = (complex16_t*) ptr_aligned_min;
internal->unregister_obuf_beamformed_h = NULL;
checkCudaError();
} else {
printf("there is *not* a memory collision\n");
// there is no overlap of the memory regions and we pin them individually
cudaHostRegister((void *) ptr_aligned_ibuf, ibuf_len, 0);
cudaHostRegister((void *) ptr_aligned_obuf, obuf_len, 0);
internal->unregister_ibuf_array_h = (complex16_t*) ptr_aligned_ibuf;
internal->unregister_obuf_beamformed_h = (float *) ptr_aligned_obuf;
checkCudaError();
}
} else {
// for now, user must initialize rtbf with user allocated input and output array.
return RTBF_INVALID_INIT;
}
// initialize offset values to zero TODO redundant (common for correlator and beamformer)
context->input_offset = 0;
context->output_offset = 0;
// create cublas handle TODO redundant (common) for correlator and beamformer
cublasStatus_t blas_stat = CUBLAS_STATUS_SUCCESS;
blas_stat = cublasCreate(&(internal->handle));
checkCublasError(blas_stat);
// setup internal context blas dimensions
// TODO check these dimensions then explain how they work out because they do
// not exactly make sense with ntime not showing up in B anywhere since we are
// doing an auto correlation and B should be =A. I think what it shold be is
// that since that cublasCgemm takes in op(B) dimensions for these arguments
// they need to be such that rows_b is ntime and cols_b is nelements (i.e.,
// conjugate transposed). But what is confusing is that then setting the
// leading dimension of b since nrows_B
internal->nr_rows_A = compiletime_info.narray_elements;
internal->nr_cols_A = compiletime_info.ntime;
internal->nr_rows_B = compiletime_info.ntime;//internal->nr_rows_A;
internal->nr_cols_B = compiletime_info.narray_elements;//internal->nr_cols_A;
internal->nr_rows_C = compiletime_info.narray_elements;
internal->nr_cols_C = compiletime_info.narray_elements;
internal->batchCount = compiletime_info.nbin;
// allocate device memory for transpose, beamformer, and post-beamform operation
// TODO get rid of these, temp used to fill in "d_beamformed" and "obuf_D2H_len" lengths
cudaMalloc((void **) &(internal->d_restruct_in), array_len*sizeof(complex16_t)); // input data
cudaMalloc((void **) &(internal->d_array_data), array_len*sizeof(cuComplex)); // transpose
cudaMalloc((void **) &(internal->d_beamformed), Rx_matrix_len*sizeof(cuComplex)); // Rx output
checkCudaError();
// setup input and output information based on operational mode
internal->ibuf_H2D_len = array_len*sizeof(complex16_t);
// TODO place closer attention to differences here with Rx. But in reality it
// should function similar to BEAM_OP_RAW because outputs are complex and
// there is no additional processing
// The mode better only be BEAM_OP_COR
if (context->operational_mode == BEAM_OP_COR) {
internal->obuf_D2H_len = Rx_matrix_len * sizeof(cuComplex);
internal->post_beamform_func = NULL; // skip post-processing kernel, dump beamformed outputs
}
cudaMalloc((void **) &(internal->d_post_beamform), internal->obuf_D2H_len);
checkCudaError();
if (context->operational_mode == BEAM_OP_COR) {
internal->d_output = (float *)internal->d_beamformed;
}
// local dimension variable references to use isntead of "internal->"
int nrows_A, ncols_A, nrows_B, ncols_B, nrows_C, ncols_C;
int batchCount;
nrows_A = internal->nr_rows_A;
ncols_A = internal->nr_cols_A;
nrows_B = internal->nr_rows_B;
ncols_B = internal->nr_cols_B;
nrows_C = internal->nr_rows_C;
ncols_C = internal->nr_cols_C;
batchCount = internal->batchCount;
// TODO cleanup also redundant (common) to beamformer from here until free of temp memory
// allocate 3 arrays on host, used to allocated device arrays needed for gemmBatched.
const cuComplex** h_arr_A = NULL; // array data
const cuComplex** h_arr_B = NULL;// for array data and possibly redundant since auto corr?
cuComplex** h_arr_C = NULL; // accumulation result
cudaMallocHost((void **) &h_arr_A, batchCount*sizeof(cuComplex*));
cudaMallocHost((void **) &h_arr_B, batchCount*sizeof(cuComplex*));
cudaMallocHost((void **) &h_arr_C, batchCount*sizeof(cuComplex*));
checkCudaError();
// TODO NOTE THE CHANGE HERE A is now data, weights not used and B is really
// redundant since the call in correlate() to cublasCgemmbatched() uses
// d_arr_A twice.
// TODO I wonder though if we do use B here and since nrows_b and ncols_B are
// set to op(B) if this casuses errors OR if we can not call cgemmbatched with
// OP_C or what goes on here
// allocate memory for each batch in an array.
for (int i=0; i<batchCount; i++) {
h_arr_A[i] = internal->d_array_data + i*nrows_A*ncols_A;
h_arr_B[i] = internal->d_array_data + i*nrows_B*ncols_B; // redundant, not used for correlator
h_arr_C[i] = internal->d_beamformed + i*nrows_C*ncols_C;
}
// allocate memory to arrays on device.
cudaMalloc((void **) &(internal->d_arr_A), batchCount*sizeof(cuComplex*));
cudaMalloc((void **) &(internal->d_arr_B), batchCount*sizeof(cuComplex*));
cudaMalloc((void **) &(internal->d_arr_C), batchCount*sizeof(cuComplex*));
checkCudaError();
// copy memory from host to device.
cudaMemcpy(internal->d_arr_A, h_arr_A, batchCount*sizeof(cuComplex*), cudaMemcpyHostToDevice);
cudaMemcpy(internal->d_arr_B, h_arr_B, batchCount*sizeof(cuComplex*), cudaMemcpyHostToDevice);
cudaMemcpy(internal->d_arr_C, h_arr_C, batchCount*sizeof(cuComplex*), cudaMemcpyHostToDevice);
checkCudaError();
// free the temporary memory
cudaFreeHost(h_arr_A);
cudaFreeHost(h_arr_B);
cudaFreeHost(h_arr_C);
checkCudaError();
// initialize beamformer metadata structure
// TODO how should metadata be handled here? Should we change?
rtbf_metadata *md = &(context->metadata);
md->xid = 0;
md->offsets = (float *) malloc(compiletime_info.nbeam*sizeof(float));
md->cal_filename = (char *) malloc(CHARLEN*sizeof(char));
md->algorithm = (char *) malloc(CHARLEN*sizeof(char));
md->weight_filename = (char *) malloc(CHARLEN*sizeof(char));
if (!(md->offsets) || !(md->cal_filename) ||
!(md->algorithm) || !(md->weight_filename))
{
return RTBF_OUTOFMEM_ERROR;
}
return RTBF_OK;
}
// initialize the rtbf
// TODO return if already initialized
// TODO handle clean up if return early due to an error
......@@ -494,6 +723,33 @@ int init_beamformer(RTBFContext* context) {
return RTBF_OK;
}
const static cuComplex X_alf = make_cuComplex(1, 0);
const static cuComplex X_bet = make_cuComplex(1, 0);
int correlate(RTBFInternalContext* internal) {
cublasStatus_t blas_stat = CUBLAS_STATUS_SUCCESS;
cublasCgemmBatched(
internal->handle,
CUBLAS_OP_N,
CUBLAS_OP_C,
internal->nr_rows_A, //Ne//nrows_A// Number of rows in matrix A and C. TODO THIS IS op(A)
internal->nr_cols_B, //Ne//ncols_B// Number of columns in matrix B and C. TODO THIS IS op(B)
internal->nr_cols_A, //Nt//ncols_A// Number of columns and rows in matrix A and B respectively.TODO this is op(A) and op(B)
&X_alf,
internal->d_arr_A,
internal->nr_rows_A, //Ne//nrows_A// Leading dimension of each batch or matrix in array A (lda).TODO better explain
internal->d_arr_A,
internal->nr_cols_B, //Ne//nrows_B// Leading dimension of each batch or matrix in array B (ldb).TODO better explain
&X_bet,
internal->d_arr_C,
internal->nr_rows_C, //Ne//nrows_C// Leading dimension of each batch or matrix in array C (ldc).TODO better explain
internal->batchCount);
checkCublasError(blas_stat);
return RTBF_OK;
}
const static cuComplex alf = make_cuComplex(1, 0);
const static cuComplex bet = make_cuComplex(0, 0);
......@@ -668,6 +924,85 @@ void beamformed_spectra(cuComplex* data_in, float* data_out) {
return;
}
// run_correlator was very common reusing most of this from `run_beamformer()`.
// `run_beamformer()` could really be extended to include BEAM_OP_COR as it is
// so similar to BEAM_OP_RAW in setup. However, the most significant difference
// is needing to accomodate `doDump` and to avoid an unnecessary performance hit
// to check for a dump in any beamformer mode it is better to have a standalone
// implementation function. The only way I think this to really have a generic
// `run_*()` function is that the library needs to support arbitrary function
// chaning through a linked list of function/kernel pointers. The pipeline is
// then configured as this list.
int run_correlator(RTBFContext* context, int doDump) {
int rtbf_status = RTBF_OK;
RTBFInternalContext* internal = (RTBFInternalContext *)context->internal;
if (internal == NULL) {
rtbf_status = RTBF_NOT_INITIALIZED;
return rtbf_status;
}
// still need to check to make sure this is not called if in a different OP mode
switch (context->operational_mode) {
case BEAM_OP_COR:
break;
default:
rtbf_status = RTBF_FATAL;
return rtbf_status;
}
int Nm = context->mcnt_per_data_block;
int Nf = context->nfengines;
int Nt = context->time_per_mcnt;
int Nc = context->freq_per_xid;
int Ni = context->input_per_fid;
dim3 gridDim_transpose(Nm, Nf, Nt);
dim3 blockDim_transpose(Ni, Nc, 1);
// reference to host input and output buffers and adjust for current processing offset
complex16_t *ibuf_hp = context->ibuf_array_h;
ibuf_hp += context->input_offset;
void* obuf_hp = context->obuf_beamformed_h;
obuf_hp = ((cuComplex*) obuf_hp) + context->output_offset;
// copy memory onto device
cudaMemcpy(internal->d_restruct_in, ibuf_hp, internal->ibuf_H2D_len, cudaMemcpyHostToDevice);
checkCudaError();
// restructure data (transpose/corner turn) where bin is the slowest moving dimension for cublasCgemmBatched
data_restructure<<<gridDim_transpose, blockDim_transpose>>>(internal->d_restruct_in, internal->d_array_data);
checkCudaError();
// correlate
rtbf_status = correlate(internal);
checkCudaError();
// post-beamform kernel
/* no kernel for correlator, performance penalty to always make a check */
// copy output data from device to host
if (doDump) {
cudaMemcpy(obuf_hp, internal->d_output, internal->obuf_D2H_len, cudaMemcpyDeviceToHost);
checkCudaError();
}
return rtbf_status;
}
int rtbfClearCorrelatorIntegrationBuffer(RTBFContext *context) {
int rtbf_status = RTBF_OK;
RTBFInternalContext* internal = (RTBFInternalContext *)context->internal;
if (internal == NULL) {
rtbf_status = RTBF_NOT_INITIALIZED;
return rtbf_status;
}
cudaMemset(internal->d_output, '\0', internal->obuf_D2H_len);
checkCudaError();
return rtbf_status;
}
int run_beamformer(RTBFContext* context) {
int rtbf_status = RTBF_OK;
......@@ -793,6 +1128,7 @@ void rtbfCleanup(RTBFContext* context) {
}
// free metadata resources
// TODO this assumes the md is not null, would cause segfault
rtbf_metadata* md = &(context->metadata);
if (md->offsets != NULL) {
free(md->offsets);
......
......@@ -143,7 +143,9 @@ typedef struct rtbf_metadata_struct {
#define BEAM_OP_RAW 0 // raw beamformer iq outputs
#define BEAM_OP_STI 1 // full-stokes spectrometer outputs w/ short time integration (STI)
#define BEAM_OP_PSD 2 // beamformed power spectrometer
// future mode
// experimental extension
#define BEAM_OP_COR 4 // correlator
// future mode (not implemented)
#define BEAM_OP_RFI 3 // adaptive real-time RFI cancellation
typedef struct RTBFContextStruct {
......@@ -254,6 +256,11 @@ int load_weights_from_file(RTBFContext *context, char *weight_filename);
// integration.
int run_beamformer(RTBFContext *context);
// TODO experimental extension to have correlator included in RTBF
int init_correlator(RTBFContext *context);
int run_correlator(RTBFContext *context, int doDump);
int rtbfClearCorrelatorIntegrationBuffer(RTBFContext *context);
// Free up resources used by the rtbf and reset the RTBFContext.
//
// This is to be called prior to makeing a change to the operational mode and
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment