create a correlator extension to the rtbf

right now the correlator is implemented as an 'experimental extension' where the functionality is provided by leveraging the infrastructure provided by the rtbf. In most cases the correlator operates similar to the `BEAM_OP_RAW` mode of the beamformer with the exception of the output size and the inputs to a call to `cublasCgemmbatched()`. `init_correlator()` is functionally similar to `init_beamformer` except only relevant to `BEAM_OP_CORR` where the cublas context is constructed to prepare the data dimensions for the call to `correlate()`. The `run_correlator()` function is very similar also to `run_beamformer()` and looks again like when running `BEAM_OP_RAW` because it matches the requirements of copy onto device -> transpose -> matrix kernel (correlation instead of beamform) -> copy off teh device. The copy of device is conditional however. `run_correlator()` could really be operated inside `run_beamformer()` but to not take a penalty in a hit on the if statment that is not needed in the beamformer (because a copy off is always required) it would be better for now to just have a seperate method. The `corr_demo.c` is also functionally modeled the same as `rtbf_demo.c` except to just demonstrate the functions of correlation and conditionaly integration dumps.

create a correlator extension to the rtbf
right now the correlator is implemented as an 'experimental extension' where the functionality is provided by leveraging the infrastructure provided by the rtbf. In most cases the correlator operates similar to the `BEAM_OP_RAW` mode of the beamformer with the exception of the output size and the inputs to a call to `cublasCgemmbatched()`. `init_correlator()` is functionally similar to `init_beamformer` except only relevant to `BEAM_OP_CORR` where the cublas context is constructed to prepare the data dimensions for the call to `correlate()`. The `run_correlator()` function is very similar also to `run_beamformer()` and looks again like when running `BEAM_OP_RAW` because it matches the requirements of copy onto device -> transpose -> matrix kernel (correlation instead of beamform) -> copy off teh device. The copy of device is conditional however. `run_correlator()` could really be operated inside `run_beamformer()` but to not take a penalty in a hit on the if statment that is not needed in the beamformer (because a copy off is always required) it would be better for now to just have a seperate method. The `corr_demo.c` is also functionally modeled the same as `rtbf_demo.c` except to just demonstrate the functions of correlation and conditionaly integration dumps.
66e66ca4 · Mitch Burnett · b442f5be · 66e66ca4 · 66e66ca4 · 66e66ca4
Commit 66e66ca4 authored 2 years ago by Mitch Burnett
Hide whitespace changes
Inline Side-by-side

Showing

with 807 additions and 1 deletion
+807 -1
--- a/meson.build
+++ b/meson.build
@@ -32,6 +32,7 @@ rtbf_srcs = ['./src/cublas_beamformer.cu']

 rtbf_info_src = ['./src/rtbf_info.c']
 demo_rtbf_src = ['./src/rtbf_demo.c']
+demo_corr_src = ['./src/corr_demo.c']

 rtbf_nantenna = get_option('RTBF_NANTENNA')
 rtbf_nbin = get_option('RTBF_NBIN')
@@ -86,6 +87,12 @@ if not shared_only
    dependencies: [cuda_dep, m_dep, plot_deps],
    install: true)

+  executable(
+    'democorr', demo_corr_src,
+    link_with: rtbf_lib,
+    dependencies: [cuda_dep, m_dep, plot_deps],
+    install: true)
+
  executable(
    'rawmode', './src/unit_beam_op_raw.c',
    link_with: rtbf_lib,

--- a/src/corr_demo.c
+++ b/src/corr_demo.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+
+#include <math.h>
+#include <complex.h>
+
+#include "cublas_beamformer.h"
+#include "cpgplot.h"
+
+#define ELAPSED_MS(start,stop) \
+  ((((int64_t)stop.tv_sec-start.tv_sec)*1000*1000*1000+(stop.tv_nsec-start.tv_nsec))/1e6)
+
+//#define RUN_UNIT_TEST
+#ifdef RUN_UNIT_TEST
+
+// TODO figure out how to get these into the program without tons of args
+// unit test definition
+#define INPUT_PER_FID 4      // inputs per fid
+#define NUM_FIDS      2      // number of fids
+#define TIME_PER_PKT  10     // time per network packet
+#define MCNT_PER_BLK  40     // mcounts per block
+
+#else
+// xb system definition
+#define INPUT_PER_FID 12     // inputs per fid
+#define NUM_FIDS      12     // number of fids
+#define TIME_PER_PKT  14     // time per network packet
+#define MCNT_PER_BLK  336    // mcounts per block
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+
+/*
+ * Multiplicative LCG for generating uniform(0.0, 1.0) random numbers
+ *   x_n = 7^5*x_(n-1)mod(2^31 - 1)
+ *   With x seeded to 1 the 10000th x value should be 1043618065
+ *   From R. Jain, "The Art of Computer Systems Performance Analysis,"
+ *   John Wiley & Sons, 1991. (Page 443, Figure 26.2)
+ */
+double rand_val(int seed) {
+  const long  a =      16807;  // Multiplier
+  const long  m = 2147483647;  // Modulus
+  const long  q =     127773;  // m div a
+  const long  r =       2836;  // m mod a
+  static long x;               // Random int value
+  long        x_div_q;         // x divided by q
+  long        x_mod_q;         // x modulo q
+  long        x_new;           // New x value
+
+  // Set the seed if argument is non-zero and then return zero
+  if (seed > 0) {
+    x = seed;
+    return(0.0);
+  }
+
+  // RNG using integer arithmetic
+  x_div_q = x / q;
+  x_mod_q = x % q;
+  x_new = (a * x_mod_q) - (r * x_div_q);
+  if (x_new > 0)
+    x = x_new;
+  else
+    x = x_new + m;
+
+  // Return a random value between 0.0 and 1.0
+  return((double) x / m);
+}
+
+/*
+ * Function to generate normally distributed random variable using the Box-Muller method
+ *  params:
+ *    double mean, double standard deviation
+ *  returns:
+ *    normally distributed random variable
+ */
+double norm(double mean, double std_dev) {
+  double   u, r, theta;           // Variables for Box-Muller method
+  double   x;                     // Normal(0, 1) rv
+  double   norm_rv;               // The adjusted normal rv
+
+  //Generate u
+  u = 0.0;
+  while (u == 0.0)
+    u = rand_val(0);
+
+  // Compute r
+  r = sqrt(-2.0 * log(u));
+
+  // Generate theta
+  theta = 0.0;
+  while (theta == 0.0)
+    theta = 2.0 * M_PI * rand_val(0);
+
+  // Generate x value
+  x = r * cos(theta);
+  // TODO: extend for complex random numbers (compute sin value)
+
+  // Adjust x value for specified mean and variance
+  norm_rv = (x * std_dev) + mean;
+
+  // Return the normally distributed RV value
+  return(norm_rv);
+}
+
+float maxf(float*a, int len) {
+  float max;
+  for (int m=0; m<len; m++) {
+    max = MAX(a[m], max);
+  }
+  return max;
+}
+
+void rtbfFillNetDataBlockBoxMullerRand(complex16_t *h, RTBFContext *context) {
+  int Nm = context->mcnt_per_data_block;
+  int Nf = context->nfengines;
+  int Nt = context->time_per_mcnt;
+  int Nc = context->freq_per_xid;
+  int Ni = context->input_per_fid;
+
+  // fill network ordered block of array data
+  for (int m=0; m<Nm; m++) {         // mcnt
+    for (int f=0; f<Nf; f++) {       // fid
+      for (int t=0; t<Nt; t++) {     // time
+        for (int c=0; c<Nc; c++) {   // frequency
+          for (int i=0; i<Ni; i++) { // elements per fid
+            int midx = m*(Ni*Nc*Nt*Nf);        // number of complex items per mcount
+            int fidx = f*(Ni*Nc*Nt);           // number of complex items per fid
+            int eidx = i + c*Ni + t*(Nc*Ni) + fidx + midx;
+            //printf("%u, %u, %u, %u, %u (%u)\n", m, f, t, c, i, eidx);
+            h[eidx].re = roundf(norm(0,1));
+            h[eidx].im = roundf(norm(0,1));
+          }
+        }
+      }
+    }
+  }
+  return;
+}
+ 
+void rtbf_OP_COR_FillDataBlock(complex16_t *array_input_data, RTBFContext *context) {
+  static int blockcounter = -1;
+  int Nm = context->mcnt_per_data_block;
+  int Nf = context->nfengines;
+  int Nt = context->time_per_mcnt;
+  int Nc = context->freq_per_xid;
+  int Ni = context->input_per_fid;
+
+  blockcounter++;
+  // fill network ordered block of array data
+  for (int m=0; m<Nm; m++) {         // mcnt
+    for (int f=0; f<Nf; f++) {       // fid
+      for (int t=0; t<Nt; t++) {     // time
+        for (int c=0; c<Nc; c++) {   // frequency
+          for (int i=0; i<Ni; i++) { // elements per fid
+            int midx = m*(Ni*Nc*Nt*Nf);        // number of complex items per mcount
+            int fidx = f*(Ni*Nc*Nt);           // number of complex items per fid
+            int eidx = i + c*Ni + t*(Nc*Ni) + fidx + midx;
+            //printf("%u, %u, %u, %u, %u (%u)\n", m, f, t, c, i, eidx);
+            if (i==0) {
+              array_input_data[eidx].re = 1;
+              array_input_data[eidx].im = 0;
+            } else {
+              array_input_data[eidx].re = 0;
+              array_input_data[eidx].im = 0;
+            }
+            //array_input_data[eidx].re = 1.0 + blockcounter;
+            //array_input_data[eidx].im = 0.0;
+          }
+        }
+      }
+    }
+  }
+}
+
+// TODO a lot of similarity with rtbf_demo.c could keep seperate or add a few more
+// if statements to rtbf_demo.c
+int main(int argc, char* argv[]) {
+  int opt;
+  int verbose = 0;
+  int mode = 4;
+  int blocks = 1;
+  int dumpOn = 1; // dump on each integration
+  RTBFContext context;
+
+  while ((opt = getopt(argc, argv, "m:b:c:w:vh")) != -1) {
+    switch (opt) {
+      case 'm':
+        // Set operational mode
+        mode = strtoul(optarg, NULL, 0);
+        if(mode < 0) {
+          fprintf(stderr, "mode must be zero, or greater\n");
+          return 1;
+        }
+        break;
+      case 'b':
+        // Set number of blocks to use
+        blocks = strtoul(optarg, NULL, 0);
+        if (blocks < 1) {
+          fprintf(stderr, "blocks must be positive\n");
+          return 1;
+        }
+        break;
+      case 'c':
+        // Set integration count interval
+        dumpOn = strtoul(optarg, NULL, 0);
+        if (dumpOn < 1) {
+          fprintf(stderr, "integration count must be positive\n");
+          return 1;
+        }
+        break;
+      case 'v':
+        // be verbose in checking output
+        verbose = 1;
+        break;
+      default:
+        fprintf(stderr,
+            "Usage: %s [options]\n"
+            "Options:\n"
+            "  -m operational mode   correlator mode (default 4)\n"
+            "                        operational mode values are:\n"
+            "                             4 (integrated correlator dumps)\n"
+            "  -b number of blocks   the number of rtbf data blocks\n"
+            "                        to generate and calls made to\n"
+            "                        the rtbf\n"
+            "  -c integration count  the number data blocks to integrate\n"
+            "                        in one integration dump interval (1)\n"
+            "  -v                    be more verbose in the output\n"
+            "                        for RAW mode (0) this will plot\n"
+            "                        but in other modes is a dump of\n"
+            "                        the output data. Not recommended\n"
+            "                        to use except on small library\n"
+            "                        configurations\n"
+            "  -h                    Show this message\n",
+            argv[0]);
+        exit(EXIT_FAILURE);
+    }
+  }
+
+  RTBFInfo rtbf_info;
+  rtbfInfo(&rtbf_info);
+
+  int Ni = INPUT_PER_FID;
+  int Nf = NUM_FIDS;
+  int Nt = TIME_PER_PKT;
+  int Nm = MCNT_PER_BLK;
+
+  int Ne = rtbf_info.narray_elements; // elements in array
+  int Nc = rtbf_info.nbin;            // frequency channels (bins)
+  int NT = rtbf_info.ntime;           // time per block
+  long long unsigned int S = rtbf_info.array_len; // number of complex elements in a block size
+
+  // validate test xb-engine system parameters with library compiled parameters
+  if (Ni*Nf != Ne) {
+    printf("Element configuration error:\n"
+           "  Testing with %u elements in the array. RTBF is compiled to only support %u\n",
+            Ni*Nf, Ne);
+    return -1;
+  }
+  if (Nm*Nt != NT) {
+    printf("Time configuration error:\n"
+           "  Testing with %u time samples per GPU copy. RTBF is compiled to only support %u\n",
+           Nm*Nt, NT);
+    return -1;
+  }
+  if (Ne*Nc*NT != S) {
+    printf("Input data block size configuration error:\n"
+           "  Testing with a total of %u complex entries in a GPU data block. RTBF is\n"
+           "  compiled to only support %llu\n",
+           Ne*Nc*NT, S);
+    return -1;
+  }
+
+  complex16_t *d_cx_ptr = (complex16_t*) malloc(blocks*rtbf_info.array_len*sizeof(complex16_t));
+
+  void *rtbf_out_ptr = NULL;
+  size_t output_len;
+  switch (mode) {
+    case BEAM_OP_COR:
+      rtbf_out_ptr = (float*) malloc(blocks * (Ne*Ne*Nc) * 2*sizeof(float));
+      output_len = blocks * (Ne*Ne*Nc) * 2*sizeof(float);
+      break;
+    default:
+      printf("error initializing correlator extension of rtbf, unrecognized operational mode\n");
+      return -1;
+  }
+
+  // Initialize beamformer
+  context.operational_mode = mode;
+
+  context.ibuf_array_h = d_cx_ptr;
+  context.ibuf_array_len = blocks*rtbf_info.array_len*sizeof(complex16_t);
+  context.obuf_beamformed_h = rtbf_out_ptr;
+  context.obuf_beamformed_len = output_len;
+
+  context.mcnt_per_data_block = Nm;
+  context.nfengines      = Nf;
+  context.time_per_mcnt  = Nt;
+  context.freq_per_xid   = Nc;
+  context.input_per_fid  = Ni;
+
+  int rtbf_status = RTBF_OK;
+  rtbf_status = init_correlator(&context);
+  if (rtbf_status != RTBF_OK) {
+    printf("rtbf failure with code: %d\n", rtbf_status);
+    free(d_cx_ptr);
+    free(rtbf_out_ptr);
+    rtbfCleanup(&context);
+    return 0;
+  }
+
+  // Run beamformer (really the correlator)
+  struct timespec start, stop;
+  double total, per_call, max_bw, gbps;
+  complex16_t *update_d_cx_ptr = NULL;
+  void *update_b_out_ptr = NULL;
+  int doDump = 0;
+
+  rand_val(1); // init seed on RNG
+
+  for (int b=1; b<=blocks; b++) {
+    // prepare block offsets
+    context.input_offset = (b-1)*(rtbf_info.array_len);
+    context.output_offset = (b-1)*(Ne*Ne*Nc);
+    update_d_cx_ptr = d_cx_ptr + context.input_offset;
+    update_b_out_ptr = ((float*) rtbf_out_ptr) + 2*context.output_offset;
+    // fill data block
+    rtbfFillNetDataBlockBoxMullerRand(update_d_cx_ptr, &context);
+    // determine if this is a dump period
+    doDump = ((b % dumpOn) == 0 || (b==blocks)) ? 1 : 0;
+    // correlate!
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    run_correlator(&context, doDump);
+    clock_gettime(CLOCK_MONOTONIC, &stop);
+    total += ELAPSED_MS(start, stop);
+    printf("run_correlator() time = %f ms\n", (float) ELAPSED_MS(start,stop));
+    // check
+    if (doDump && verbose && mode==BEAM_OP_COR) {
+      cpgopen("/xwin"); // docs say cpgopen should be used over beg now
+      cpgask(1);
+
+      // used to select the beam/channel
+      int chan_index = 0;
+
+      // for getting, setting, and using cursor info
+      float cx, cy;
+      char pressed;
+
+      // custom color table (see PGCTAB doc for how to define the table)
+      float L[7] = {0.00, 0.18, 0.36, 0.55, 0.72, 0.87, 1.00};
+      float R[7] = {0.10, 0.34, 0.59, 0.89, 1.00, 1.00, 1.00};
+      float G[7] = {0.00, 0.00, 0.20, 0.20, 0.43, 0.72, 1.00};
+      float B[7] = {0.23, 0.60, 0.47, 0.28, 0.00, 0.00, 0.76};
+      // doc said contra=1.0 bright=0.5 are nominal, but bright=0.3 looked best in this case
+      float contra = 1.0;
+      float bright = 0.3;
+      cpgctab(L, R, G, B, 7, contra, bright);
+
+      // setup env and labels before plotting
+      cpgenv(0.5, Ne+0.5, Ne+0.5, 0.5, 0, 1);
+      float tr[6]= {0.00, 1.00, 0.00, 0.00, 0.00, 1.00};
+
+      // TODO reproduce label for correlator case
+      //cpglab("theta", "dB arb. units", "beam patterns");
+
+      // array data
+      #define zabs(z) (sqrt(z.x*z.x+z.y*z.y))
+      float *bin_matrix_power = (float *) malloc((Ne*Ne) *sizeof(float));
+      float *pgmat = (float *) malloc((Ne*Ne) *sizeof(float));
+
+      // the approach here is to loop accepting input, and redraw the line instead
+      // of starting a new window (e.g., call to cpgenv)
+      printf("press 0-8, or x to end\n");
+      do {
+        // ask for cursor position (could have used cpgcurs, but this method
+        // provides the "mode" to put a vertical line through the cursor)
+        cpgcurs(&cx, &cy, &pressed);
+        //printf("cx=%f, cy=%f, pressed=%c\n", cx, cy, pressed);
+
+        if (pressed >= '0' && pressed <= '8') {
+          chan_index = atoi(&pressed);
+        }
+
+        // take pressed input
+        // copy the selected frequency bin into `bin_matrix_power`
+        for (int i=0; i<Ne; i++) {
+          for (int j=0; j<Ne; j++) {
+            int idx = j+i*Ne + chan_index*(Ne*Ne);
+            int oidx = j+i*Ne; // bin_matrix is only one channel long
+            float *dout = (float *) update_b_out_ptr; // cast from void* to float*
+            complex float y = CMPLXF(dout[2*idx], dout[2*idx+1]);
+            bin_matrix_power[oidx] = 10*log10((creal(conj(y)*y)));
+          }
+        }
+
+        // get max value for scaling image color
+        float max = maxf(bin_matrix_power, Ne*Ne);
+        max = (max==0)? 1 : max;
+        printf("max: %f\n", max);
+
+        // TODO does this approach work for cpimag???
+        // draw by redrawing the line, this was similar to what was done to animate
+        // from pgdemo11.f and my example analyzer codes
+        //
+        // TODO: pixels on edge are erased, need to do something like pgdemo11.f (and
+        // other demos) that slightly pad the edge so that the axis and labels are
+        // not erased
+        cpgbbuf();
+        cpgsci(0);
+        cpgimag(pgmat, Ne, Ne, 1, Ne, 1, Ne, 0.0, max, tr);
+        cpgsci(1);
+
+        for (int p=0; p<Ne*Ne; p++) {
+          pgmat[p] = bin_matrix_power[p];
+        }
+
+        cpgsci(1);
+        cpgimag(pgmat, Ne, Ne, 1, Ne, 1, Ne, 0.0, max, tr);
+        cpgebuf();
+      } while (pressed!='x');
+      free(pgmat);
+      free(bin_matrix_power);
+      cpgend();
+    }
+    // clear integration buffer on dump
+    if (doDump) {
+      doDump = 0;
+      rtbfClearCorrelatorIntegrationBuffer(&context);
+    }
+  }
+
+  printf("data_block_size=%lld, Rx_size=%d\n", rtbf_info.array_len, Ne*Ne*Nc);
+  double count = blocks;
+  per_call = total/count;
+  max_bw = rtbf_info.ntime*rtbf_info.nbin/per_call/1000; // MHz
+  // using the full ibuf length, the number of blocks needs to be divided out as
+  // this is what is transfered to the gpu in a single call to the rtbf
+  //gbps = ((float)(8 * (context.ibuf_array_len/count) * count)) / total / 1e6; // Gbps
+  gbps = ((float)(8 * (rtbf_info.array_len * sizeof(complex16_t) + Ne*Ne*Nc*2*sizeof(float))* count)) / total / 1e6; // Gbps
+  printf("Elapsed time %.6f ms total, %.6f ms/call average\n",
+      total, per_call);
+  printf("Theoretical BW_max %.3f MHz, throughput %.3f Gbps\n",
+      max_bw, gbps);
+
+  // cleanup and exit
+  free(d_cx_ptr);
+  free(rtbf_out_ptr);
+  rtbfCleanup(&context);
+
+  return 0;
+}
--- a/src/cublas_beamformer.cu
+++ b/src/cublas_beamformer.cu
@@ -275,6 +275,235 @@ int update_weights(RTBFContext* context, float* weights) {
  return rtbf_status;
 }

+// initialize the correlator
+int init_correlator(RTBFContext* context) {
+  switch(context->operational_mode) {
+    case BEAM_OP_COR:
+      break;
+    default:
+      printf("error initializing correlator component of rtbf, unrecognized operational mode\n");
+      return RTBF_INVALID_INIT;
+  }
+
+  RTBFInternalContext *internal = (RTBFInternalContext *) malloc(sizeof(RTBFInternalContext));
+  if (internal == NULL) {
+    return RTBF_OUTOFMEM_ERROR;
+  }
+  context->internal = internal;
+
+  long long unsigned int array_len      = compiletime_info.array_len;
+  long long unsigned int weight_len     = compiletime_info.weight_len;
+  long long unsigned int beamformed_len = compiletime_info.beamformed_len;
+  long long unsigned int beam_power_len = compiletime_info.beam_power_len;
+  long long unsigned int sti_stokes_len = compiletime_info.sti_stokes_len;
+
+  long long unsigned int Rx_matrix_len  = compiletime_info.narray_elements *
+                                          compiletime_info.narray_elements *
+                                          compiletime_info.nbin;
+
+  // register input and output buffers
+  // TODO if extension code works make common function for reuse
+  // TODO make sure no "beamformer" spefic code (i.e., make sure OK to apply to an Rx
+  //
+  // Note that in the context of the correlator that `obuf_len` would more
+  // properly be considered `matrix_len`
+  if (context->ibuf_array_h && context->obuf_beamformed_h) {
+    printf("registering user allocated memory with CUDA\n");
+    // Register caller-allocated host memory with CUDA.
+    // Round address down to nearest page_size boundary
+    uintptr_t ptr_ibuf = (uintptr_t)context->ibuf_array_h;
+    uintptr_t ptr_aligned_ibuf = ptr_ibuf - (ptr_ibuf % page_size);
+
+    uintptr_t ptr_obuf = (uintptr_t)context->obuf_beamformed_h;
+    uintptr_t ptr_aligned_obuf = ptr_obuf - (ptr_obuf % page_size);
+
+    // Compute length starting with compile time requirement
+    size_t ibuf_len = context->ibuf_array_len;
+    size_t obuf_len = context->obuf_beamformed_len;
+
+    // TODO Verify that lengths are at least
+    // TODO would have different lengths to verify in context of a Rx
+    // "compiletime_info.array_len*sizeof(complex16_t)"
+    // and as long as the selected operational mode, for raw beam op this would
+    // be
+    // "compiletime_info.beamformed_len *2*sizeof(float)"
+    // and for sti mode this is
+    // "compiletime_info.sti_stokes_len *2*sizeof(float"
+    // note, that the 2 is still used in stokes output len so that right now the
+    // user can pass in the right stokes output len and still work.
+
+    // Add in any rounding that was done to the input pointer
+    ibuf_len += (ptr_ibuf - ptr_aligned_ibuf);
+    // Round length up to next multiple of page size
+    ibuf_len = (ibuf_len+page_size-1) / page_size * page_size;
+
+    obuf_len += (ptr_obuf - ptr_aligned_obuf);
+    // Round length up to next multiple of page size
+    obuf_len = (obuf_len+page_size-1) / page_size * page_size;
+
+    printf("ibuf aligned context->ibuf_array_h = 0x%lx\n", ptr_aligned_ibuf);
+    printf("length = %lx\n", ibuf_len);
+
+    printf("obuf aligned context->obuf_beamformed_h = 0x%lx\n", ptr_aligned_obuf);
+    printf("length = %lx\n", obuf_len);
+
+    if ( ptr_aligned_ibuf <= (ptr_aligned_obuf+obuf_len-1) &&
+         ptr_aligned_obuf <= (ptr_aligned_ibuf+ibuf_len-1)) {
+      // clamping the individual memory regions results in an overlap
+      // and the regions are two close to separate. Clamp instead the whole
+      // region
+      uintptr_t ptr_aligned_min;
+      size_t length;
+      printf("there is a memory collision!!\n");
+      if (ptr_aligned_ibuf < ptr_aligned_obuf) {
+        ptr_aligned_min = ptr_aligned_ibuf;
+        length = (ptr_aligned_obuf+obuf_len) - ptr_aligned_min;
+      } else {
+        ptr_aligned_min = ptr_aligned_obuf;
+        length = (ptr_aligned_ibuf+ibuf_len) - ptr_aligned_min;
+      }
+      printf("aligned buf min = %lx\n", ptr_aligned_min);
+      printf("length = %lx\n", length);
+      cudaHostRegister((void *) ptr_aligned_min, length, 0);
+      internal->unregister_ibuf_array_h = (complex16_t*) ptr_aligned_min;
+      internal->unregister_obuf_beamformed_h = NULL;
+      checkCudaError();
+    } else {
+      printf("there is *not* a memory collision\n");
+      // there is no overlap of the memory regions and we pin them individually
+      cudaHostRegister((void *) ptr_aligned_ibuf, ibuf_len, 0);
+      cudaHostRegister((void *) ptr_aligned_obuf, obuf_len, 0);
+      internal->unregister_ibuf_array_h = (complex16_t*) ptr_aligned_ibuf;
+      internal->unregister_obuf_beamformed_h = (float *) ptr_aligned_obuf;
+      checkCudaError();
+    }
+
+  } else {
+    // for now, user must initialize rtbf with user allocated input and output array.
+    return RTBF_INVALID_INIT;
+  }
+
+  // initialize offset values to zero TODO redundant (common for correlator and beamformer)
+  context->input_offset = 0;
+  context->output_offset = 0;
+
+  // create cublas handle TODO redundant (common) for correlator and beamformer
+  cublasStatus_t blas_stat = CUBLAS_STATUS_SUCCESS;
+  blas_stat = cublasCreate(&(internal->handle));
+  checkCublasError(blas_stat);
+
+  // setup internal context blas dimensions
+  // TODO check these dimensions then explain how they work out because they do
+  // not exactly make sense with ntime not showing up in B anywhere since we are
+  // doing an auto correlation and B should be =A. I think what it shold be is
+  // that since that cublasCgemm takes in op(B) dimensions for these arguments
+  // they need to be such that rows_b is ntime and cols_b is nelements (i.e.,
+  // conjugate transposed). But what is confusing is that then setting the
+  // leading dimension of b since nrows_B
+  internal->nr_rows_A  = compiletime_info.narray_elements;
+  internal->nr_cols_A  = compiletime_info.ntime;
+  internal->nr_rows_B  = compiletime_info.ntime;//internal->nr_rows_A;
+  internal->nr_cols_B  = compiletime_info.narray_elements;//internal->nr_cols_A;
+  internal->nr_rows_C  = compiletime_info.narray_elements;
+  internal->nr_cols_C  = compiletime_info.narray_elements;
+  internal->batchCount = compiletime_info.nbin;
+
+  // allocate device memory for transpose, beamformer, and post-beamform operation
+  // TODO get rid of these, temp used to fill in "d_beamformed" and "obuf_D2H_len" lengths
+  cudaMalloc((void **) &(internal->d_restruct_in), array_len*sizeof(complex16_t));  // input data
+  cudaMalloc((void **) &(internal->d_array_data), array_len*sizeof(cuComplex));     // transpose
+  cudaMalloc((void **) &(internal->d_beamformed), Rx_matrix_len*sizeof(cuComplex)); // Rx output
+  checkCudaError();
+
+  // setup input and output information based on operational mode
+  internal->ibuf_H2D_len = array_len*sizeof(complex16_t);
+
+  // TODO place closer attention to differences here with Rx. But in reality it
+  // should function similar to BEAM_OP_RAW because outputs are complex and
+  // there is no additional processing
+  // The mode better only be BEAM_OP_COR 
+  if (context->operational_mode == BEAM_OP_COR) {
+    internal->obuf_D2H_len = Rx_matrix_len * sizeof(cuComplex);
+    internal->post_beamform_func = NULL; // skip post-processing kernel, dump beamformed outputs
+  }
+
+  cudaMalloc((void **) &(internal->d_post_beamform), internal->obuf_D2H_len);
+  checkCudaError();
+
+  if (context->operational_mode == BEAM_OP_COR) {
+    internal->d_output = (float *)internal->d_beamformed;
+  }
+
+  // local dimension variable references to use isntead of "internal->"
+  int nrows_A, ncols_A, nrows_B, ncols_B, nrows_C, ncols_C;
+  int batchCount;
+  nrows_A  = internal->nr_rows_A;
+  ncols_A  = internal->nr_cols_A;
+  nrows_B  = internal->nr_rows_B;
+  ncols_B  = internal->nr_cols_B;
+  nrows_C  = internal->nr_rows_C;
+  ncols_C  = internal->nr_cols_C;
+  batchCount = internal->batchCount;
+
+  // TODO cleanup also redundant (common) to beamformer from here until free of temp memory
+  // allocate 3 arrays on host, used to allocated device arrays needed for gemmBatched.
+  const cuComplex** h_arr_A = NULL; // array data
+  const cuComplex** h_arr_B = NULL;// for array data and possibly redundant since auto corr?
+  cuComplex** h_arr_C = NULL; // accumulation result
+
+  cudaMallocHost((void **) &h_arr_A, batchCount*sizeof(cuComplex*));
+  cudaMallocHost((void **) &h_arr_B, batchCount*sizeof(cuComplex*));
+  cudaMallocHost((void **) &h_arr_C, batchCount*sizeof(cuComplex*));
+  checkCudaError();
+
+  // TODO NOTE THE CHANGE HERE A is now data, weights not used and B is really
+  // redundant since the call in correlate() to cublasCgemmbatched() uses
+  // d_arr_A twice.
+  // TODO I wonder though if we do use B here and since nrows_b and ncols_B are
+  // set to op(B) if this casuses errors OR if we can not call cgemmbatched with
+  // OP_C or what goes on here
+  // allocate memory for each batch in an array.
+  for (int i=0; i<batchCount; i++) {
+    h_arr_A[i] = internal->d_array_data + i*nrows_A*ncols_A;
+    h_arr_B[i] = internal->d_array_data + i*nrows_B*ncols_B; // redundant, not used for correlator
+    h_arr_C[i] = internal->d_beamformed + i*nrows_C*ncols_C;
+  }
+
+  // allocate memory to arrays on device.
+  cudaMalloc((void **) &(internal->d_arr_A), batchCount*sizeof(cuComplex*));
+  cudaMalloc((void **) &(internal->d_arr_B), batchCount*sizeof(cuComplex*));
+  cudaMalloc((void **) &(internal->d_arr_C), batchCount*sizeof(cuComplex*));
+  checkCudaError();
+
+  // copy memory from host to device.
+  cudaMemcpy(internal->d_arr_A, h_arr_A, batchCount*sizeof(cuComplex*), cudaMemcpyHostToDevice);
+  cudaMemcpy(internal->d_arr_B, h_arr_B, batchCount*sizeof(cuComplex*), cudaMemcpyHostToDevice);
+  cudaMemcpy(internal->d_arr_C, h_arr_C, batchCount*sizeof(cuComplex*), cudaMemcpyHostToDevice);
+  checkCudaError();
+
+  // free the temporary memory
+  cudaFreeHost(h_arr_A);
+  cudaFreeHost(h_arr_B);
+  cudaFreeHost(h_arr_C);
+  checkCudaError();
+
+  // initialize beamformer metadata structure
+  // TODO how should metadata be handled here? Should we change?
+  rtbf_metadata *md = &(context->metadata);
+  md->xid = 0;
+  md->offsets = (float *) malloc(compiletime_info.nbeam*sizeof(float));
+  md->cal_filename = (char *) malloc(CHARLEN*sizeof(char));
+  md->algorithm = (char *) malloc(CHARLEN*sizeof(char));
+  md->weight_filename = (char *) malloc(CHARLEN*sizeof(char));
+  if (!(md->offsets)   || !(md->cal_filename) ||
+      !(md->algorithm) || !(md->weight_filename))
+  {
+    return RTBF_OUTOFMEM_ERROR;
+  }
+
+  return RTBF_OK;
+}
+
 // initialize the rtbf
 // TODO return if already initialized
 // TODO handle clean up if return early due to an error
@@ -494,6 +723,33 @@ int init_beamformer(RTBFContext* context) {
  return RTBF_OK;
 }

+const static cuComplex X_alf = make_cuComplex(1, 0);
+const static cuComplex X_bet = make_cuComplex(1, 0);
+
+int correlate(RTBFInternalContext* internal) {
+
+  cublasStatus_t blas_stat = CUBLAS_STATUS_SUCCESS;
+  cublasCgemmBatched(
+      internal->handle,
+      CUBLAS_OP_N,
+      CUBLAS_OP_C,
+      internal->nr_rows_A, //Ne//nrows_A// Number of rows in matrix A and C. TODO THIS IS op(A)
+      internal->nr_cols_B, //Ne//ncols_B// Number of columns in matrix B and C. TODO THIS IS op(B)
+      internal->nr_cols_A, //Nt//ncols_A// Number of columns and rows in matrix A and B respectively.TODO this is op(A) and op(B)
+      &X_alf,
+      internal->d_arr_A,
+      internal->nr_rows_A, //Ne//nrows_A// Leading dimension of each batch or matrix in array A (lda).TODO better explain
+      internal->d_arr_A,
+      internal->nr_cols_B, //Ne//nrows_B// Leading dimension of each batch or matrix in array B (ldb).TODO better explain
+      &X_bet,
+      internal->d_arr_C,
+      internal->nr_rows_C, //Ne//nrows_C// Leading dimension of each batch or matrix in array C (ldc).TODO better explain
+      internal->batchCount);
+
+  checkCublasError(blas_stat);
+  return RTBF_OK;
+}
+
 const static cuComplex alf = make_cuComplex(1, 0);
 const static cuComplex bet = make_cuComplex(0, 0);

@@ -668,6 +924,85 @@ void beamformed_spectra(cuComplex* data_in, float* data_out) {
  return;
 }

+// run_correlator was very common reusing most of this from `run_beamformer()`.
+// `run_beamformer()` could really be extended to include BEAM_OP_COR as it is
+// so similar to BEAM_OP_RAW in setup. However, the most significant difference
+// is needing to accomodate `doDump` and to avoid an unnecessary performance hit
+// to check for a dump in any beamformer mode it is better to have a standalone
+// implementation function. The only way I think this to really have a generic
+// `run_*()` function is that the library needs to support arbitrary function
+// chaning through a linked list of function/kernel pointers. The pipeline is
+// then configured as this list.
+int run_correlator(RTBFContext* context, int doDump) {
+
+  int rtbf_status = RTBF_OK;
+  RTBFInternalContext* internal = (RTBFInternalContext *)context->internal;
+  if (internal == NULL) {
+    rtbf_status = RTBF_NOT_INITIALIZED;
+    return rtbf_status;
+  }
+
+  // still need to check to make sure this is not called if in a different OP mode
+  switch (context->operational_mode) {
+    case BEAM_OP_COR:
+      break;
+    default:
+      rtbf_status = RTBF_FATAL;
+      return rtbf_status;
+  }
+
+  int Nm = context->mcnt_per_data_block;
+  int Nf = context->nfengines;
+  int Nt = context->time_per_mcnt;
+  int Nc = context->freq_per_xid;
+  int Ni = context->input_per_fid;
+
+  dim3 gridDim_transpose(Nm, Nf, Nt);
+  dim3 blockDim_transpose(Ni, Nc, 1);
+
+  // reference to host input and output buffers and adjust for current processing offset
+  complex16_t *ibuf_hp = context->ibuf_array_h;
+  ibuf_hp += context->input_offset;
+
+  void* obuf_hp = context->obuf_beamformed_h;
+  obuf_hp = ((cuComplex*) obuf_hp) + context->output_offset;
+
+  // copy memory onto device
+  cudaMemcpy(internal->d_restruct_in, ibuf_hp, internal->ibuf_H2D_len, cudaMemcpyHostToDevice);
+  checkCudaError();
+
+  // restructure data (transpose/corner turn) where bin is the slowest moving dimension for cublasCgemmBatched
+  data_restructure<<<gridDim_transpose, blockDim_transpose>>>(internal->d_restruct_in, internal->d_array_data);
+  checkCudaError();
+
+  // correlate
+  rtbf_status = correlate(internal);
+  checkCudaError();
+
+  // post-beamform kernel
+  /* no kernel for correlator, performance penalty to always make a check */
+
+  // copy output data from device to host
+  if (doDump) {
+    cudaMemcpy(obuf_hp, internal->d_output, internal->obuf_D2H_len, cudaMemcpyDeviceToHost);
+    checkCudaError();
+  }
+
+  return rtbf_status;
+}
+
+int rtbfClearCorrelatorIntegrationBuffer(RTBFContext *context) {
+  int rtbf_status = RTBF_OK;
+  RTBFInternalContext* internal = (RTBFInternalContext *)context->internal;
+  if (internal == NULL) {
+    rtbf_status = RTBF_NOT_INITIALIZED;
+    return rtbf_status;
+  }
+  cudaMemset(internal->d_output, '\0', internal->obuf_D2H_len);
+  checkCudaError();
+  return rtbf_status;
+}
+
 int run_beamformer(RTBFContext* context) {

  int rtbf_status = RTBF_OK;
@@ -793,6 +1128,7 @@ void rtbfCleanup(RTBFContext* context) {
    }

    // free metadata resources
+    // TODO this assumes the md is not null, would cause segfault
    rtbf_metadata* md = &(context->metadata);
    if (md->offsets != NULL) {
      free(md->offsets);

--- a/src/cublas_beamformer.h
+++ b/src/cublas_beamformer.h
@@ -143,7 +143,9 @@ typedef struct rtbf_metadata_struct {
 #define BEAM_OP_RAW 0  // raw beamformer iq outputs
 #define BEAM_OP_STI 1  // full-stokes spectrometer outputs w/ short time integration (STI)
 #define BEAM_OP_PSD 2  // beamformed power spectrometer
-// future mode
+// experimental extension
+#define BEAM_OP_COR 4  // correlator
+// future mode (not implemented)
 #define BEAM_OP_RFI 3  // adaptive real-time RFI cancellation

 typedef struct RTBFContextStruct {
@@ -254,6 +256,11 @@ int load_weights_from_file(RTBFContext *context, char *weight_filename);
 //    integration.
 int run_beamformer(RTBFContext *context);

+// TODO experimental extension to have correlator included in RTBF
+int init_correlator(RTBFContext *context);
+int run_correlator(RTBFContext *context, int doDump);
+int rtbfClearCorrelatorIntegrationBuffer(RTBFContext *context);
+
 // Free up resources used by the rtbf and reset the RTBFContext.
 //
 // This is to be called prior to makeing a change to the operational mode and