Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Open sidebar
ras-devel
FLAG
Commits
518bb408
Commit
518bb408
authored
Aug 01, 2017
by
Mitch Burnett
Committed by
GitHub
Aug 01, 2017
Browse files
Merge pull request #10 from mitchburnett/master
Mid July/Aug Commissioning Improvements
parents
fc266e55
93d4b889
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
842 additions
and
570 deletions
+842
-570
lib/beamformer/src/cublas_beamformer.cu
lib/beamformer/src/cublas_beamformer.cu
+159
-121
lib/pfb/src/pfb.cu
lib/pfb/src/pfb.cu
+7
-1
lib/pfb/src/pfb.h
lib/pfb/src/pfb.h
+2
-1
src/Makefile.am
src/Makefile.am
+8
-2
src/fifo.c
src/fifo.c
+2
-2
src/flag_beamform_thread.c
src/flag_beamform_thread.c
+77
-77
src/flag_correlator_thread.c
src/flag_correlator_thread.c
+32
-8
src/flag_databuf.c
src/flag_databuf.c
+66
-35
src/flag_databuf.h
src/flag_databuf.h
+16
-2
src/flag_net_thread.c
src/flag_net_thread.c
+142
-92
src/flag_pfb_correlator_thread.c
src/flag_pfb_correlator_thread.c
+173
-152
src/flag_pfb_thread.c
src/flag_pfb_thread.c
+87
-49
src/flag_pfb_transpose_thread.c
src/flag_pfb_transpose_thread.c
+32
-11
src/flag_transpose_thread.c
src/flag_transpose_thread.c
+39
-17
No files found.
lib/beamformer/src/cublas_beamformer.cu
View file @
518bb408
...
...
@@ -15,82 +15,18 @@
using
namespace
std
;
// CUDA-specific function prototypes
void
print_matrix
(
const
cuComplex
*
A
,
int
nr_rows_A
,
int
nr_cols_A
,
int
nr_sheets_A
);
void
print_matrix2
(
const
float
*
A
,
int
nr_rows_A
,
int
nr_cols_A
);
void
GPU_fill
(
cuComplex
*
A
,
int
nr_rows_A
,
int
nr_cols_A
);
void
beamform
();
void
GPU_fill2
(
cuComplex
*
A
,
int
nr_rows_A
,
int
nr_cols_A
);
__global__
void
transpose
(
signed
char
*
data
,
signed
char
*
tra_data
);
__global__
void
data_restructure
(
signed
char
*
data
,
cuComplex
*
data_restruc
);
void
beamform
();
__global__
void
sti_reduction
(
cuComplex
*
data_in
,
float
*
data_out
);
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on GPU
void
GPU_fill
(
cuComplex
*
A
,
int
nr_rows_A
,
int
nr_cols_A
)
{
cuComplex
*
G
;
G
=
new
cuComplex
[
nr_rows_A
*
nr_cols_A
];
for
(
int
i
=
0
;
i
<
nr_rows_A
*
nr_cols_A
;
++
i
){
G
[
i
].
x
=
(
i
+
1
)
%
(
nr_rows_A
*
nr_cols_A
/
(
BN_BIN
));
G
[
i
].
y
=
(
i
+
1
)
%
(
nr_rows_A
*
nr_cols_A
/
(
BN_BIN
));
}
cudaMemcpy
(
A
,
G
,
nr_rows_A
*
nr_cols_A
*
sizeof
(
cuComplex
),
cudaMemcpyHostToDevice
);
delete
[]
G
;
}
void
GPU_fill2
(
cuComplex
*
A
,
int
nr_rows_A
,
int
nr_cols_A
)
{
cuComplex
*
G
;
G
=
new
cuComplex
[
nr_rows_A
*
nr_cols_A
];
for
(
int
i
=
0
;
i
<
nr_rows_A
*
nr_cols_A
;
++
i
){
G
[
i
].
x
=
i
%
(
nr_rows_A
*
nr_cols_A
/
(
BN_BIN
));
G
[
i
].
y
=
i
%
(
nr_rows_A
*
nr_cols_A
/
(
BN_BIN
));
}
cudaMemcpy
(
A
,
G
,
nr_rows_A
*
nr_cols_A
*
sizeof
(
cuComplex
),
cudaMemcpyHostToDevice
);
delete
[]
G
;
}
void
print_matrix
(
const
cuComplex
*
A
,
int
nr_rows_A
,
int
nr_cols_A
,
int
nr_sheets_A
)
{
for
(
int
i
=
0
;
i
<
nr_rows_A
;
++
i
){
for
(
int
j
=
0
;
j
<
nr_cols_A
;
++
j
){
for
(
int
k
=
0
;
k
<
nr_sheets_A
;
++
k
){
// cout << A[j * nr_rows_A + i].x << "+" << A[j * nr_rows_A + i].y << "i" <<" ";
printf
(
"%i,%i,%i: %e + %e i
\n
"
,
i
,
j
,
k
,
A
[
k
*
nr_rows_A
*
nr_cols_A
+
j
*
nr_rows_A
+
i
].
x
,
A
[
k
*
nr_rows_A
*
nr_cols_A
+
j
*
nr_rows_A
+
i
].
y
);
}
}
// cout << endl;
}
// cout << endl;
// for(int i = 0; i < nr_rows_A*nr_cols_A; ++i){
// printf("%i,: %e + %e i\n",i,A[i].x, A[i].y);
// }
}
void
print_matrix2
(
const
float
*
A
,
int
nr_rows_A
,
int
nr_cols_A
)
{
// for(int j = 0; j < nr_cols_A; ++j){
// for(int i = 0; i < nr_rows_A; ++i){
// //cout << A[j * nr_rows_A + i].x << "+" << A[j * nr_rows_A + i].y << "i" <<" ";
// printf("%i,%i: %e\n",i,j,A[j * nr_rows_A + i]);
// }
// cout << endl;
// }
// cout << endl;
for
(
int
i
=
0
;
i
<
nr_rows_A
*
nr_cols_A
;
++
i
){
printf
(
"%i,: %e
\n
"
,
i
,
A
[
i
]);
}
}
// Struct defintion for beamformer metadata
typedef
struct
bf_metadata_struct
{
float
offsets
[
14
];
...
...
@@ -99,11 +35,16 @@ typedef struct bf_metadata_struct {
char
weight_filename
[
65
];
long
long
unsigned
int
xid
;
}
bf_metadata
;
static
bf_metadata
my_metadata
;
static
cuComplex
*
d_weights
=
NULL
;
void
update_weights
(
char
*
filename
){
printf
(
"In update_weights()...
\n
"
);
printf
(
"RTBF: In update_weights()...
\n
"
);
char
weight_filename
[
128
];
strcpy
(
weight_filename
,
filename
);
FILE
*
weights
;
...
...
@@ -115,19 +56,24 @@ void update_weights(char * filename){
bf_weights
=
(
float
*
)
malloc
(
2
*
BN_WEIGHTS
*
sizeof
(
float
));
weights_dc
=
(
float
complex
*
)
malloc
(
BN_WEIGHTS
*
sizeof
(
float
complex
*
));
weights_dc_n
=
(
float
complex
*
)
malloc
(
BN_WEIGHTS
*
sizeof
(
float
complex
*
));
// open weight file
weights
=
fopen
(
weight_filename
,
"r"
);
int
j
;
if
(
weights
!=
NULL
)
{
fread
(
bf_weights
,
sizeof
(
float
),
2
*
BN_WEIGHTS
,
weights
);
fread
(
my_metadata
.
offsets
,
sizeof
(
float
),
14
,
weights
);
fread
(
my_metadata
.
cal_filename
,
sizeof
(
char
),
64
,
weights
);
my_metadata
.
cal_filename
[
64
]
=
'\0'
;
fread
(
my_metadata
.
algorithm
,
sizeof
(
char
),
64
,
weights
);
my_metadata
.
algorithm
[
64
]
=
'\0'
;
fread
(
&
(
my_metadata
.
xid
),
sizeof
(
long
long
unsigned
int
),
1
,
weights
);
my_metadata
.
cal_filename
[
64
]
=
'\0'
;
my_metadata
.
algorithm
[
64
]
=
'\0'
;
// Extract all path information from weight_filename for metadata
char
*
short_filename
=
strrchr
(
weight_filename
,
'/'
);
if
(
short_filename
!=
NULL
)
{
...
...
@@ -137,8 +83,6 @@ void update_weights(char * filename){
strcpy
(
my_metadata
.
weight_filename
,
weight_filename
);
}
// Convert to complex numbers (do a conjugate at the same time)
for
(
j
=
0
;
j
<
BN_WEIGHTS
;
j
++
){
weights_dc_n
[
j
]
=
bf_weights
[
2
*
j
]
-
bf_weights
[(
2
*
j
)
+
1
]
*
I
;
...
...
@@ -159,14 +103,16 @@ void update_weights(char * filename){
}
fclose
(
weights
);
}
free
(
bf_weights
);
// Copy weights to device
cudaMemcpy
(
d_weights
,
weights_dc
,
BN_WEIGHTS
*
sizeof
(
cuComplex
),
cudaMemcpyHostToDevice
);
//r_weights instead of weights_dc //*BN_TIME
// free memory
free
(
weights_dc
);
free
(
weights_dc_n
);
free
(
bf_weights
);
return
;
}
void
bf_get_offsets
(
float
*
offsets
){
...
...
@@ -202,20 +148,25 @@ long long unsigned int bf_get_xid(){
return
my_metadata
.
xid
;
}
static
cuComplex
**
d_arr_A
=
NULL
;
static
cuComplex
**
d_arr_B
=
NULL
;
static
cuComplex
**
d_arr_C
=
NULL
;
static
cuComplex
*
d_beamformed
=
NULL
;
static
cuComplex
*
d_data
=
NULL
;
static
signed
char
*
d_data1
=
NULL
;
// Device memory for input data
static
signed
char
*
d_data2
=
NULL
;
static
float
*
d_outputs
;
static
cublasHandle_t
handle
;
static
cuComplex
**
d_arr_A
=
NULL
;
static
cuComplex
**
d_arr_B
=
NULL
;
static
cuComplex
**
d_arr_C
=
NULL
;
void
init_beamformer
(){
// Allocate memory for the weights, data, beamformer output, and sti output.
// Allocate memory for the weights, data, beamformer output, and sti output.
cudaMalloc
((
void
**
)
&
d_weights
,
BN_WEIGHTS
*
sizeof
(
cuComplex
));
//*BN_TIME
cudaMalloc
((
void
**
)
&
d_data1
,
2
*
BN_SAMP
*
sizeof
(
signed
char
));
//cudaMalloc((void **)&d_data2, 2*BN_SAMP*sizeof(signed char));
cudaMalloc
((
void
**
)
&
d_data
,
BN_SAMP
*
sizeof
(
cuComplex
));
cudaError_t
err_malloc
=
cudaMalloc
((
void
**
)
&
d_beamformed
,
BN_TBF
*
sizeof
(
cuComplex
));
...
...
@@ -229,9 +180,6 @@ void init_beamformer(){
* Create a handle for CUBLAS
**********************************************************/
cublasCreate
(
&
handle
);
// This is all memory allocated to arrays that are used by gemmBatched.
// Allocate 3 arrays on CPU
cudaError_t
cudaStat
;
int
nr_rows_A
,
nr_cols_A
,
nr_rows_B
,
nr_cols_B
,
nr_rows_C
,
nr_cols_C
;
...
...
@@ -243,8 +191,11 @@ void init_beamformer(){
nr_rows_C
=
BN_BEAM
;
nr_cols_C
=
BN_TIME
;
// Allocate memory to host arrays.
const
cuComplex
**
h_arr_A
=
0
;
const
cuComplex
**
h_arr_B
=
0
;
cuComplex
**
h_arr_C
=
0
;
// Allocate memory to host arrays - This is all memory allocated to arrays that are used by gemmBatched. Allocate 3 arrays on CPU
const
cuComplex
**
h_arr_A
=
0
;
const
cuComplex
**
h_arr_B
=
0
;
cuComplex
**
h_arr_C
=
0
;
h_arr_A
=
(
const
cuComplex
**
)
malloc
(
nr_rows_A
*
nr_cols_A
*
BN_BIN
*
sizeof
(
const
cuComplex
*
));
h_arr_B
=
(
const
cuComplex
**
)
malloc
(
nr_rows_B
*
nr_cols_B
*
BN_BIN
*
sizeof
(
const
cuComplex
*
));
h_arr_C
=
(
cuComplex
**
)
malloc
(
nr_rows_C
*
nr_cols_C
*
BN_BIN
*
sizeof
(
cuComplex
*
));
...
...
@@ -256,9 +207,6 @@ void init_beamformer(){
h_arr_C
[
i
]
=
d_beamformed
+
i
*
nr_rows_C
*
nr_cols_C
;
}
// delete[] d_A;
// delete[] d_B;
// Allocate memory to arrays on device.
cudaStat
=
cudaMalloc
((
void
**
)
&
d_arr_A
,
nr_rows_A
*
nr_cols_A
*
BN_BIN
*
sizeof
(
cuComplex
*
));
assert
(
!
cudaStat
);
...
...
@@ -275,19 +223,12 @@ void init_beamformer(){
cudaStat
=
cudaMemcpy
(
d_arr_C
,
h_arr_C
,
nr_rows_C
*
nr_cols_C
*
BN_BIN
*
sizeof
(
cuComplex
*
),
cudaMemcpyHostToDevice
);
assert
(
!
cudaStat
);
free
(
h_arr_A
);
free
(
h_arr_B
);
free
(
h_arr_C
);
}
__global__
void
data_restructure
(
signed
char
*
data
,
cuComplex
*
data_restruc
){
int
e
=
threadIdx
.
x
;
int
t
=
blockIdx
.
x
;
int
f
=
blockIdx
.
y
;
return
;
//Restructure data so that the frequency bin is the slowest moving index
data_restruc
[
f
*
BN_TIME
*
BN_ELE_BLOC
+
t
*
BN_ELE_BLOC
+
e
].
x
=
data
[
2
*
(
t
*
BN_BIN
*
BN_ELE_BLOC
+
f
*
BN_ELE_BLOC
+
e
)]
*
1.0
f
;
data_restruc
[
f
*
BN_TIME
*
BN_ELE_BLOC
+
t
*
BN_ELE_BLOC
+
e
].
y
=
data
[
2
*
(
t
*
BN_BIN
*
BN_ELE_BLOC
+
f
*
BN_ELE_BLOC
+
e
)
+
1
]
*
1.0
f
;
}
signed
char
*
data_in
(
char
*
input_filename
){
...
...
@@ -347,7 +288,7 @@ void beamform() {
nr_rows_C
=
BN_BEAM
;
// Leading dimensions are always the rows of each matrix since the data is stored in a column-wise order.
int
lda
=
nr_rows_A
,
ldb
=
nr_rows_B
,
ldc
=
nr_rows_C
;
int
lda
=
nr_rows_A
,
ldb
=
nr_rows_B
,
ldc
=
nr_rows_C
;
cuComplex
alf
;
cuComplex
bet
;
...
...
@@ -380,6 +321,11 @@ void beamform() {
ldc
,
// Leading dimension of each batch or matrix in array C.
batchCount
);
// Number of batches in each array.
if
(
stat
==
CUBLAS_STATUS_INVALID_VALUE
)
{
printf
(
"RTBF: Invalid CUBLAS values
\n
"
);
}
else
if
(
stat
==
CUBLAS_STATUS_EXECUTION_FAILED
)
{
printf
(
"RTBF: Execution failed.
\n
"
);
}
if
(
stat
!=
CUBLAS_STATUS_SUCCESS
){
cerr
<<
"cublasCgemmBatched failed"
<<
endl
;
...
...
@@ -387,16 +333,80 @@ void beamform() {
}
assert
(
!
cudaGetLastError
());
//Free GPU memory
// cudaFree(d_A);
// cudaFree(d_B);
// cudaFree(d_C);
}
__global__
void
transpose
(
signed
char
*
data
,
signed
char
*
tra_data
)
{
int
i
=
threadIdx
.
x
;
int
c
=
threadIdx
.
y
;
int
m
=
blockIdx
.
x
;
int
f
=
blockIdx
.
y
;
int
t
=
blockIdx
.
z
;
//int Nm = gridDim.x; // number of mcnts (packets)
int
Nf
=
gridDim
.
y
;
// number of f-engines (ROACHES)
int
Nt
=
gridDim
.
z
;
// time samples per mcnt
int
Ni
=
blockDim
.
x
;
// inputs per f-engine (aka antenna elements per ROACH)
int
Nc
=
blockDim
.
y
;
// bins per mcnt
int
in_idx
=
i
+
Ni
*
c
+
Nc
*
Ni
*
t
+
Nt
*
Nc
*
Ni
*
f
+
Nf
*
Nt
*
Nc
*
Ni
*
m
;
int
out_idx
=
i
+
Ni
*
f
+
Nf
*
Ni
*
c
+
Nc
*
Nf
*
Ni
*
t
+
Nt
*
Nc
*
Nf
*
Ni
*
m
;
// Destroy the handle
//cublasDestroy(handle)
;
tra_data
[
2
*
out_idx
]
=
data
[
2
*
in_idx
];
tra_data
[
2
*
out_idx
+
1
]
=
data
[
2
*
in_idx
+
1
]
;
return
;
}
__global__
void
data_restructure
(
signed
char
*
data
,
cuComplex
*
data_restruc
){
/*
Repurpose the transpose thread in the hashpipe codes by performing the transpose in the GPU.
The motivation was, why transpose then transpose again? Why not just perform one transpose
in the GPU which would be faster anyway.
*/
int
i
=
threadIdx
.
x
;
int
c
=
threadIdx
.
y
;
int
m
=
blockIdx
.
x
;
int
f
=
blockIdx
.
y
;
int
t
=
blockIdx
.
z
;
int
Nm
=
gridDim
.
x
;
// number of mcnts (packets)
int
Nf
=
gridDim
.
y
;
// number of f-engines (ROACHES)
int
Nt
=
gridDim
.
z
;
// time samples per mcnt
int
Ni
=
blockDim
.
x
;
// inputs per f-engine (aka antenna elements per ROACH)
int
Nc
=
blockDim
.
y
;
// bins per mcnt
int
in_idx
=
i
+
Ni
*
c
+
Nc
*
Ni
*
t
+
Nt
*
Nc
*
Ni
*
f
+
Nf
*
Nt
*
Nc
*
Ni
*
m
;
int
out_idx
=
i
+
Ni
*
f
+
Nf
*
Ni
*
t
+
Nt
*
Nf
*
Ni
*
m
+
Nm
*
Nt
*
Nf
*
Ni
*
c
;
data_restruc
[
out_idx
].
x
=
data
[
2
*
in_idx
]
*
1.0
f
;
data_restruc
[
out_idx
].
y
=
data
[
2
*
in_idx
+
1
]
*
1.0
f
;
return
;
/*
// Original Code
int e = threadIdx.x;
int t = blockIdx.x;
int f = blockIdx.y;
//Restructure data so that the frequency bin is the slowest moving index
data_restruc[f*BN_TIME*BN_ELE_BLOC + t*BN_ELE_BLOC + e].x = data[2*(t*BN_BIN*BN_ELE_BLOC + f*BN_ELE_BLOC + e)]*1.0f;
data_restruc[f*BN_TIME*BN_ELE_BLOC + t*BN_ELE_BLOC + e].y = data[2*(t*BN_BIN*BN_ELE_BLOC + f*BN_ELE_BLOC + e) + 1]*1.0f;
return;
*/
}
__global__
void
sti_reduction
(
cuComplex
*
data_in
,
float
*
data_out
)
{
...
...
@@ -467,9 +477,13 @@ void sti_reduction(cuComplex * data_in, float * data_out) {
data_out
[
output_idx
(
2
,
b
,
s
,
f
)]
=
reduced_array1
[
0
].
x
*
scale
;
// XY* real.
data_out
[
output_idx
(
3
,
b
,
s
,
f
)]
=
reduced_array1
[
0
].
y
*
scale
;
// XY* imaginary.
}
return
;
}
void
run_beamformer
(
signed
char
*
data_in
,
float
*
data_out
)
{
cudaError_t
err_code
;
// Specify grid and block dimensions
dim3
dimBlock
(
BN_STI_BLOC
,
1
,
1
);
dim3
dimGrid
(
BN_BIN
,
BN_BEAM1
,
BN_STI
);
...
...
@@ -478,19 +492,43 @@ void run_beamformer(signed char * data_in, float * data_out) {
dim3
dimBlock_d
(
BN_ELE_BLOC
,
1
,
1
);
dim3
dimGrid_d
(
BN_TIME
,
BN_BIN
,
1
);
signed
char
*
d_restruct_in
=
d_data1
;
int
Nm
=
200
;
int
Nf
=
8
;
int
Nt
=
20
;
int
Nc
=
25
;
int
Ni
=
8
;
dim3
gridDim_transpose
(
Nm
,
Nf
,
Nt
);
dim3
blockDim_transpose
(
Ni
,
Nc
,
1
);
signed
char
*
d_tra_data_in
=
d_data1
;
//signed char* d_tra_data_out = d_data2;
//signed char * d_restruct_in = d_data1;
cuComplex
*
d_restruct_out
=
d_data
;
cudaMemcpy
(
d_restruct_in
,
data_in
,
2
*
BN_SAMP
*
sizeof
(
signed
char
),
cudaMemcpyHostToDevice
);
//cudaMemcpy(d_restruct_in, data_in, 2*BN_SAMP*sizeof(signed char), cudaMemcpyHostToDevice);
cudaMemcpy
(
d_tra_data_in
,
data_in
,
2
*
BN_SAMP
*
sizeof
(
signed
char
),
cudaMemcpyHostToDevice
);
err_code
=
cudaGetLastError
();
if
(
err_code
!=
cudaSuccess
)
{
printf
(
"RTBF: cudaMemcpy Failed: %s
\n
"
,
cudaGetErrorString
(
err_code
));
}
// Restructure data for cublasCgemmBatched function.
data_restructure
<<<
dimGrid_d
,
dimBlock_d
>>>
(
d_restruct_in
,
d_restruct_out
);
// Transpose the data
// transpose<<<gridDim_transpose, blockDim_transpose>>>(d_tra_data_in, d_tra_data_out);
// if (err_code != cudaSuccess) {
// printf("RTBF: CUDA Error (transpose): %s\n", cudaGetErrorString(err_code));
// }
// printf("Starting beamformer\n");
// Restructure data for cublasCgemmBatched function.
data_restructure
<<<
dimGrid_d
,
dimBlock_d
>>>
(
d_tra_data_in
,
d_restruct_out
);
//data_restructure<<<gridDim_transpose, blockDim_transpose>>>(d_restruct_in, d_restruct_out);
//data_restructure<<<dimGrid_d, dimBlock_d>>>(d_restruct_in, d_restruct_out);
if
(
err_code
!=
cudaSuccess
)
{
printf
(
"RTBF: CUDA Error (data_restructure): %s
\n
"
,
cudaGetErrorString
(
err_code
));
}
// Call beamformer function containing cublasCgemmBatched()
beamform
();
cudaError_t
err_code
=
cudaGetLastError
();
err_code
=
cudaGetLastError
();
if
(
err_code
!=
cudaSuccess
)
{
printf
(
"CUDA Error (beamform): %s
\n
"
,
cudaGetErrorString
(
err_code
));
}
...
...
@@ -498,13 +536,9 @@ void run_beamformer(signed char * data_in, float * data_out) {
cuComplex
*
d_sti_in
=
d_beamformed
;
float
*
d_sti_out
=
d_outputs
;
// printf("Starting sti_reduction\n");
// Call STI reduction kernel.
sti_reduction
<<<
dimGrid
,
dimBlock
>>>
(
d_sti_in
,
d_sti_out
);
// printf("Finishing sti_reduction\n");
err_code
=
cudaGetLastError
();
if
(
err_code
!=
cudaSuccess
)
{
printf
(
"CUDA Error (sti_reduction): %s
\n
"
,
cudaGetErrorString
(
err_code
));
...
...
@@ -513,8 +547,7 @@ void run_beamformer(signed char * data_in, float * data_out) {
// Copy output data from device to host.
cudaMemcpy
(
data_out
,
d_sti_out
,
BN_POL
*
(
BN_OUTPUTS
*
sizeof
(
float
)
/
2
),
cudaMemcpyDeviceToHost
);
// cudaFree(d_data);
// cudaFree(d_outputs);
return
;
}
...
...
@@ -532,6 +565,10 @@ void rtbfCleanup() {
cudaFree
(
d_data1
);
}
if
(
d_data2
!=
NULL
)
{
cudaFree
(
d_data2
);
}
if
(
d_outputs
!=
NULL
)
{
cudaFree
(
d_outputs
);
}
...
...
@@ -553,4 +590,5 @@ void rtbfCleanup() {
}
// Free up and release cublas handle
cublasDestroy
(
handle
);
}
lib/pfb/src/pfb.cu
View file @
518bb408
...
...
@@ -117,13 +117,19 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams) {
int
outDataSize
=
countFFT
*
g_iNumSubBands
*
g_iNFFT
;
//CUDASafeCallWithCleanUp(cudaMemcpy(outputData_h, fftOutPtr, outDataSize*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
//printf("making sure new build...\n");
CUDASafeCallWithCleanUp
(
cudaMemcpy
(
outputData_h
,
g_pf2FFTOut_d
,
outDataSize
*
sizeof
(
cufftComplex
),
cudaMemcpyDeviceToHost
));
return
iRet
;
}
void
flushBuffer
(
params
pfbParams
)
{
int
start
=
pfbParams
.
fine_channels
*
pfbParams
.
elements
*
pfbParams
.
nfft
*
pfbParams
.
taps
;
CUDASafeCallWithCleanUp
(
cudaMemset
((
void
*
)
g_pc2Data_d
,
0
,
start
*
2
*
sizeof
(
char
)));
return
;
}
// return true or false upon successful setup.
int
initPFB
(
int
iCudaDevice
,
params
pfbParams
){
...
...
lib/pfb/src/pfb.h
View file @
518bb408
...
...
@@ -16,7 +16,7 @@
#define FALSE 0
#define TRUE 1
#define DEBUG 1
//
#define DEBUG 1
#define DEF_CUDA_DEVICE 0
...
...
@@ -56,5 +56,6 @@ int runPFB(signed char* inputData_h, float* outputData_h, params pfbParams);
int
doFFT
();
int
resetDevice
(
void
);
void
cleanUp
(
void
);
void
flushBuffer
(
params
pfbParams
);
#endif
src/Makefile.am
View file @
518bb408
...
...
@@ -41,9 +41,15 @@ flag_x_threads = flag_net_thread.c \
flag_correlator_thread.c
\
flag_corsave_thread.c
# CPU Transpose
#flag_b_threads = flag_net_thread.c \
# flag_transpose_thread.c \
# flag_beamform_thread.c \
# flag_beamsave_thread.c
# GPU Transpose
flag_b_threads
=
flag_net_thread.c
\
flag_transpose_thread.c
\
flag_beamform_thread.c
\
flag_transpose_beamform_thread.c
\
flag_beamsave_thread.c
flag_f_threads
=
flag_net_thread.c
\
...
...
src/fifo.c
View file @
518bb408
src/flag_beamform_thread.c
View file @
518bb408
...
...
@@ -94,17 +94,16 @@ static void * run(hashpipe_thread_args_t * args) {
state
cur_state
=
ACQUIRE
;
state
next_state
=
ACQUIRE
;
int64_t
good_data
=
1
;
char
weight_flag
[
8
]
;
int
weight_flag
;
char
netstat
[
17
];
char
weight_file
[
17
];
// Indicate in shared memory buffer that this thread is ready to start
hashpipe_status_lock_safe
(
&
st
);
hputi4
(
st
.
buf
,
"
COR
READY"
,
1
);
hputi4
(
st
.
buf
,
"
RBF
READY"
,
1
);
hashpipe_status_unlock_safe
(
&
st
);
int
check_count
=
0
;
// Main loop for thread
while
(
run_threads
())
{
...
...
@@ -117,26 +116,23 @@ static void * run(hashpipe_thread_args_t * args) {
hashpipe_status_lock_safe
(
&
st
);
hgetl
(
st
.
buf
,
"CLEANB"
,
&
cleanb
);
hgets
(
st
.
buf
,
"NETSTAT"
,
16
,
netstat
);
hget
s
(
st
.
buf
,
"WFLAG"
,
8
,
weight_flag
);
hget
l
(
st
.
buf
,
"WFLAG"
,
&
weight_flag
);
hashpipe_status_unlock_safe
(
&
st
);
if
(
cleanb
==
0
&&
strcmp
(
netstat
,
"CLEANUP"
)
==
0
)
{
next_state
=
CLEANUP
;
printf
(
"BF: Entering CLEANUP state
\n
"
);
break
;
}
if
(
strcmp
(
weight_flag
,
"1"
)
==
0
){
if
(
weight_flag
)
{
hashpipe_status_lock_safe
(
&
st
);
hgets
(
st
.
buf
,
"BWEIFILE"
,
16
,
weight_file
);
hashpipe_status_unlock_safe
(
&
st
);
sprintf
(
w_dir
,
"%s\%s"
,
weightdir
,
weight_file
);
printf
(
"BF: Weight file name: %s
\n
"
,
w_dir
);
printf
(
"RTBF: Weight file name: %s
\n
"
,
w_dir
);
printf
(
"RTB: Initializing beamformer weights...
\n
"
);
// update_weights(weight_file);
printf
(
"RTBF: Initializing beamformer weights...
\n
"
);
update_weights
(
w_dir
);
printf
(
"RTB: Finished updating weights...
\n
"
);
printf
(
"RTB
F
: Finished updating weights...
\n
"
);
// Put metadata into status shared memory
float
offsets
[
BN_BEAM
];
char
cal_filename
[
65
];
...
...
@@ -169,10 +165,7 @@ static void * run(hashpipe_thread_args_t * args) {
hputs
(
st
.
buf
,
"BALGORIT"
,
algorithm
);
hputs
(
st
.
buf
,
"BWFILE"
,
weight_filename
);
hgeti4
(
st
.
buf
,
"XID"
,
&
act_xid
);
hashpipe_status_unlock_safe
(
&
st
);
hashpipe_status_lock_safe
(
&
st
);
hputs
(
st
.
buf
,
"WFLAG"
,
"0"
);
hputl
(
st
.
buf
,
"WFLAG"
,
0
);
hashpipe_status_unlock_safe
(
&
st
);
}
}
...
...
@@ -187,15 +180,14 @@ static void * run(hashpipe_thread_args_t * args) {
// If CLEANUP, don't continue processing
if
(
next_state
!=
CLEANUP
)
{
if
(
DEBUG
)
{
// Print out the header information for this block
flag_gpu_input_header_t
tmp_header
;
memcpy
(
&
tmp_header
,
&
db_in
->
block
[
curblock_in
].
header
,
sizeof
(
flag_gpu_input_header_t
));
good_data
=
tmp_header
.
good_data
;
hashpipe_status_lock_safe
(
&
st
);
hputi4
(
st
.
buf
,
"BEAMMCNT"
,
tmp_header
.
mcnt
);
hashpipe_status_unlock_safe
(
&
st
);
}
// Wait for output block to become free
while
((
rv
=
flag_gpu_beamformer_output_databuf_wait_free
(
db_out
,
curblock_out
))
!=
HASHPIPE_OK
)
{
...
...
@@ -219,18 +211,14 @@ static void * run(hashpipe_thread_args_t * args) {
printf
(
"RTBF: Warning!!!!!!!!! Time = %f ms
\n
"
,
(
float
)
tval_result
.
tv_usec
/
1000
);
}
check_count
++
;
// if(check_count == 1000){
// }
// Get block's starting mcnt for output block
db_out
->
block
[
curblock_out
].
header
.
mcnt
=
tmp_header
.
mcnt
;
db_out
->
block
[
curblock_out
].
header
.
good_data
=
good_data
;
//printf("BF: good_data = %lld\n", (long long int)good_data);
db_out
->
block
[
curblock_out
].
header
.
mcnt
=
db_in
->
block
[
curblock_in
].
header
.
mcnt
;
db_out
->
block
[
curblock_out
].
header
.
good_data
=
db_in
->
block
[
curblock_in
].
header
.
good_data
;
if
(
VERBOSE
)
{
printf
(
"BF: Setting block %d, mcnt %lld as filled
\n
"
,
curblock_out
,
(
long
long
int
)
db_out
->
block
[
curblock_out
].
header
.
mcnt
);