Design NN on ZCU102 (4) — Data Pack for Higher Bandwidth
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// Payload of one AXI-Stream beat: four packed 32-bit floats.
// Field names/order matter: DATA_PACK in datapack() flattens this struct
// into a single 128-bit word, so the layout defines the bus format.
struct DATA_TRANS{
float d1;
float d2;
float d3;
float d4;
};// 4B*4=16Bytes

// One stream transaction: 16-byte payload plus an end-of-packet flag.
struct DMA_DATA{
DATA_TRANS data;
// NOTE(review): on the axis ports this field presumably maps to the
// AXI-Stream TLAST side channel (HLS maps a member named "last" that way);
// the AXI DMA needs TLAST to terminate an S2MM transfer — confirm in the
// generated RTL.
bool last;
};


/// Stream <-> BRAM bridge used to measure AXI DMA bandwidth on the ZCU102.
///
/// @param in_stream  AXI-Stream input (MM2S side of the DMA).
/// @param out_stream AXI-Stream output (S2MM side of the DMA).
/// @param op         Operation selector, written over AXI-Lite:
///                   0 = read 512 beats from in_stream into A[];
///                   1 = compute B[] from A[] (keeps the datapath busy);
///                   2 = write 512 beats from B[] to out_stream,
///                       asserting `last` on the final beat.
/// Any other op value is a no-op.
void datapack(hls::stream<DMA_DATA> &in_stream, hls::stream<DMA_DATA> &out_stream, int op){
#pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS
#pragma HLS INTERFACE s_axilite port=op bundle=CRTL_BUS
#pragma HLS INTERFACE axis port=in_stream
#pragma HLS INTERFACE axis port=out_stream

// `static` so the arrays keep their contents across kernel invocations in
// C simulation as well as in hardware: the host calls op=0, op=1 and op=2
// in separate transactions, so op=1/op=2 must see data written earlier.
// (In hardware the BRAMs retain state either way.)
static DATA_TRANS A[512];
#pragma HLS RESOURCE variable=A core=RAM_1P_BRAM
#pragma HLS DATA_PACK variable=A field_level
//#pragma HLS ARRAY_PARTITION variable=A complete dim=1
static DATA_TRANS B[512];
#pragma HLS RESOURCE variable=B core=RAM_1P_BRAM
#pragma HLS DATA_PACK variable=B field_level
//#pragma HLS ARRAY_PARTITION variable=B complete dim=1
DMA_DATA input;
DMA_DATA output;


if(op==0){
// op 0: drain exactly 512 beats from the input stream into A[].
L1:for(int i=0;i<512;i++){
#pragma HLS pipeline II=1
input = in_stream.read();
A[i] = input.data;
}
}else if(op==1){
// op 1: dummy arithmetic so synthesis cannot optimize the buffers away.
// NOTE(review): the double literals (0.54 etc.) force float->double->float
// conversions in HLS; `0.54f` would be cheaper but changes rounding, so it
// is left as-is to preserve results.
L2:for(int i=0;i<512;i++){
#pragma HLS pipeline II=1
B[i].d1 = A[i].d1+0.54;
B[i].d2 = A[i].d2+0.37;
B[i].d3 = A[i].d4+A[i].d3+0.28;
B[i].d4 = 0.19;
}
}else if(op==2){
// op 2: emit 512 beats toward the DMA (S2MM).  Beat 0 carries fixed
// marker values; the remaining beats carry B[i].
L3:for(int i=0;i<512;i++){
#pragma HLS pipeline II=1
if(i==0){
output.data.d1 = 1.2;
output.data.d2 = 42.2;
output.data.d3 = 23.2;
output.data.d4 = 15.2;
}else{
output.data = B[i];
}
// BUG FIX: the original set only `last` when i==511, leaving
// `output.data` holding the previous iteration's payload — B[510] was
// sent twice and B[511] never sent.  The payload is now always
// assigned above, and `last` marks the final beat.
output.last = (i==511);
out_stream.write(output);
}
}
}

The size of data transmission at each time is: 512*16B=8KB.

###Bandwidth from PL to PS
First, we test the bandwidth from PL to PS.
The SDK code is shown as follows.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// PL->PS throughput test: 131072 back-to-back transfers of 8 KB each,
// 1 GB total (see the surrounding text).  Excerpt from a larger program:
// timer, do_datapack, do_axi_dma, start_addr_wr and trans_len_wr are
// declared elsewhere — trans_len_wr is presumably 512 so that
// trans_len_wr*sizeof(DATA_TRANS) == 8 KB; TODO confirm against the caller.
timer.startTimer();
for(int i=0;i<131072;i++){
// op=2: the kernel streams 512 beats out of B[] toward the DMA.
XDatapack_Set_op(&do_datapack,2);
XDatapack_Start(&do_datapack);
// Cache maintenance deliberately disabled for this measurement (see text).
// Xil_DCacheFlushRange((INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS));
// Queue a DEVICE_TO_DMA (S2MM, PL->PS) transfer into start_addr_wr.
XAxiDma_SimpleTransfer(&do_axi_dma,(INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS),XAXIDMA_DEVICE_TO_DMA);
// Busy-wait until both the DMA engine and the HLS kernel are done.
while(XAxiDma_Busy(&do_axi_dma,XAXIDMA_DEVICE_TO_DMA));
while(!XDatapack_IsDone(&do_datapack));

// Xil_DCacheInvalidateRange((INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS));
}
timer.stopTimer();
timeInterval = timer.getElapsedTimerInSeconds();
printf("Receiving weights using ticks %f\n",timeInterval);

In the test, we launch 131072 times of data transmission from PL to PS.
The total size is 8KB*131072=1GB.
Note that we do not consider cache flush and invalidate.
The result is shown as follows.

1
2
3
Start: 0
End: 144113621
Receiving weights using ticks 1.441280

Based on the above information, we can see that the bandwidth is 1GB/1.441280 = 710.48MB/s.

###Bandwidth from PS to PL
Next, we test the bandwidth from PS to PL. The SDK code is shown as follows.

1
2
3
4
5
6
7
8
9
10
11
12
13
// PS->PL throughput test: mirror of the PL->PS loop above in the article —
// 131072 transfers of 8 KB, 1 GB total.  Excerpt from a larger program:
// timer, do_datapack, do_axi_dma, start_addr_in and trans_len_in are
// declared elsewhere — trans_len_in is presumably 512; TODO confirm.
timer.startTimer();
for(int i=0;i<131072;i++){
// op=0: the kernel reads 512 beats from the input stream into A[].
XDatapack_Set_op(&do_datapack,0);
XDatapack_Start(&do_datapack);

// Cache flush deliberately disabled here; the text reports the slower
// timing (1.777953 s) when it is enabled.
// Xil_DCacheFlushRange((INTPTR)start_addr_in,trans_len_in*sizeof(DATA_TRANS));
// Queue a DMA_TO_DEVICE (MM2S, PS->PL) transfer from start_addr_in.
XAxiDma_SimpleTransfer(&do_axi_dma,(INTPTR)start_addr_in,trans_len_in*sizeof(DATA_TRANS),XAXIDMA_DMA_TO_DEVICE);
// Busy-wait until both the DMA engine and the HLS kernel are done.
while(XAxiDma_Busy(&do_axi_dma,XAXIDMA_DMA_TO_DEVICE));
while(!XDatapack_IsDone(&do_datapack));
}
timer.stopTimer();
timeInterval = timer.getElapsedTimerInSeconds();
printf("Receiving weights using ticks %f\n",timeInterval);

The result is shown as follows.

1
2
3
Start: 0
End: 106654906
Receiving weights using ticks 1.066656

Then, we can calculate the bandwidth is 1GB/1.066656=960.01MB/s.
Note that if we add cache flush in the loop, the elapsed time will be increased to 1.777953 seconds.
That is, the bandwidth is 1GB/1.777953=575.94MB/s.