JQub | ECE@GMU

Design NN on ZCU102 (4) — Data Pack for Higher Bandwidth

// Payload carried by one stream beat: four single-precision floats,
// i.e. 4 bytes * 4 fields = 16 bytes per element.
struct DATA_TRANS
{
	float d1;	// first float of the beat
	float d2;	// second float of the beat
	float d3;	// third float of the beat
	float d4;	// fourth float of the beat
};

// Element type of the in/out hls::stream ports of datapack():
// one DATA_TRANS payload plus an end-of-transfer flag.
struct DMA_DATA
{
	DATA_TRANS data;	// the four packed floats for this beat
	bool last;		// datapack() sets this to 1 only on the final beat it writes
};


/*
 * datapack - HLS top-level function exercising packed (4 x float) stream beats.
 *
 * in_stream  : AXI-Stream input; 512 beats are consumed when op == 0.
 * out_stream : AXI-Stream output; 512 beats are produced when op == 2.
 * op         : AXI-Lite register selecting the phase for this invocation:
 *                0 - read 512 beats from in_stream into buffer A
 *                1 - compute buffer B from buffer A
 *                2 - write 512 beats to out_stream from buffer B
 *              Any other value makes the call a no-op.
 *
 * Each phase is a separate invocation, so A and B must keep their contents
 * between calls; they are therefore declared static.
 */
void datapack(hls::stream<DMA_DATA> &in_stream, hls::stream<DMA_DATA> &out_stream, int op){
#pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS
#pragma HLS INTERFACE s_axilite port=op bundle=CRTL_BUS
#pragma HLS INTERFACE axis port=in_stream
#pragma HLS INTERFACE axis port=out_stream

	// static: the data loaded by op 0 is consumed by a later call with op 1,
	// and the result of op 1 by a later call with op 2. A plain local array
	// would be an uninitialised read in C/C++ (and in C simulation).
	static DATA_TRANS A[512];
#pragma HLS RESOURCE variable=A core=RAM_1P_BRAM
#pragma HLS DATA_PACK variable=A field_level
//#pragma HLS ARRAY_PARTITION variable=A complete dim=1
	static DATA_TRANS B[512];
#pragma HLS RESOURCE variable=B core=RAM_1P_BRAM
#pragma HLS DATA_PACK variable=B field_level
//#pragma HLS ARRAY_PARTITION variable=B complete dim=1
	DMA_DATA input;
	DMA_DATA output;


	if(op==0){
		// Load phase: one packed beat per iteration; the incoming `last` flag is ignored.
		L1:for(int i=0;i<512;i++){
#pragma HLS pipeline II=1
			input = in_stream.read();
			A[i] = input.data;
		}
	}else if(op==1){
		// Compute phase. The literals are doubles, so each sum is evaluated in
		// double precision and then narrowed back to float on assignment.
		L2:for(int i=0;i<512;i++){
#pragma HLS pipeline II=1
			B[i].d1 = A[i].d1+0.54;
			B[i].d2 = A[i].d2+0.37;
			B[i].d3 = A[i].d4+A[i].d3+0.28;
			B[i].d4 = 0.19;
		}
	}else if(op==2){
		// Send phase: 512 beats, `last` asserted only on the final one.
		L3:for(int i=0;i<512;i++){
#pragma HLS pipeline II=1
			if(i==0){
				// The first beat carries fixed marker values instead of B[0].
				output.data.d1 = 1.2;
				output.data.d2 = 42.2;
				output.data.d3 = 23.2;
				output.data.d4 = 15.2;
			}else{
				output.data = B[i];
			}
			// Set `last` independently of the payload selection so the final
			// beat carries B[511] rather than a stale copy of B[510].
			output.last = (i==511);
			out_stream.write(output);
		}
	}
}

Each transfer moves 512 elements of 16 B each, i.e. 512*16B = 8KB.

### Bandwidth from PL to PS
First, we test the bandwidth from PL to PS.
The SDK code is shown as follows.

// Benchmark: time 131072 back-to-back PL -> PS transfers (accelerator to memory via the AXI DMA).
timer.startTimer();
	for(int i=0;i<131072;i++){
		// op = 2 selects the datapack branch that writes 512 beats to out_stream.
		XDatapack_Set_op(&do_datapack,2);
		XDatapack_Start(&do_datapack);
		// Cache maintenance is intentionally left disabled in this measurement.
//		Xil_DCacheFlushRange((INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS));
		// Start the device-to-memory DMA of trans_len_wr elements into start_addr_wr
		// (return status is not checked).
		XAxiDma_SimpleTransfer(&do_axi_dma,(INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS),XAXIDMA_DEVICE_TO_DMA);
		// Busy-wait until the DMA channel is idle and the accelerator reports done.
		while(XAxiDma_Busy(&do_axi_dma,XAXIDMA_DEVICE_TO_DMA));
		while(!XDatapack_IsDone(&do_datapack));

//		Xil_DCacheInvalidateRange((INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS));
	}
	timer.stopTimer();
	// NOTE(review): the message says "ticks" but the value comes from getElapsedTimerInSeconds().
	timeInterval = timer.getElapsedTimerInSeconds();
	printf("Receiving weights using ticks %f\n",timeInterval);

In the test, we launch 131072 data transfers from PL to PS.
The total size is 8KB*131072=1GB.
Note that we do not consider cache flush and invalidate.
The result is shown as follows.

1
2
3

Start: 0
End: 144113621
Receiving weights using ticks 1.441280

Based on the above information, we can see that the bandwidth is 1GB/1.441280 = 710.48MB/s.

### Bandwidth from PS to PL
Next, we test the bandwidth from PS to PL. The SDK code is shown as follows.

// Benchmark: time 131072 back-to-back PS -> PL transfers (memory to accelerator via the AXI DMA).
timer.startTimer();
	for(int i=0;i<131072;i++){
		// op = 0 selects the datapack branch that reads 512 beats from in_stream.
		XDatapack_Set_op(&do_datapack,0);
		XDatapack_Start(&do_datapack);

		// Cache flush is disabled here; the text after this listing reports the timing with it enabled.
//		Xil_DCacheFlushRange((INTPTR)start_addr_in,trans_len_in*sizeof(DATA_TRANS));
		// Start the memory-to-device DMA of trans_len_in elements from start_addr_in
		// (return status is not checked).
		XAxiDma_SimpleTransfer(&do_axi_dma,(INTPTR)start_addr_in,trans_len_in*sizeof(DATA_TRANS),XAXIDMA_DMA_TO_DEVICE);
		// Busy-wait until the DMA channel is idle and the accelerator reports done.
		while(XAxiDma_Busy(&do_axi_dma,XAXIDMA_DMA_TO_DEVICE));
		while(!XDatapack_IsDone(&do_datapack));
	}
	timer.stopTimer();
	timeInterval = timer.getElapsedTimerInSeconds();
	// NOTE(review): the label says "Receiving weights" although this loop sends data
	// (XAXIDMA_DMA_TO_DEVICE); the value printed is seconds, not ticks.
	printf("Receiving weights using ticks %f\n",timeInterval);

The result is shown as follows.

1
2
3

Start: 0
End: 106654906
Receiving weights using ticks 1.066656

From this, the bandwidth is 1GB/1.066656=960.01MB/s.
Note that if we add cache flush in the loop, the elapsed time will be increased to 1.777953 seconds.
That is, the bandwidth is 1GB/1.777953=575.94MB/s.