1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 struct DATA_TRANS{ float d1; float d2; float d3; float d4; };// 4B*4=16Bytes struct DMA_DATA{ DATA_TRANS data; bool last; }; void datapack(hls::stream<DMA_DATA> &in_stream, hls::stream<DMA_DATA> &out_stream, int op){ #pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS #pragma HLS INTERFACE s_axilite port=op bundle=CRTL_BUS #pragma HLS INTERFACE axis port=in_stream #pragma HLS INTERFACE axis port=out_stream DATA_TRANS A[512]; #pragma HLS RESOURCE variable=A core=RAM_1P_BRAM #pragma HLS DATA_PACK variable=A field_level //#pragma HLS ARRAY_PARTITION variable=A complete dim=1 DATA_TRANS B[512]; #pragma HLS RESOURCE variable=B core=RAM_1P_BRAM #pragma HLS DATA_PACK variable=B field_level //#pragma HLS ARRAY_PARTITION variable=B complete dim=1 DMA_DATA input; DMA_DATA output; if(op==0){ L1:for(int i=0;i<512;i++){ #pragma HLS pipeline II=1 input = in_stream.read(); A[i] = input.data; } }else if(op==1){ L2:for(int i=0;i<512;i++){ #pragma HLS pipeline II=1 B[i].d1 = A[i].d1+0.54; B[i].d2 = A[i].d2+0.37; B[i].d3 = A[i].d4+A[i].d3+0.28; B[i].d4 = 0.19; } }else if(op==2){ L3:for(int i=0;i<512;i++){ #pragma HLS pipeline II=1 output.last=0; if(i==511){ output.last=1; }else if(i==0){ output.data.d1 = 1.2; output.data.d2 = 42.2; output.data.d3 = 23.2; output.data.d4 = 15.2; }else{ output.data = B[i]; } out_stream.write(output); } } }
Each transfer moves 512 × 16 B = 8 KB of data.
### Bandwidth from PL to PS

First, we test the bandwidth from PL to PS. The SDK code is shown below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 timer.startTimer(); for(int i=0;i<131072;i++){ XDatapack_Set_op(&do_datapack,2); XDatapack_Start(&do_datapack); // Xil_DCacheFlushRange((INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS)); XAxiDma_SimpleTransfer(&do_axi_dma,(INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS),XAXIDMA_DEVICE_TO_DMA); while(XAxiDma_Busy(&do_axi_dma,XAXIDMA_DEVICE_TO_DMA)); while(!XDatapack_IsDone(&do_datapack)); // Xil_DCacheInvalidateRange((INTPTR)start_addr_wr,trans_len_wr*sizeof(DATA_TRANS)); } timer.stopTimer(); timeInterval = timer.getElapsedTimerInSeconds(); printf("Receiving weights using ticks %f\n",timeInterval);
In this test, we launch 131072 data transfers from PL to PS, for a total of 8 KB × 131072 = 1 GB. Note that the cache flush and invalidate calls are commented out, so cache-maintenance time is not included. The result is shown below.
```
Start: 0
End: 144113621
Receiving weights using ticks 1.441280
```
Based on the above, the PL-to-PS bandwidth is 1 GB / 1.441280 s ≈ 710.48 MB/s (taking 1 GB = 1024 MB).
### Bandwidth from PS to PL

Next, we test the bandwidth from PS to PL. The SDK code is shown below.
1 2 3 4 5 6 7 8 9 10 11 12 13 timer.startTimer(); for(int i=0;i<131072;i++){ XDatapack_Set_op(&do_datapack,0); XDatapack_Start(&do_datapack); // Xil_DCacheFlushRange((INTPTR)start_addr_in,trans_len_in*sizeof(DATA_TRANS)); XAxiDma_SimpleTransfer(&do_axi_dma,(INTPTR)start_addr_in,trans_len_in*sizeof(DATA_TRANS),XAXIDMA_DMA_TO_DEVICE); while(XAxiDma_Busy(&do_axi_dma,XAXIDMA_DMA_TO_DEVICE)); while(!XDatapack_IsDone(&do_datapack)); } timer.stopTimer(); timeInterval = timer.getElapsedTimerInSeconds(); printf("Receiving weights using ticks %f\n",timeInterval);
The result is shown as follows.
```
Start: 0
End: 106654906
Receiving weights using ticks 1.066656
```
From this, the PS-to-PL bandwidth is 1 GB / 1.066656 s ≈ 960.01 MB/s. Note that if we add the cache flush inside the loop, the elapsed time increases to 1.777953 seconds; that is, the bandwidth drops to 1 GB / 1.777953 s ≈ 575.94 MB/s.