Skip to content

Instantly share code, notes, and snippets.

@franout
Last active May 13, 2020 22:02
Show Gist options
  • Select an option

  • Save franout/5e1fc80fc8242c6fb6e639a2af52b132 to your computer and use it in GitHub Desktop.

Select an option

Save franout/5e1fc80fc8242c6fb6e639a2af52b132 to your computer and use it in GitHub Desktop.
create .so library from PYNQ python code for DTPU accelerator
import cffi
##########################################################################
#### create .so library from PYNQ python code for DTPU accelerator ######
##########################################################################
# Embedded-mode CFFI build: the C side (a TFLite delegate) calls back into
# Python (PYNQ overlay control) through the extern "Python" callbacks
# declared below and implemented in ffibuilder.embedding_init_code.
ffibuilder = cffi.FFI()
# Declarations shared by both sides:
#  - the extern "Python" block lists the callbacks implemented in Python;
#  - the trailing prototypes are the entry points TFLite dlopen()s from
#    the resulting shared object.
ffibuilder.cdef("""
extern "Python" {
void destroy_p(void * );
bool CopyFromBufferHandle_p(void);
bool CopyToBufferHandle_p(void);
void FreeBufferHandle_p(void);
bool SelectDataTypeComputation_p(int);
bool Init_p(int ,int,int);
bool Prepare_p(int,int ,int);
bool Invoke_p(int *,int,int*,int);
void load_overlay(void);
bool ResetHardware_p(void);
void push_weight_to_heap( void *,int *,int);
};
void * tflite_plugin_create_delegate();
void tflite_plugin_destroy_delegate(void * );
bool SelectDataTypeComputation(int);""")
# C++ implementation of the delegate. It forwards all real work to the
# Python callbacks declared above (the *_p functions).
ffibuilder.set_source("dtpu_lib", r"""
#include <tensorflow/lite/c/c_api.h>
#include <tensorflow/lite/c/c_api_experimental.h>
#include <tensorflow/lite/context_util.h>
#include <vector>
/* Forward declarations of the Python callbacks generated by cffi
 * (implemented via @ffi.def_extern in the embedding code below). */
static void destroy_p(void * delegate);
static bool CopyFromBufferHandle_p(void);
static bool CopyToBufferHandle_p(void);
static void FreeBufferHandle_p(void);
static bool SelectDataTypeComputation_p(int);
static bool Init_p(int,int,int );
static bool Prepare_p(int, int,int );
static bool Invoke_p(int *,int,int*,int);
static void load_overlay(void);
static bool ResetHardware_p(void);
static void push_weight_to_heap( void *,int *,int);
/*
possible operations
kTfLiteBuiltinAdd = 0,
kTfLiteBuiltinConcatenation = 2,
kTfLiteBuiltinConv2d = 3,
kTfLiteBuiltinDepthwiseConv2d = 4,
kTfLiteBuiltinDepthToSpace = 5,
kTfLiteBuiltinFullyConnected = 9,
kTfLiteBuiltinMul = 18,
kTfLiteBuiltinSub = 41,
kTfLiteBuiltinDelegate = 51,
kTfLiteBuiltinAddN = 106,
*/
namespace tflite{
class TfLiteIntArrayView;
}
extern "C" {
// This is where the execution of the operations or whole graph happens.
// The delegate kernel: TFLite hands the selected subgraphs to this class,
// which forwards the actual work to the Python callbacks (*_p functions).
class DTPU_delegate {
 public:
  // Returns true if my delegate can handle this type of op.
  static bool SupportedOp(const TfLiteRegistration* registration) {
    // from builtin_ops.h
    printf("[DEBUG - C]--- Supported Operation of DTPU delegate class --- \n");
    switch (registration->builtin_code) {
      case kTfLiteBuiltinConv2d:
      case kTfLiteBuiltinDepthwiseConv2d:
        printf("[DEBUG - C]--Hello world! I can make 2D convolution and depth wise 2D convolution---\n");
        return true;
      default:
        return false;
    }
  }

  // Any initialization code needed: instantiate buffers and soft-reset the
  // accelerator on the Python side.
  bool Init(TfLiteContext* context, const TfLiteDelegateParams* delegate_params) {
    printf("[DEBUG - C]--- Init of DTPU delegate class --- \n");
    return Init_p(context->tensors_size,
                  delegate_params->input_tensors->size,
                  delegate_params->output_tensors->size);
  }

  // Any preparation work needed (e.g. allocate buffers). Weight tensors
  // (allocation_type == kTfLiteMmapRo) are pushed to the Python heap so
  // Prepare_p can pack and DMA them to the accelerator's weight memory.
  TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
    printf("[DEBUG - C]--- Prepare of DTPU delegate class --- \n");
    int num_weight_tensor = 0;
    for (int i = 0; i < context->tensors_size; i++) {
      TfLiteTensor tmp = context->tensors[i];
      if (tmp.allocation_type != kTfLiteMmapRo) continue;
      printf("[DEBUG -C]---found a tensor weight----\n");
      // Bug fix: every case below originally fell through (no break), so a
      // single tensor executed all following cases as well -- e.g. a uint8
      // tensor was also pushed as int8 and counted twice, and the default
      // error message was printed for every tensor.
      switch (tmp.type) {
        case kTfLiteFloat32:
          printf("[DEBUG-C]---- kTfLitefloat32 ------\n");
          // push_weight_to_heap(tmp.data.f,tmp.dims->data,tmp.dims->size);
          break;
        case kTfLiteInt32:
          printf("[DEBUG-C]---- kTfLiteInt32 ------\n");
          // push_weight_to_heap(tmp.data.i32,tmp.dims->data,tmp.dims->size);
          break;
        case kTfLiteUInt8:
          printf("[DEBUG-C]---- kTfLiteUint8 ------\n");
          push_weight_to_heap(tmp.data.uint8, tmp.dims->data, tmp.dims->size);
          num_weight_tensor++;
          break;
        case kTfLiteInt64:
          printf("[DEBUG-C]---- kTfLiteInt64------\n");
          // push_weight_to_heap(tmp.data.i64,tmp.dims->data,tmp.dims->size);
          break;
        case kTfLiteString:
          printf("[DEBUG-C]---- kTfLiteString ------\n");
          // push_weight_to_heap(tmp.data.raw,tmp.dims->data,tmp.dims->size);
          break;
        case kTfLiteBool:
          printf("[DEBUG-C]---- kTfLiteBool ------\n");
          // push_weight_to_heap(tmp.data.b,tmp.dims->data,tmp.dims->size);
          break;
        case kTfLiteInt16:
          printf("[DEBUG-C]---- kTfLiteInt16 ------\n");
          // push_weight_to_heap(tmp.data.i16,tmp.dims->data,tmp.dims->size);
          break;
        case kTfLiteComplex64:
          printf("[DEBUG-C]---- kTfLiteComplex64 ------\n");
          // push_weight_to_heap(tmp.data.c64,tmp.dims->data,tmp.dims->size);
          break;
        case kTfLiteInt8:
          printf("[DEBUG-C]---- kTfLiteInt8 ------\n");
          push_weight_to_heap(tmp.data.int8, tmp.dims->data, tmp.dims->size);
          num_weight_tensor++;
          break;
        case kTfLiteFloat16:
          printf("[DEBUG-C]---- kTfLiteF16 ------\n");
          // push_weight_to_heap(tmp.data.f16,tmp.dims->data,tmp.dims->size);
          break;
        default:
          printf("[DEBUG -C]---- error no type for the weight ----\n");
      }
    }
    if (Prepare_p(node->inputs->size, node->outputs->size, num_weight_tensor)) {
      return kTfLiteOk;
    }
    return kTfLiteError;
  }

  // Actual running of the delegate subgraph: streams data to/from the
  // accelerator and runs inference on the Python side.
  TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
    printf("[DEBUG - C]--- Invoke of DTPU delegate class --- \n");
    if (Invoke_p(node->inputs->data, node->inputs->size,
                 node->outputs->data, node->outputs->size)) {
      return kTfLiteOk;
    }
    return kTfLiteError;
  }
};
// Selects the arithmetic data type the accelerator computes with by
// forwarding to the Python callback.
// TODO: use TfLiteType instead of int.
TfLiteStatus SelectDataTypeComputation(int data_type ){
  printf("[DEBUG - C]--- SelectDataTypeComputation of DTPU delegate class --- \n");
  return SelectDataTypeComputation_p(data_type) ? kTfLiteOk : kTfLiteError;
}
// Soft-resets the accelerator hardware via the Python callback.
TfLiteStatus ResetHardware( ){
  printf("[DEBUG - C]--- Reset underlaying hardware --- \n");
  return ResetHardware_p() ? kTfLiteOk : kTfLiteError;
}
// Create the TfLiteRegistration for the Kernel node which will replace
// the subgraph in the main TfLite graph.
TfLiteRegistration GetMyDelegateNodeRegistration() {
  // The registration is treated as an OP node:
  //   init    -> build the DTPU_delegate instance
  //   prepare -> buffer allocation / weight download
  //   invoke  -> run the delegate subgraph
  //   free    -> any cleanup needed by the delegate
  printf("[DEBUG - C] --- get delegate node registration function ---\n");
  // Bug fix: value-initialize so members not set explicitly below
  // (e.g. profiling_string, version) are null instead of indeterminate --
  // TFLite may read them.
  TfLiteRegistration kernel_registration = {};
  kernel_registration.builtin_code = kTfLiteBuiltinDelegate;
  kernel_registration.custom_name = "DTPU_delegate";
  kernel_registration.free = [](TfLiteContext* context, void* buffer) -> void {
    delete reinterpret_cast<DTPU_delegate*>(buffer);
  };
  kernel_registration.init = [](TfLiteContext* context, const char* buffer,
                                size_t) -> void* {
    // In the node init phase, initialize the DTPU_delegate instance.
    const TfLiteDelegateParams* delegate_params =
        reinterpret_cast<const TfLiteDelegateParams*>(buffer);
    DTPU_delegate* my_delegate = new DTPU_delegate;
    if (!my_delegate->Init(context, delegate_params)) {
      // NOTE(review): my_delegate leaks on this path -- confirm whether
      // TFLite tolerates a null user_data before adding a delete here.
      return nullptr;
    }
    return my_delegate;
  };
  kernel_registration.invoke = [](TfLiteContext* context,
                                  TfLiteNode* node) -> TfLiteStatus {
    DTPU_delegate* kernel = reinterpret_cast<DTPU_delegate*>(node->user_data);
    return kernel->Invoke(context, node);
  };
  kernel_registration.prepare = [](TfLiteContext* context,
                                   TfLiteNode* node) -> TfLiteStatus {
    DTPU_delegate* kernel = reinterpret_cast<DTPU_delegate*>(node->user_data);
    return kernel->Prepare(context, node);
  };
  return kernel_registration;
}
// TfLiteDelegate methods
// interface to tensorflow runtime
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
// Claim all nodes that can be evaluated by the delegate and ask the
// framework to update the graph with delegate kernel instead.
// Reserve 1 element, since we need first element to be size
// (TfLiteIntArray memory layout is [size, data...]).
printf("[DEBUG - C] ---- preparing the delegate ----\n");
std::vector<int> supported_nodes(1);
TfLiteIntArray* plan;
TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
TfLiteNode* node;
TfLiteRegistration* registration;
// Walk the execution plan and collect every node this delegate supports.
for (int node_index : tflite::TfLiteIntArrayView(plan) ) {
TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
context, node_index, &node, &registration));
if (DTPU_delegate::SupportedOp(registration)) {
supported_nodes.push_back(node_index);
}
}
// Set first element to the number of nodes to replace.
supported_nodes[0] = supported_nodes.size() - 1;
TfLiteRegistration my_delegate_kernel_registration =
GetMyDelegateNodeRegistration();
// This call split the graphs into subgraphs, for subgraphs that can be
// handled by the delegate, it will replace it with a
// 'my_delegate_kernel_registration'.
// NOTE(review): the cast below relies on std::vector<int> storage matching
// the TfLiteIntArray [size, data...] layout -- standard TFLite sample
// pattern, but worth confirming against the TFLite version in use.
return context->ReplaceNodeSubsetsWithDelegateKernels(
context, my_delegate_kernel_registration,
reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()), delegate);
}
// Called by TFLite to release a buffer handle; forwards the cleanup to the
// Python side (which frees the DMA buffers and driver handles).
void FreeBufferHandle(TfLiteContext* context, TfLiteDelegate* delegate,
TfLiteBufferHandle* handle) {
printf("[DEBUG - C]--- Do any cleanups---\n");
FreeBufferHandle_p();
}
// Copies data from a tensor into the delegate buffer if needed; the actual
// work is delegated to the Python callback (currently a stub there).
TfLiteStatus CopyToBufferHandle(TfLiteContext* context,
                                TfLiteDelegate* delegate,
                                TfLiteBufferHandle buffer_handle,
                                TfLiteTensor* tensor) {
  printf("[DEBUG - C]--- Copies data from tensor to delegate buffer if needed.----\n");
  return CopyToBufferHandle_p() ? kTfLiteOk : kTfLiteError;
}
// Copies data from the delegate buffer back into the tensor's raw memory;
// the actual work is delegated to the Python callback (currently a stub).
TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
                                  TfLiteDelegate* delegate,
                                  TfLiteBufferHandle buffer_handle,
                                  TfLiteTensor* tensor) {
  printf("[DEBUG - C]---Copies the data from delegate buffer into the tensor raw memory----\n");
  return CopyFromBufferHandle_p() ? kTfLiteOk : kTfLiteError;
}
// instantiate the delegate, it returns null if there is an error
TfLiteDelegate * tflite_plugin_create_delegate()
//char** argv , char** argv2, size_t argc, void (*report_error)(const char *) )
{
  // Bug fix: value-initialize ("()") so any TfLiteDelegate fields not
  // assigned below are zeroed instead of indeterminate -- TFLite may
  // inspect them.
  TfLiteDelegate* delegate = new TfLiteDelegate();
  delegate->data_ = nullptr;
  delegate->flags = kTfLiteDelegateFlagsNone;
  delegate->Prepare = &DelegatePrepare;
  // This cannot be null.
  delegate->CopyFromBufferHandle = &CopyFromBufferHandle;
  // This can be null.
  delegate->CopyToBufferHandle = &CopyToBufferHandle;
  // This can be null.
  delegate->FreeBufferHandle = &FreeBufferHandle;
  // load the PYNQ overlay (bitstream) before first use of the accelerator
  load_overlay();
  printf("[DEBUG - C] ---the delegate method of DTPU is born---\n");
  return delegate;
}
void tflite_plugin_destroy_delegate(void * delegate_op ) {
// destroy the delegate
TfLiteDelegate * delegate= (TfLiteDelegate *) delegate_op;
printf("[DEBUG - C]-----cleaning memory -> callback of python function---\n");
//destroy_p(delegate);
free(delegate);
}
}
""",source_extension=".cpp")
#if you want to simply access a global variable you just use its name.
# However to change its value you need to use the global keyword.
ffibuilder.embedding_init_code("""
from dtpu_lib import ffi
from pynq import Overlay
from pynq import allocate
from pynq import MMIO
from pynq import Xlnk
from pynq.lib import dma
import numpy as np
#########################################
############ MEMORY MAP #################
#########################################
BASE_ADDRESS_ACCELERATOR=0x43C00000
ADDRESS_RANGE_ACCELERATOR=0x10000
# address reg offset
CTRL =0x0000
STATUS =0x0004
IARG_RQT_EN =0x0010
OARG_RQT_EN =0x0014
CMD =0x0028
OARG_LENGTH_MODE =0x003C
ISCALAR_FIFO_RST =0x0040
OSCALAR_FIFO_RST =0x0044
ISCALAR_RQT_EN =0x0048
OSCALAR_RQT_EN =0x004C
ISCALAR0_DATA =0x0080
ISCALAR1_DATA =0x0084
ISCALAR2_DATA =0x0088
ISCALAR3_DATA =0x008C
ISCALAR4_DATA =0x0090
ISCALAR5_DATA =0x0094
ISCALAR6_DATA =0x0098
ISCALAR7_DATA =0x009C
ISCALAR8_DATA =0x00A0
ISCALAR9_DATA =0x00A4
ISCALAR10_DATA=0x00A8
ISCALAR11_DATA =0x00AC
ISCALAR12_DATA =0x00B0
ISCALAR13_DATA =0x00B4
ISCALAR14_DATA =0x00B8
ISCALAR15_DATA =0x00BC
OSCALAR0_DATA =0x00C0
OSCALAR1_DATA =0x00C4
OSCALAR2_DATA =0x00C8
OSCALAR3_DATA =0x00CC
OSCALAR4_DATA =0x00D0
OSCALAR5_DATA =0x00D4
OSCALAR6_DATA =0x00D8
OSCALAR7_DATA =0x00DC
IARG0_STATUS =0x0100
IARG1_STATUS =0x0104
IARG2_STATUS =0x0108
IARG3_STATUS =0x010C
IARG4_STATUS =0x0110
IARG5_STATUS =0x0114
IARG6_STATUS =0x0118
IARG7_STATUS =0x011C
OARG0_STATUS =0x0140
OARG1_STATUS =0x0144
OARG2_STATUS =0x0148
OARG3_STATUS =0x014C
OARG4_STATUS =0x0150
OARG5_STATUS =0x0154
OARG6_STATUS =0x0158
OARG7_STATUS =0x015C
ISCALAR0_STATUS =0x0180
ISCALAR1_STATUS =0x0184
ISCALAR2_STATUS =0x0188
ISCALAR3_STATUS =0x018C
ISCALAR4_STATUS =0x0190
ISCALAR5_STATUS =0x0194
ISCALAR6_STATUS =0x0198
ISCALAR7_STATUS =0x019C
ISCALAR8_STATUS =0x01A0
ISCALAR9_STATUS =0x01A4
ISCALAR10_STATUS =0x01A8
ISCALAR11_STATUS =0x01AC
ISCALAR12_STATUS =0x01B0
ISCALAR13_STATUS =0x01B4
ISCALAR14_STATUS =0x01B8
ISCALAR15_STATUS =0x01BC
OSCALAR0_STATUS =0x01C0
OSCALAR1_STATUS =0x01C4
OSCALAR2_STATUS =0x01C8
OSCALAR3_STATUS =0x01CC
OSCALAR4_STATUS =0x01D0
OSCALAR5_STATUS =0x01D4
OSCALAR6_STATUS =0x01D8
OSCALAR7_STATUS =0x01DC
OSCALAR8_STATUS =0x01E0
OSCALAR9_STATUS =0x01E4
OSCALAR10_STATUS =0x01E8
OSCALAR11_STATUS =0x01EC
OSCALAR12_STATUS =0x01F0
OSCALAR13_STATUS =0x01F4
OSCALAR14_STATUS =0x01F8
OSCALAR15_STATUS =0x01FC
OARG0_LENGTH =0x0200
OARG1_LENGTH =0x0204
OARG2_LENGTH =0x0208
OARG3_LENGTH =0x020C
OARG4_LENGTH =0x0210
OARG5_LENGTH =0x0214
OARG6_LENGTH =0x0218
OARG7_LENGTH =0x021C
OARG0_TDEST =0x0240
OARG1_TDEST =0x0244
OARG2_TDEST =0x0248
OARG3_TDEST =0x024C
OARG4_TDEST =0x0250
OARG5_TDEST =0x0254
OARG6_TDEST =0x0258
OARG7_TDEST =0x025C
#####################################################
########## CSR DEFINITIONS ##########
########## MEMORY MAP ##########
########## bitwidth 8 ##########
########## see csr_definition.vh ##########
#####################################################
ARITHMETIC_PRECISION=0
FP_MODE=1
BATCH_SIZE=2 # aka active rows
TRANSPARENT_DELAY_REGISTER=3
DEBUG=4
TEST_OPTIONS=5
ACTIVATE_CHAIN=0x1
INT8=0x1
INT16=0X03
INT32=0x07
INT64=0x0F
# precision of fp computation is tuned using the
# integer precision
ACTIVE_FP=1<<0
ACTIVE_BFP=0x03
ROUNDING=0x00
NO_FP=0x00
WMEM_STARTING_ADDRESS=0 #32 MSB
##############################################
#### accelerator adapter command #############
##############################################
CMD_UPDATE_IN_ARDG=0x0
CMD_UPDATE_OUT_ARG=0x1
CMD_EXECUTE_STEP=0x2
CMD_EXECUTE_CONTINUOS=0x4
CMD_STOP_EXECUTE_CONTINOUS=0x5
##############################################
BASE_ADDRESS_INTC=0x40800000
ADDRESS_RANGE_INTC=0x10000
BASE_ADDRESS_DMA_INFIFO=0x40400000
ADDRESS_RANGE_DMA_INFIFO=0x10000
BASE_ADDRESS_DMA_WM=0x40410000
ADDRESS_RANGE_DMA_WM=0x10000
##############################################
#### module state (filled in at runtime) #####
##############################################
# All None/empty until load_overlay() and Prepare_p() run.
accelerator=None          # MMIO handle of the accelerator adapter
input_fifo_buffer=None    # DMA buffer towards the input fifo
output_fifo_buffer=None   # DMA buffer from the output fifo
weight_buffer=None        # DMA buffer towards the weight memory
csr_buffer=None           # DMA buffer holding the CSR words
overlay=None              # the loaded PYNQ Overlay
driver_csr=None           # DMA driver for the CSR channel
driver_wm=None            # DMA driver for the weight memory
driver_fifo_in=None       # DMA driver for the input fifo
driver_fifo_out=None      # DMA driver for the output fifo
#####################################
### DESIGN DEPENDENT DEFINITION #####
#####################################
WMEM_SIZE=131072          # weight buffer length, in buffer elements
INFIFO_SIZE=2048          # input fifo buffer length
OUTFIFO_SIZE=2048         # output fifo buffer length
ROWS=8                    # accelerator rows (design parameter)
COLUMNS=8                 # accelerator columns (design parameter)
DATAWIDTH=64              # bits per packed buffer word
# NOTE(review): initialised to INT8 (==0x1) but the packing code uses this
# as a bit width (64/curr_data_precision) -- see SelectDataTypeComputation_p.
curr_data_precision=INT8
size_tot=0                # total tensor count (set in Init_p)
input_size=0              # input tensor count (set in Init_p)
output_size=0             # output tensor count (set in Init_p)
num_weight=0              # weight tensor count (set in Prepare_p)
global_iteration=0        # number of accelerator runs (set in Prepare_p)
global_iteration_shift_wm=[]  # per-run weight-memory offsets
tensors=[]                # Tensor objects queued by push_weight_to_heap
class Tensor:
    """Host-side copy of a weight tensor pushed from the C Prepare step."""
    def __init__(self,data, tot_dim,size_l):
        self.tot_dim=tot_dim    # element count used by Prepare_p's packing loop
        self.data=data          # flat list of the tensor's values
        self.size_l=size_l      # list of dimension sizes
######################################
############ LOAD DESIGN #############
######################################
@ffi.def_extern()
def load_overlay():
    """Download the DTPU bitstream to the PL and configure the
    accelerator-adapter CSRs.

    Called once from tflite_plugin_create_delegate on the C side;
    exits the process if the overlay fails to load.
    """
    global accelerator
    global overlay
    overlay = Overlay("/home/xilinx/pynqz2.bit") # tcl is also parsed
    overlay.download() # Explicitly download bitstream to PL
    if overlay.is_loaded():
        # Checks if a bitstream is loaded
        print("[DEBUG- PYTHON] ----overlay is loaded ----")
    else :
        print("[DEBUG- PYTHON] ---- overlay is not loaded----")
        exit(-1)
    accelerator=overlay.dtpu.axis_accelerator_ada
    ## soft reset: pulse bit 0 of CTRL
    accelerator.write(CTRL,0x0000001)
    accelerator.write(CTRL,0x0000000)
    accelerator.write(IARG_RQT_EN,0x000000007) ## all data available: csr, weights and data
    accelerator.write(OARG_RQT_EN,1) # out fifo must be empty
    accelerator.write(OARG_LENGTH_MODE,0) # hardware mode
    accelerator.write(ISCALAR_RQT_EN,0) # NO input SCALAR
    accelerator.write(OSCALAR_RQT_EN,0) # no output scalar
    accelerator.write(OARG0_TDEST,0) # only one output
@ffi.def_extern()
def Init_p(tot_tensors,input_tens_size,output_tens_size):
    """Record the tensor counts and soft-reset the accelerator.

    Called from DTPU_delegate::Init on the C side; always returns True.
    """
    global accelerator
    global overlay
    global size_tot
    global input_size
    global output_size
    print("[DEBUG - PYTHON] --- Init p function ---")
    overlay.reset()
    # pulse bit 0 of CTRL: soft reset of the accelerator adapter
    accelerator.write(CTRL,0x0000001)
    accelerator.write(CTRL,0x0000000)
    size_tot=tot_tensors
    print("[DEBUG-PYTHON]--- total tensors",size_tot,"---")
    input_size=input_tens_size
    print("[DEBUG-PYTHON]--- int tensors",input_size,"---")
    output_size=output_tens_size
    print("[DEBUG-PYTHON]--- out tensors",output_tens_size,"---")
    return True
@ffi.def_extern()
def SelectDataTypeComputation_p(data_type):
    """Program the arithmetic precision into the CSR buffer and DMA it
    to the accelerator.

    Returns True on success, False when the csr buffer has not been
    allocated yet (Prepare_p must run first).
    """
    global csr_buffer
    global curr_data_precision
    print("[DEBUG - PYTHON ] --- SelectDataTypeComputation DTPU class ---")
    # Bug fix: the original printed the error below but then fell through
    # to csr_buffer.flush() anyway, crashing with AttributeError on None.
    if csr_buffer is None:
        print("[DEBUG-PYTHON]----csr buffer does not exist! create before calling this function----")
        return False
    # modify the csr buffer
    if data_type!=0:
        csr_buffer[ARITHMETIC_PRECISION]= data_type
        if ((data_type)&0x0000ff)==INT8:
            curr_data_precision=8
    else:
        csr_buffer[ARITHMETIC_PRECISION]= (NO_FP<<8)|(ACTIVATE_CHAIN<<4)| INT8
        # Bug fix: was set to INT8 (==0x1); the packing code in
        # Prepare_p/Invoke_p uses this value as a bit width
        # (64/curr_data_precision), so the 8-bit default must be 8.
        curr_data_precision=8
        print("[DEBUG-PYTHON]----precision default 8 bit----")
    csr_buffer.flush()
    ################################################
    ###### program the dma for the csr reg #########
    ################################################
    print("[DEBUG-PYTHON]--- transfering csr buffer ----")
    driver_csr.sendchannel.transfer(csr_buffer)
    driver_csr.sendchannel.wait()
    # Bug fix: the C declaration is 'bool SelectDataTypeComputation_p(int)'
    # but the original returned None.
    return True
@ffi.def_extern()
def CopyFromBufferHandle_p():
    """Copy data from the delegate buffer back to the tensor.

    Currently a stub: no data is moved, it only reports success.
    """
    print("[DEBUG - PYTHON ] --- the from delegate and buffers ---")
    return True
@ffi.def_extern()
def CopyToBufferHandle_p():
    """Copy data from the tensor into the delegate buffer.

    Currently a stub: no data is moved, it only reports success.
    """
    print("[DEBUG - PYTHON ] --- copying to the delegate and buffers ---")
    return True
@ffi.def_extern()
def FreeBufferHandle_p():
    """Release all DMA buffers and drop the DMA driver handles.

    Called from the C FreeBufferHandle callback.
    """
    global input_fifo_buffer
    global output_fifo_buffer
    global csr_buffer
    global weight_buffer
    global driver_csr
    global driver_wm
    global driver_fifo_in
    global driver_fifo_out
    print("[DEBUG - PYTHON ] --- freeing buffers ---")
    input_fifo_buffer.freebuffer()
    output_fifo_buffer.freebuffer()
    csr_buffer.freebuffer()
    weight_buffer.freebuffer()
    # NOTE(review): 'del' only removes this module's reference to the
    # drivers; actual teardown relies on pynq's cleanup on GC -- confirm.
    del driver_csr
    del driver_wm
    del driver_fifo_in
    del driver_fifo_out
@ffi.def_extern()
def push_weight_to_heap(tensor,size,dim_size):
    """Copy a weight tensor (raw pointer + dims) from the C delegate into
    the Python-side 'tensors' queue, consumed later by Prepare_p.

    tensor   -- void* to the tensor's raw data
    size     -- int* to the dims array
    dim_size -- number of dimensions
    """
    global tensors
    # push the tensor to the heap for handling their transfer in Prepare_p
    # NOTE(review): the data is always read through an int* regardless of
    # the tensor's real dtype; the C side currently forwards only
    # uint8/int8 tensors -- confirm the element width matches.
    tensor_i=ffi.cast("int *",tensor)
    size_l=[]
    tot_size=1
    for i in range(dim_size):
        size_l.append(size[i])
        # Bug fix: the total element count is the PRODUCT of the dims;
        # the original accumulated with '+=' (1 + sum of dims), reading
        # the wrong number of elements.
        tot_size*=size[i]
    data_p=[]
    for i in range(tot_size):
        data_p.append(tensor_i[i])
    tensors.append(Tensor(data_p,tot_size,size_l))
@ffi.def_extern()
def Prepare_p(input_size,output_size,weight_num):
    """Allocate the DMA buffers, pack the queued weight tensors into
    64-bit words and stream them to the accelerator's weight memory.

    Called from DTPU_delegate::Prepare; returns True on success.
    """
    global input_fifo_buffer
    global output_fifo_buffer
    global weight_buffer
    global csr_buffer
    global overlay
    global driver_wm
    global driver_csr
    global driver_fifo_in
    global driver_fifo_out
    global num_weight
    global global_iteration
    global global_iteration_shift_wm
    global curr_data_precision
    global tensors
    print("[DEBUG - PYTHON ] --- Prepare p of DTPU class ---")
    print("[DEBUG - PYTHON ] --- in size",input_size,"output size",output_size," ---")
    # allocate buffers for data transfer (numpy dtype 'u8' == uint64 words)
    num_weight=weight_num
    input_fifo_buffer = allocate(shape=(INFIFO_SIZE,),dtype='u8')
    # Bug fix: the output fifo was sized with INFIFO_SIZE; the two are
    # equal in this design but the intent is the output depth.
    output_fifo_buffer=allocate(shape=(OUTFIFO_SIZE,),dtype='u8')
    weight_buffer=allocate(shape=(WMEM_SIZE,),dtype='u8')
    csr_buffer=allocate(shape=(64,),dtype='u8')
    driver_wm=overlay.axi_dma_weight_mem
    driver_csr=overlay.axi_dma_csr_mem
    driver_fifo_in=overlay.axi_dma_infifo
    driver_fifo_out=overlay.axi_dma_outfifo
    #######################################################################
    ########### populate buffers pack depending on the precision #########
    #######################################################################
    weight_buffer[:]=0                      # the '|=' below needs a zeroed buffer
    per_word=int(64/curr_data_precision)    # elements packed per 64-bit word
    mask=(1<<curr_data_precision)-1         # keep only the element's bits
    index=0
    n_runs=0                                # renamed from 'iter' (shadowed builtin)
    for i in range(num_weight):
        tmp=tensors[i]
        shift=0
        for j in range(tmp.tot_dim):
            if shift>=per_word:
                # current word full: move to the next one
                index=index+1
                shift=0
            if index>=WMEM_SIZE:
                # robustness: never write past the weight buffer
                print("[DEBUG-PYTHON]--- weight memory full, truncating ----")
                break
            # Bug fix: the original cast the shifted value to np.uint8,
            # truncating every element packed at shift>0 to zero; the
            # buffer words are 64-bit, so cast to uint64.
            weight_buffer[index]|=np.uint64((tmp.data[j]&mask)<<(shift*curr_data_precision))
            # Bug fix: the original 'else' branch wrote at shift 0 without
            # advancing, so the next element OR-merged into the same slot.
            shift=shift+1
        # Bug fix: start the next tensor on a fresh 64-bit word so two
        # tensors never OR-merge into the same word.
        if shift>0:
            index=index+1
        # NOTE(review): the placement of this check was ambiguous in the
        # original (indentation lost); interpreted as per-tensor run
        # bookkeeping -- confirm against the accelerator's scheduling.
        if index>=ROWS:
            n_runs=n_runs+1
            global_iteration_shift_wm.append(index)
    # number of runs depending on the size of the weight tensors
    global_iteration=n_runs
    ################################
    ###### transferring data #######
    ################################
    weight_buffer.flush()
    ################################################
    ###### program the dma for the weight ##########
    ################################################
    print("[DEBUG-PYTHON]--- transfering weight buffer ----")
    driver_wm.sendchannel.transfer(weight_buffer)
    driver_wm.sendchannel.wait()
    return True
@ffi.def_extern()
def destroy_p(delegate):
    """Free the DMA buffers when the delegate is torn down.

    delegate -- opaque pointer from the C side (currently unused beyond
    a None check; the call site in C is commented out).
    """
    global input_fifo_buffer
    global output_fifo_buffer
    global csr_buffer
    global weight_buffer
    print("[DEBUG - PYTHON ] --- destroying the delegate and buffers ---")
    input_fifo_buffer.freebuffer()
    output_fifo_buffer.freebuffer()
    csr_buffer.freebuffer()
    weight_buffer.freebuffer()
    if delegate is not None:
        # NOTE(review): 'del' only unbinds the local name; the C-side
        # delegate memory is released in tflite_plugin_destroy_delegate.
        del delegate
@ffi.def_extern()
def Invoke_p(in_data,in_data_size,out_data,out_data_size):
    """Run inference: pack the input into the input FIFO, step the
    accelerator over the scheduled weight windows, and copy the output
    FIFO back into out_data.

    in_data/out_data are int* from the C side; returns True on success.
    """
    global driver_csr
    global driver_wm
    global driver_fifo_in
    global driver_fifo_out
    global csr_buffer
    global weight_buffer
    global input_fifo_buffer
    global output_fifo_buffer
    global accelerator
    global global_iteration
    global global_iteration_shift_wm
    global curr_data_precision
    #######################################################################
    ########### populate buffers pack depending on the precision #########
    #######################################################################
    tmp=[]
    for i in range(in_data_size):
        tmp.append(int(in_data[i]))
    per_word=int(64/curr_data_precision)    # elements packed per 64-bit word
    mask=(1<<curr_data_precision)-1         # keep only the element's bits
    # number of FIFO refills needed to stream the whole input
    n_chunks=int(in_data_size/(INFIFO_SIZE*per_word))   # renamed from 'iter'
    input_fifo_buffer[:]=0                  # the '|=' below needs a zeroed buffer
    index=0
    shift=0
    for j in range(in_data_size):
        if shift>=per_word:
            index=index+1
            shift=0
        if index>=INFIFO_SIZE:
            break
        # Bug fix: cast to uint64 (the original's np.uint8 truncated every
        # element packed at shift>0 to zero) and advance shift on every
        # element (the original 'else' branch never advanced it).
        input_fifo_buffer[index]|=np.uint64((tmp[j]&mask)<<(shift*curr_data_precision))
        shift=shift+1
    input_fifo_buffer.flush()
    ######################################################
    ###### program the dma for the in/out fifos ##########
    ######################################################
    print("[DEBUG-PYTHON]--- transfering input buffer ----")
    driver_fifo_in.sendchannel.transfer(input_fifo_buffer)
    driver_fifo_in.sendchannel.wait()
    driver_fifo_out.recvchannel.transfer(output_fifo_buffer)
    for i in range (global_iteration):
        # modify the initial weight-memory offset for this run: offset in
        # the upper 32 bits, lower 32 bits of the csr word preserved.
        # Bug fix: the original line had unbalanced parentheses (it never
        # parsed), masked with 0xfffffffff (9 digits, 36 bits) and used
        # np.uint32(x<<32) which is always zero -- compute with Python
        # ints and store as uint64.
        csr_buffer[ARITHMETIC_PRECISION]=np.uint64(((int(global_iteration_shift_wm[i])&0xFFFFFFFF)<<32)|(int(csr_buffer[ARITHMETIC_PRECISION])&0xFFFFFFFF))
        csr_buffer.flush()
        for j in range(n_chunks):
            # execute the inference and retrieve the data
            accelerator.write(CMD, (0x0000000 |(CMD_EXECUTE_STEP<<16))) # execute one step
            while driver_fifo_out.recvchannel.running:
                pass
            # NOTE(review): the original indexed these copies with the
            # OUTER loop variable 'i' instead of 'k' (copying one element
            # repeatedly); fixed to an element-wise copy -- confirm the
            # intent of feeding the output back into the input fifo.
            for k in range(INFIFO_SIZE):
                input_fifo_buffer[k]=output_fifo_buffer[k]
            ## copy the rest of the input (OR-merged over the copied
            ## output, as in the original -- NOTE(review): confirm this
            ## merge is the intended behaviour)
            index=0
            shift=0
            for k in range (in_data_size-j*INFIFO_SIZE):
                if shift>=per_word:
                    index=index+1
                    shift=0
                if index>=INFIFO_SIZE:
                    break
                input_fifo_buffer[index]|=np.uint64((tmp[k+j*INFIFO_SIZE]&mask)<<(shift*curr_data_precision))
                shift=shift+1
            input_fifo_buffer.flush()
            for k in range(INFIFO_SIZE):
                input_fifo_buffer[k]=output_fifo_buffer[k]
    # NOTE(review): placement ambiguous in the original (indentation lost);
    # the status is cleared once after all runs complete.
    accelerator.write(STATUS,0x00000003) ##clear status
    print("[DEBUG -PYTHON] ---- accelerator done ----")
    ################################################################################################
    ####### unpack the output buffer depending on the precision and give it back to C code ########
    ################################################################################################
    # NOTE(review): values are returned one 64-bit word per int slot,
    # without unpacking individual elements -- confirm against the C caller.
    for i in range(out_data_size):
        out_data[i]=int(output_fifo_buffer[i])
    return True
@ffi.def_extern()
def ResetHardware_p():
    """Reset the overlay and soft-reset the accelerator adapter.

    Called from the C ResetHardware entry point; always returns True.
    """
    global accelerator
    global overlay
    print("[DEBUG - PYTHON ] --- Reset hardware p function ---")
    overlay.reset()
    # pulse bit 0 of CTRL: soft reset of the accelerator adapter
    accelerator.write(CTRL,0x0000001)
    accelerator.write(CTRL,0x0000000)
    return True
""")
# Build the embedded shared library (produces DTPU_delegate.so).
ffibuilder.compile(target="DTPU_delegate.*", verbose=True)
exit()
# NOTE(review): everything below this exit() is unreachable dead code --
# it looks like a leftover smoke test for dlopen-ing the built library.
from cffi import FFI
ffi=FFI()
ffi.cdef("""
void * tflite_plugin_create_delegate();
void tflite_plugin_destroy_delegate(void * );
""")
#@ffi.def_extern()
#def destroy_p(delegate):
# print("destroying the delegate")
# del delegate
#
aa = ffi.dlopen("./DTPU_delegate.so")
exit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment