% MULTIPROG-2018 workshop paper: accelerating VGG-16 inference on the ODROID-XU4 board.
@inproceedings{5058a0d1240847a7b479c6cb4d26f295,
  author    = {Loukadakis, Manolis and Cano, Jose and O'Boyle, Michael},
  title     = {Accelerating Deep Neural Networks on Low Power Heterogeneous Architectures},
  booktitle = {11th International Workshop on Programmability and Architectures for Heterogeneous Multicores ({MULTIPROG}-2018)},
  date      = {2018},
  keywords  = {Deep Neural Networks, Heterogeneous architectures, Low power embedded systems, performance},
  abstract  = {Deep learning applications are able to recognise images and speech with great accuracy, and their use is now everywhere in our daily lives. However, developing deep learning architectures such as deep neural networks in embedded systems is a challenging task because of the demanding computational resources and power consumption. Hence, sophisticated algorithms and methods that exploit the hardware of the embedded systems need to be investigated. This paper is our first step towards examining methods and optimisations for deep neural networks that can leverage the hardware architecture of low power embedded devices. In particular, in this work we accelerate the inference time of the {VGG}-16 neural network on the {ODROID}-{XU}4 board. More specifically, a serial version of {VGG}-16 is parallelised for both the {CPU} and {GPU} present on the board using {OpenMP} and {OpenCL}. We also investigate several optimisation techniques that exploit the specific hardware architecture of the {ODROID} board and can accelerate the inference further. One of these optimisations uses the {CLBlast} library specifically tuned for the {ARM} Mali-T628 {GPU} present on the board. Overall, we improve the inference time of the initial serial version of the code by 2.8X using {OpenMP}, and by 9.4X using the most optimised version of {OpenCL}.},
}