% MULTIPROG-2018 workshop paper: speeding up VGG-16 inference on the ODROID-XU4
% board with OpenMP (CPU) and OpenCL (GPU).
% The title is stored in plain Title Case with no per-word braces so that the
% bibliography style decides the final casing; none of its words is a proper
% noun or acronym. The booktitle is an event name and keeps its protection.
@inproceedings{5058a0d1240847a7b479c6cb4d26f295,
  author    = {Loukadakis, Manolis and Cano, Jose and O'Boyle, Michael},
  title     = {Accelerating Deep Neural Networks on Low Power Heterogeneous Architectures},
  booktitle = {11th {International} {Workshop} on {Programmability} and {Architectures} for {Heterogeneous} {Multicores} ({MULTIPROG}-2018)},
  year      = {2018},
  keywords  = {Deep Neural Networks, Heterogeneous architectures, Low power embedded systems, performance},
  abstract  = {Deep learning applications are able to recognise images and speech with great accuracy, and their use is now everywhere in our daily lives. However, developing deep learning architectures such as deep neural networks in embedded systems is a challenging task because of the demanding computational resources and power consumption. Hence, sophisticated algorithms and methods that exploit the hardware of the embedded systems need to be investigated. This paper is our first step towards examining methods and optimisations for deep neural networks that can leverage the hardware architecture of low power embedded devices. In particular, in this work we accelerate the inference time of the VGG-16 neural network on the ODROID-XU4 board. More specifically, a serial version of VGG-16 is parallelised for both the CPU and GPU present on the board using OpenMP and OpenCL. We also investigate several optimisation techniques that exploit the specific hardware architecture of the ODROID board and can accelerate the inference further. One of these optimisations uses the CLBlast library specifically tuned for the ARM Mali-T628 GPU present on the board. Overall, we improve the inference time of the initial serial version of the code by 2.8X using OpenMP, and by 9.4X using the most optimised version of OpenCL.},
}