Hi,
Are you really sure that Qualcomm’s OpenCL ML extensions are fast? I saw your blog posts showing the speedup for MobileNetV1, so I decided to try it standalone before using it as a BYOC backend in TVM.
I am getting very poor results (760 ms) for this half float GEMM:
A : 1024x1024 B: 1024x1024 C: 1024x1024 C = AB
I was hoping to get at least less than 20 ms, which is what ARM Compute Library achieves.
Please see if I am making some mistake. My Qualcomm chip is the Snapdragon XR2 Gen 1 with an Adreno 650 GPU.
// #include<stdio.h>
// #include<stdlib.h>
// #include<string.h>
// #include<unistd.h>
#include<assert.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <list>
#include <string>
#include <algorithm>
#include <math.h>
#define CL_TARGET_OPENCL_VERSION 200
// #include"CL/cl.h"
#include"CL/cl_qcom_ml_ops.h"
#include"util/util.h"
#include"util/half_float.h"
#define BUFFER_SIZE 1024
// use recordable queue
// use qcom image extensions
// Prints the profiling breakdown of `event` in milliseconds.
// OpenCL profiling counters are device timestamps in nanoseconds, so each
// stage is reported as (later - earlier) / 1e6.
//
// CL_PROFILING_COMMAND_COMPLETE is only meaningful for commands that launch
// device-side enqueues; for ordinary commands some drivers return an
// unrelated value, and the unsigned subtraction (complete - end) then
// underflows -- that is what produced the absurd "end -> complete:
// 16723141762754 ms" lines in the captured log. The COMPLETE stage is now
// printed only when the query succeeds and the value is consistent.
void showEvent(cl_event &event, const std::string& event_name) {
cl_ulong queued = 0;
cl_ulong submitted = 0;
cl_ulong started = 0;
cl_ulong ended = 0;
cl_ulong completed = 0;
// The four timestamps below are mandatory for every profiled event;
// accumulate their statuses instead of ignoring them.
cl_int err = CL_SUCCESS;
err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, NULL);
err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submitted, NULL);
err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &started, NULL);
err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ended, NULL);
if (err != CL_SUCCESS) {
fprintf(stdout, "%s: profiling info unavailable\n", event_name.c_str());
return;
}
// COMPLETE is optional -- keep its status separate from the others.
const cl_int completeErr = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &completed, NULL);
fprintf(stdout, "%s\n", event_name.c_str());
fprintf(stdout, "\tqueue -> submit: %f ms\n", ((double)(submitted - queued)) / 1000000.0);
fprintf(stdout, "\tsubmit -> start: %f ms\n", ((double)(started - submitted)) / 1000000.0);
fprintf(stdout, "\tstart -> end: %f ms\n", ((double)(ended - started)) / 1000000.0);
// Guard against unsigned underflow when the driver reports a bogus value.
if (completeErr == CL_SUCCESS && completed >= ended) {
fprintf(stdout, "\tend -> complete: %f ms\n", ((double)(completed - ended)) / 1000000.0);
} else {
fprintf(stdout, "\tend -> complete: n/a\n");
}
}
// Standalone benchmark: C = A*B for 1024x1024 fp16 matrices via Qualcomm's
// OpenCL ML (CLML) extension. Uploads all-ones A and B, runs the GEMM op
// twice (the first run includes any one-time driver setup), verifies every
// element of C equals 1024, and prints per-stage event timings throughout.
// All error handling is assert/CLML_SDK_TEST_AND_EXIT; returns 0 on success.
int main() {
cl_int ret = 0;
cl_uint num_platforms;
cl_uint num_devices;
cl_event event;
cl_platform_id platform = NULL;
cl_context context = NULL;
// NOTE(review): unusedTensor and finalOutput are never used below, and
// tuning_cache stays NULL for the whole run -- see the note at the
// clCreateMLOpGemmQCOM call.
cl_ml_tensor_qcom unusedTensor = NULL;
cl_ml_tensor_qcom finalOutput = NULL;
cl_device_id device_id = NULL;
cl_command_queue queue = NULL;
cl_ml_tuningcache_qcom tuning_cache = NULL;
// get platform
// NOTE(review): assert() compiles away under NDEBUG, so these error checks
// silently vanish in a release build.
ret = clGetPlatformIDs(1, &platform, &num_platforms);
assert(ret == CL_SUCCESS);
// get device
ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices);
assert(ret == CL_SUCCESS);
// create context with Qualcomm priority/perf hints (so the GPU clocks up)
{
const cl_context_properties context_properties[] = {CL_CONTEXT_PRIORITY_HINT_QCOM, CL_PRIORITY_HINT_HIGH_QCOM, 0};
context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &ret);
assert(ret == CL_SUCCESS);
clSetPerfHintQCOM(context, CL_PERF_HINT_HIGH_QCOM);
}
// create queue -- profiling must be enabled for showEvent() to work
{
const cl_queue_properties command_queue_properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0};
queue = clCreateCommandQueueWithProperties(context, device_id, command_queue_properties, &ret);
assert(ret == CL_SUCCESS);
}
// Ask the driver which CLML interface versions it exposes (two-call idiom:
// first get the count, then the version lists).
static const cl_uint MAX_VERSIONS = 256;
cl_int majorVersions[MAX_VERSIONS];
cl_int minorVersions[MAX_VERSIONS];
cl_uint numVersions = 0;
ret = clQueryMLInterfaceVersionsQCOM(NULL, NULL, 0, &numVersions);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
CLML_SDK_TEST_AND_EXIT(numVersions > 0u);
CLML_SDK_TEST_AND_EXIT(numVersions <= MAX_VERSIONS);
ret = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, NULL);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
// NOTE(review): this deliberately selects the V1 interface; the V3 variant
// is commented out below. Presumably newer interface versions ship newer
// kernels -- worth re-testing with the highest version the driver reports.
CLMLInterfaceV1QCOM* h_ClmlIntf = NULL;
for (cl_uint i = 0; i < numVersions; ++i)
{
if (majorVersions[i] == 1)
{
h_ClmlIntf = clGetMLInterfaceV1QCOM(0);
CLML_SDK_TEST_AND_EXIT(h_ClmlIntf != NULL);
break;
}
}
// CLMLInterfaceV3QCOM* h_ClmlIntf = NULL;
// for (cl_uint i = 0; i < numVersions; ++i)
// {
//     if (majorVersions[i] == 3)
//     {
//         h_ClmlIntf = clGetMLInterfaceV3QCOM(0);
//         CLML_SDK_TEST_AND_EXIT(h_ClmlIntf != NULL);
//         break;
//     }
// }
cl_uint M = 1024;
cl_uint K = 1024;
cl_uint N = 1024;
// GEMM scalars: C = alpha*A*B + beta*C with alpha = 1, beta = 0.
cl_ml_value_qcom alpha;
alpha.type = CL_FLOAT;
alpha.u.fp32 = 1.0;
cl_ml_value_qcom beta;
beta.type = CL_FLOAT;
beta.u.fp32 = 0.0;
cl_ml_tensor_qcom A = NULL;
cl_ml_tensor_qcom B = NULL;
cl_ml_tensor_qcom C = NULL;
// create A
// N C H W
// Matrices are described as 4D tensors (1, 1, rows, cols) in fp16 NHWC.
// NOTE(review): whether NHWC buffer layout is the fastest layout for a 2D
// GEMM on this driver is an assumption -- check the CLML SDK for an
// optimal-layout query.
cl_ml_tensor_desc_qcom Adesc = {CL_HALF_FLOAT, CL_TENSOR_LAYOUT_NHWC_QCOM, 1, 1, M, K, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {}};
ret = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &Adesc, &A);
CLML_SDK_TEST_AND_EXIT(A && ret == CL_SUCCESS);
// create B
cl_ml_tensor_desc_qcom Bdesc = {CL_HALF_FLOAT, CL_TENSOR_LAYOUT_NHWC_QCOM, 1, 1, K, N, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {}};
ret = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &Bdesc, &B);
CLML_SDK_TEST_AND_EXIT(B && ret == CL_SUCCESS);
// create C
cl_ml_tensor_desc_qcom Cdesc = {CL_HALF_FLOAT, CL_TENSOR_LAYOUT_NHWC_QCOM, 1, 1, M, N, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {}};
ret = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &Cdesc, &C);
CLML_SDK_TEST_AND_EXIT(C && ret == CL_SUCCESS);
cl_ml_op_qcom op_gemm = NULL;
const cl_ml_op_gemm_desc_qcom gemmDesc = {M, N, K, CL_GEMM_TRANSFORM_NONE_QCOM, CL_GEMM_TRANSFORM_NONE_QCOM, alpha, beta, CL_ARITHMETIC_MODE_FP16_QCOM};
// ret = h_ClmlIntf->clCreateMLOpGemmQCOM(context, NULL, &gemmDesc, A, B, C, &op_gemm, tuningRun ? NULL : tuning_cache);
// NOTE(review): the tuning cache argument is NULL, so the driver cannot
// select a shape-tuned kernel. The ~760 ms start->end in the log is
// consistent with a default/untuned kernel path; run clTuneMLOpQCOM once
// for this op and pass the resulting cache here -- TODO confirm against
// the CLML SDK samples.
ret = h_ClmlIntf->clCreateMLOpGemmQCOM(context, NULL, &gemmDesc, A, B, C, &op_gemm, NULL);
CLML_SDK_TEST_AND_EXIT(op_gemm && ret == CL_SUCCESS);
// Back each tensor with a cl_mem buffer, sized by the driver's query.
cl_mem AMem, BMem, CMem;
cl_uint size = 0;
ret = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(context, A, &size);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
AMem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
printf("A size: %u\n", size);
ret = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(context, B, &size);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
BMem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
printf("B size: %u\n", size);
ret = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(context, C, &size);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
CMem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
printf("C size: %u\n", size);
// Fill A with 1.0 halves on the host, then upload through the CLML writer.
// NOTE(review): writing through __fp16* into a cl_half buffer assumes
// __fp16 is a 16-bit IEEE half -- true for this ARM toolchain, but not
// portable C++.
cl_half *AData = (cl_half*)malloc(M*K*sizeof(cl_half));
for (int i = 0;i < M*K;++i) ((__fp16*)AData)[i] = (__fp16)1.0;
ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, AData,
CL_TENSOR_LAYOUT_NHWC_QCOM, A, AMem, 0, NULL, &event);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
clFinish(queue);
showEvent(event, "enqueue write A ");
// Same for B: all ones.
cl_half *BData = (cl_half*)malloc(K*N*sizeof(cl_half));
for (int i = 0;i < K*N;++i) ((__fp16*)BData)[i] = (__fp16)1.0;
ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, BData,
CL_TENSOR_LAYOUT_NHWC_QCOM, B, BMem, 0, NULL, &event);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
clFinish(queue);
showEvent(event, "enqueue write B ");
// C is zero-initialized (beta = 0 makes this redundant for correctness,
// but it keeps the later read deterministic even if the op were skipped).
cl_half *CData = (cl_half*)malloc(M*N*sizeof(cl_half));
for (int i = 0;i < M*N;++i) ((__fp16*)CData)[i] = (__fp16)0.0;
ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, CData,
CL_TENSOR_LAYOUT_NHWC_QCOM, C, CMem, 0, NULL, &event);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
clFinish(queue);
showEvent(event, "enqueue write C ");
// cl_half * host_ptr = (cl_half*)malloc(M * N * sizeof(cl_half));
// clEnqueueReadBuffer(queue, CMem, CL_TRUE, 0, M*N*sizeof(cl_half), host_ptr,0, NULL, NULL);
// for (int row = 0;row < M;++row) {
//     for (int col = 0;col < N;++col) {
//         float val = to_float(*((cl_half*)&host_ptr[row*N + col]));
//         printf("%.1f ", val);
//     }
//     printf("\n");
// }
// Bind the three tensors to their buffers for the op invocation.
cl_ml_tensor_memory_desc_qcom descArray[] =
{
// {tensor, memory}
{A, AMem},
{B, BMem},
{C, CMem}
};
cl_ml_tensor_mem_desc_set_qcom descriptorSet;
ret = h_ClmlIntf->clCreateMLTensorMemoryDescriptorSetQCOM(&descriptorSet);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
ret = h_ClmlIntf->clUpdateMLTensorMemoryDescriptorSetQCOM(descriptorSet, sizeof(descArray)/sizeof(descArray[0]), descArray);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
// First GEMM run (may include one-time kernel setup).
cl_event opEvent;
ret = h_ClmlIntf->clEnqueueMLOpQCOM(queue, op_gemm, descriptorSet, 0, NULL, &opEvent);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
// if (!tuningRun) {
// }
// else {
//     cl_ulong tuned_exec_time;
//     h_ClmlIntf->clTuneMLOpQCOM(queue, op_gemm, descriptorSet, tuning_cache, &tuned_exec_time);
//     printf("tuned exec time: %lu\n", tuned_exec_time);
//     // h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, )
// }
ret = clWaitForEvents(1, &opEvent);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
printf("gemm time 1\n");
// NOTE(review): label says "gemm 2" for the first run too -- copy/paste
// slip in the event name.
showEvent(opEvent, "gemm 2");
clFinish(queue);
// Second GEMM run: per the log it is just as slow (~777 ms), so the cost
// is not first-run compilation overhead.
ret = h_ClmlIntf->clEnqueueMLOpQCOM(queue, op_gemm, descriptorSet, 0, NULL, &opEvent);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
ret = clWaitForEvents(1, &opEvent);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
printf("gemm time 2\n");
showEvent(opEvent, "gemm 2");
clFinish(queue);
// cl_half * hptr = (cl_half*)malloc(M * N * sizeof(cl_half));
// h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(queue, C, CMem, (void*)hptr, Cdesc.layout, 0, NULL, &event);
// CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
// showEvent(event);
// for (int row = 0;row < M;++row) {
//     for (int col = 0;col < N;++col) {
//         float val = to_float(*((cl_half*)&hptr[row*N + col]));
//         printf("%.1f ", val);
//     }
//     printf("\n");
// }
// Read C back and verify: with A and B all ones, every element of C is the
// dot product of a row of ones with a column of ones, i.e. K = 1024 (== M).
// NOTE(review): this reads CMem directly as row-major cl_half -- assumes
// the driver's NHWC tensor layout for a (1,1,M,N) tensor is plain
// row-major in the buffer; confirm with clEnqueueReadMLTensorDataQCOM.
cl_half * host_ptr = (cl_half*)malloc(M * N * sizeof(cl_half));
clEnqueueReadBuffer(queue, CMem, CL_TRUE, 0, M*N*sizeof(cl_half), host_ptr,0, NULL, NULL);
for (int row = 0;row < M;++row) {
for (int col = 0;col < N;++col) {
float val = to_float(*((cl_half*)&host_ptr[row*N + col]));
assert(val == M);
// printf("%.1f ", val);
}
// printf("\n");
}
// Cleanup.
// NOTE(review): AData/BData/CData/host_ptr are never free()d and the
// tensors A/B/C (and the events) are never released -- harmless for a
// one-shot benchmark process, but leaks if this becomes a library path.
ret = h_ClmlIntf->clReleaseMLTensorMemoryDescriptorSetQCOM(descriptorSet);
CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
clReleaseMemObject(AMem);
clReleaseMemObject(BMem);
clReleaseMemObject(CMem);
h_ClmlIntf->clReleaseMLOpQCOM(op_gemm);
// std::vector<cl_ml_tensor_memory_desc_qcom> tensorMemDescs;
// cl_ml_tensor_mem_desc_set_qcom descriptorSet;
// ret = h_ClmlIntf->clCreateMLTensorMemoryDescriptorSetQCOM(&descriptorSet);
// CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
// ret = h_ClmlIntf->clUpdateMLTensorMemoryDescriptorSetQCOM(descriptorSet,
//                                                           static_cast<uint32_t>(tensorMemDescs.size()),
//                                                           tensorMemDescs.data());
// CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
// cl_ml_tensor_memory_desc_qcom AMemDesc = {};
// AMemDesc.tensor = A;
// ret = allocateTensorMemory(h_ClmlIntf, context, &AMemDesc);
// CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
// tensorMemDescs.push_back(AMemDesc);
// std::vector<cl_half> AData;
// AData.resize(M * K);
// for (size_t i = 0; i < AData.size(); ++i)
// {
//     AData[i] = to_half(static_cast<float>(i % 5));
// }
// cl_event evt = NULL;
// ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(
//     queue,
//     AData.data(),
//     CL_TENSOR_LAYOUT_NHWC_QCOM,
//     A,
//     Adesc.memory,
//     0, // n waitlist
//     NULL, // waitlist
//     &evt); // event
// CLML_SDK_TEST_AND_EXIT((evt != NULL) && result == CL_SUCCESS);
// for (size_t i = 0; i < tensorMemInfos.size(); ++i)
// {
//     // Allocate memory for this tensor
//     cl_ml_tensor_memory_desc_qcom tensormemdesc = {};
//     tensormemdesc.tensor = tensorMemInfos[i].tensor;
//     ret = allocateTensorMemory(h_ClmlIntf, context, &tensormemdesc);
//     CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
//     tensorMemDescs.push_back(tensormemdesc);
//     if (tuningRun)
//     {
//         continue;
//     }
//     // Store the Desccriptor of the final tensor for later reading
//     if (finalOutput == tensorMemInfos[i].tensor)
//     {
//         finalOutputDescSet = tensormemdesc;
//     }
//     if (tensorMemInfos[i].filename.empty())
//     {
//         continue;
//     }
//     // If this tensor has an associated file, load it using writetensordata into the created memory
//     size_t len = tensorMemInfos[i].filename.size();
//     if (std::strcmp(&tensorMemInfos[i].filename[len - 5], "qfp16") == 0)
//     {
//         preTrainedWeights.push_back(readFP16FromFile(weightsFilePath + tensorMemInfos[i].filename));
//         cl_event evt = NULL;
//         ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(
//             queue,
//             preTrainedWeights.back().data(),
//             CL_TENSOR_LAYOUT_NHWC_QCOM,
//             tensorMemInfos[i].tensor,
//             tensormemdesc.memory,
//             0, // n waitlist
//             NULL, // waitlist
//             &evt); // event
//         CLML_SDK_TEST_AND_EXIT(evt && ret == CL_SUCCESS);
//         writeEvents.push_back(evt);
//     }
//     else
//     {
//         CLML_SDK_TEST_AND_EXIT(!"Unexpected file type");
//     }
// }
// if (tuningRun)
// {
//     // Run Tuning Operation
//     for (size_t i = 0; i < operations.size(); ++i)
//     {
//         ret = h_ClmlIntf->clTuneMLOpQCOM(queue, operations[i], descriptorSet, tuning_cache, NULL);
//         CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
//     }
// }
// if (tuningRun)
// {
//     size_t cacheLenBytes = 0;
//     size_t lenRet = 0;
//     CLML_SDK_TEST_AND_EXIT(cacheFileName);
//     ret = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, NULL, &cacheLenBytes);
//     CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
//     std::vector<unsigned char> savedCache(cacheLenBytes, 0);
//     ret = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, savedCache.size(), savedCache.data(), &lenRet);
//     CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
//     std::ofstream cache_out(cacheFileName, std::ios_base::binary);
//     if (cache_out){
//         cache_out.write(reinterpret_cast<char*>(savedCache.data()), savedCache.size());
//         cache_out.close();
//     }
//     h_ClmlIntf->clReleaseMLTuningCacheQCOM(tuning_cache);
// }
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseDevice(device_id);
return 0;
}
Ignore the “end -> complete” time — the COMPLETE timestamp the driver returns is bogus, so the unsigned subtraction underflows. The real question: why is the GEMM taking ~750 ms start to end?
A size: 2097152
B size: 2097152
C size: 2097152
enqueue write A
queue -> submit: 0.198912 ms
submit -> start: 0.039168 ms
start -> end: 0.028928 ms
end -> complete: 16723141762754.050781 ms
enqueue write B
queue -> submit: 0.133888 ms
submit -> start: 0.026112 ms
start -> end: 0.022016 ms
end -> complete: 16723141762753.015625 ms
enqueue write C
queue -> submit: 0.540160 ms
submit -> start: 0.038912 ms
start -> end: 0.023040 ms
end -> complete: 16723141762750.626953 ms
gemm time 1
gemm 2
queue -> submit: 2.874880 ms
submit -> start: 3.655936 ms
start -> end: 767.124992 ms
end -> complete: 16723141761976.927734 ms
gemm time 2
gemm 2
queue -> submit: 0.000000 ms
submit -> start: 0.337152 ms
start -> end: 777.488896 ms
end -> complete: 16723141761198.480469 ms