GEMM using OpenCL ML extensions by Qualcomm is slow?

@srkreddy1238

Hi,

Are you really sure that Qualcomm’s OpenCL ML extensions are fast? I saw your blog posts showing the speedup for MobileNetV1, so I decided to try the extension on its own before using it as a BYOC backend in TVM.

I am getting very poor results (760 ms) for this half float GEMM:

A : 1024x1024 B: 1024x1024 C: 1024x1024 C = AB

I was hoping to get at least less than 20 ms, which is what ARM Compute Library achieves.

Please see if I am making some mistake. My Qualcomm chip is snapdragon xr2 gen 1 with adreno 650 gpu.

// #include<stdio.h>
// #include<stdlib.h>
// #include<string.h>
// #include<unistd.h>

#include<assert.h>


#include <iostream>
#include <fstream>

#include <vector>
#include <list>
#include <string>
#include <algorithm>
#include <math.h>

#define CL_TARGET_OPENCL_VERSION 200

// #include"CL/cl.h"
#include"CL/cl_qcom_ml_ops.h"


#include"util/util.h"
#include"util/half_float.h"

#define BUFFER_SIZE 1024

// use recordable queue
// use qcom image extensions

// Print the profiling timeline of `event`: the latency of each stage
// (queued -> submitted -> started -> ended -> completed) in milliseconds.
// Requires the event's queue to be created with CL_QUEUE_PROFILING_ENABLE.
void showEvent(cl_event &event, const std::string& event_name) {
        // Initialize all counters: clGetEventProfilingInfo leaves the output
        // untouched on failure, and printing uninitialized cl_ulongs is what
        // produced the absurd "end -> complete: 16723141762754 ms" lines.
        cl_ulong queue = 0;
        cl_ulong submit = 0;
        cl_ulong start = 0;
        cl_ulong end = 0;
        cl_ulong complete = 0;
        bool ok = true;
        ok = ok && (clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queue, NULL) == CL_SUCCESS);
        ok = ok && (clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL) == CL_SUCCESS);
        ok = ok && (clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL) == CL_SUCCESS);
        ok = ok && (clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL) == CL_SUCCESS);
        if (!ok)
                fprintf(stderr, "warning: profiling query failed for %s\n", event_name.c_str());
        // CL_PROFILING_COMMAND_COMPLETE only differs from ..._END for
        // device-side enqueue; on a host command-queue the query can fail,
        // so fall back to `end` rather than reading garbage.
        if (clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &complete, NULL) != CL_SUCCESS)
                complete = end;
        fprintf(stdout, "%s\n", event_name.c_str());
        fprintf(stdout, "\tqueue -> submit: %f ms\n", ((double)(submit - queue)) / 1000000.0);
        fprintf(stdout, "\tsubmit -> start: %f ms\n", ((double)(start - submit)) / 1000000.0);
        fprintf(stdout, "\tstart -> end: %f ms\n", ((double)(end - start)) / 1000000.0);
        fprintf(stdout, "\tend -> complete: %f ms\n", ((double)(complete - end)) / 1000000.0);
}


int main() {
        cl_int ret = 0;
        cl_uint num_platforms;
        cl_uint num_devices;
        cl_event event;

        cl_platform_id          platform        = NULL;
        cl_context              context         = NULL;
        cl_ml_tensor_qcom       unusedTensor    = NULL;
        cl_ml_tensor_qcom       finalOutput     = NULL;
        cl_device_id            device_id       = NULL;
        cl_command_queue        queue           = NULL;
        cl_ml_tuningcache_qcom  tuning_cache    = NULL;

        // get platform
        ret = clGetPlatformIDs(1, &platform, &num_platforms);
        assert(ret == CL_SUCCESS);

        // get device
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices);
        assert(ret == CL_SUCCESS);
        
        // create context
        {
                const cl_context_properties context_properties[] = {CL_CONTEXT_PRIORITY_HINT_QCOM, CL_PRIORITY_HINT_HIGH_QCOM, 0};
                context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &ret);
                assert(ret == CL_SUCCESS);
                clSetPerfHintQCOM(context, CL_PERF_HINT_HIGH_QCOM);
        }

        // create queue
        {
                const cl_queue_properties command_queue_properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0};
                queue = clCreateCommandQueueWithProperties(context, device_id, command_queue_properties, &ret);
                assert(ret == CL_SUCCESS);
        }

        static const cl_uint MAX_VERSIONS = 256;
        cl_int majorVersions[MAX_VERSIONS];
        cl_int minorVersions[MAX_VERSIONS];
        cl_uint numVersions = 0;
        ret = clQueryMLInterfaceVersionsQCOM(NULL, NULL, 0, &numVersions);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        CLML_SDK_TEST_AND_EXIT(numVersions > 0u);
        CLML_SDK_TEST_AND_EXIT(numVersions <= MAX_VERSIONS);

        ret = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, NULL);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        CLMLInterfaceV1QCOM* h_ClmlIntf = NULL;
        for (cl_uint i = 0; i < numVersions; ++i)
        {
                if (majorVersions[i] == 1)
                {
                h_ClmlIntf = clGetMLInterfaceV1QCOM(0);
                CLML_SDK_TEST_AND_EXIT(h_ClmlIntf != NULL);
                break;
                }
        }
        // CLMLInterfaceV3QCOM* h_ClmlIntf = NULL;
        // for (cl_uint i = 0; i < numVersions; ++i)
        // {
        //         if (majorVersions[i] == 3)
        //         {
        //         h_ClmlIntf = clGetMLInterfaceV3QCOM(0);
        //         CLML_SDK_TEST_AND_EXIT(h_ClmlIntf != NULL);
        //         break;
        //         }
        // }

        cl_uint M = 1024;
        cl_uint K = 1024;
        cl_uint N = 1024;

        cl_ml_value_qcom alpha;
        alpha.type = CL_FLOAT;
        alpha.u.fp32 = 1.0;
        cl_ml_value_qcom beta;
        beta.type = CL_FLOAT;
        beta.u.fp32 = 0.0;

        cl_ml_tensor_qcom A = NULL;
        cl_ml_tensor_qcom B = NULL;
        cl_ml_tensor_qcom C = NULL;
        // create A
        //                                                                         N  C  H  W
        cl_ml_tensor_desc_qcom Adesc = {CL_HALF_FLOAT, CL_TENSOR_LAYOUT_NHWC_QCOM, 1, 1, M, K, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {}};
        ret = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &Adesc, &A);
        CLML_SDK_TEST_AND_EXIT(A && ret == CL_SUCCESS);

        // create B
        cl_ml_tensor_desc_qcom Bdesc = {CL_HALF_FLOAT, CL_TENSOR_LAYOUT_NHWC_QCOM, 1, 1, K, N, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {}};
        ret = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &Bdesc, &B);
        CLML_SDK_TEST_AND_EXIT(B && ret == CL_SUCCESS);

        // create C
        cl_ml_tensor_desc_qcom Cdesc = {CL_HALF_FLOAT, CL_TENSOR_LAYOUT_NHWC_QCOM, 1, 1, M, N, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, {}};
        ret = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &Cdesc, &C);
        CLML_SDK_TEST_AND_EXIT(C && ret == CL_SUCCESS);

        cl_ml_op_qcom op_gemm = NULL;
        const cl_ml_op_gemm_desc_qcom gemmDesc = {M, N, K, CL_GEMM_TRANSFORM_NONE_QCOM, CL_GEMM_TRANSFORM_NONE_QCOM, alpha, beta, CL_ARITHMETIC_MODE_FP16_QCOM};
        // ret = h_ClmlIntf->clCreateMLOpGemmQCOM(context, NULL, &gemmDesc, A, B, C, &op_gemm, tuningRun ? NULL : tuning_cache);
        ret = h_ClmlIntf->clCreateMLOpGemmQCOM(context, NULL, &gemmDesc, A, B, C, &op_gemm, NULL);
        CLML_SDK_TEST_AND_EXIT(op_gemm && ret == CL_SUCCESS);

        cl_mem AMem, BMem, CMem;
        cl_uint size = 0;

        ret = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(context, A, &size);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        AMem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        printf("A size: %u\n", size);

        ret = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(context, B, &size);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        BMem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        printf("B size: %u\n", size);

        ret = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(context, C, &size);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        CMem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        printf("C size: %u\n", size);


        cl_half *AData = (cl_half*)malloc(M*K*sizeof(cl_half));
        for (int i = 0;i < M*K;++i) ((__fp16*)AData)[i] = (__fp16)1.0;
        ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, AData,
                CL_TENSOR_LAYOUT_NHWC_QCOM, A, AMem, 0, NULL, &event);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        clFinish(queue);
        showEvent(event, "enqueue write A ");

        cl_half *BData = (cl_half*)malloc(K*N*sizeof(cl_half));
        for (int i = 0;i < K*N;++i) ((__fp16*)BData)[i] = (__fp16)1.0;
        ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, BData,
                CL_TENSOR_LAYOUT_NHWC_QCOM, B, BMem, 0, NULL, &event);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        clFinish(queue);
        showEvent(event, "enqueue write B ");

        cl_half *CData = (cl_half*)malloc(M*N*sizeof(cl_half));
        for (int i = 0;i < M*N;++i) ((__fp16*)CData)[i] = (__fp16)0.0;
        ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, CData,
                CL_TENSOR_LAYOUT_NHWC_QCOM, C, CMem, 0, NULL, &event);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        clFinish(queue);
        showEvent(event, "enqueue write C ");

        // cl_half * host_ptr = (cl_half*)malloc(M * N * sizeof(cl_half));
        // clEnqueueReadBuffer(queue, CMem, CL_TRUE, 0, M*N*sizeof(cl_half), host_ptr,0, NULL, NULL);
        // for (int row = 0;row < M;++row) {
        //         for (int col = 0;col < N;++col) {
        //                 float val = to_float(*((cl_half*)&host_ptr[row*N + col]));
        //                 printf("%.1f ", val);
        //         }
        //         printf("\n");
        // }



        cl_ml_tensor_memory_desc_qcom descArray[] =
        {
                // {tensor, memory}
                {A, AMem},
                {B, BMem},
                {C, CMem}
        };

        cl_ml_tensor_mem_desc_set_qcom descriptorSet;
        ret = h_ClmlIntf->clCreateMLTensorMemoryDescriptorSetQCOM(&descriptorSet);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        ret = h_ClmlIntf->clUpdateMLTensorMemoryDescriptorSetQCOM(descriptorSet, sizeof(descArray)/sizeof(descArray[0]), descArray);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        cl_event opEvent;
        ret = h_ClmlIntf->clEnqueueMLOpQCOM(queue, op_gemm, descriptorSet, 0, NULL, &opEvent);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);


        // if (!tuningRun) {

        // }
        // else {
        //         cl_ulong tuned_exec_time;
        //         h_ClmlIntf->clTuneMLOpQCOM(queue, op_gemm, descriptorSet, tuning_cache, &tuned_exec_time);
        //         printf("tuned exec time: %lu\n", tuned_exec_time);

        //         // h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, )
        // }


        ret = clWaitForEvents(1, &opEvent);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        printf("gemm time 1\n");
        showEvent(opEvent, "gemm 2");
        clFinish(queue);

        ret = h_ClmlIntf->clEnqueueMLOpQCOM(queue, op_gemm, descriptorSet, 0, NULL, &opEvent);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        ret = clWaitForEvents(1, &opEvent);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        printf("gemm time 2\n");
        showEvent(opEvent, "gemm 2");
        clFinish(queue);

        // cl_half * hptr = (cl_half*)malloc(M * N * sizeof(cl_half));
        // h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(queue, C, CMem, (void*)hptr, Cdesc.layout, 0, NULL, &event);
        // CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        // showEvent(event);

        // for (int row = 0;row < M;++row) {
        //         for (int col = 0;col < N;++col) {
        //                 float val = to_float(*((cl_half*)&hptr[row*N + col]));
        //                 printf("%.1f ", val);
        //         }
        //         printf("\n");
        // }

        cl_half * host_ptr = (cl_half*)malloc(M * N * sizeof(cl_half));
        clEnqueueReadBuffer(queue, CMem, CL_TRUE, 0, M*N*sizeof(cl_half), host_ptr,0, NULL, NULL);
        for (int row = 0;row < M;++row) {
                for (int col = 0;col < N;++col) {
                        float val = to_float(*((cl_half*)&host_ptr[row*N + col]));
                        assert(val == M);
                        // printf("%.1f ", val);
                }
                // printf("\n");
        }


        ret = h_ClmlIntf->clReleaseMLTensorMemoryDescriptorSetQCOM(descriptorSet);
        CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        clReleaseMemObject(AMem);
        clReleaseMemObject(BMem);
        clReleaseMemObject(CMem);
        h_ClmlIntf->clReleaseMLOpQCOM(op_gemm);



        // std::vector<cl_ml_tensor_memory_desc_qcom> tensorMemDescs;

        // cl_ml_tensor_mem_desc_set_qcom descriptorSet;
        // ret = h_ClmlIntf->clCreateMLTensorMemoryDescriptorSetQCOM(&descriptorSet);
        // CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        // ret = h_ClmlIntf->clUpdateMLTensorMemoryDescriptorSetQCOM(descriptorSet,
        //                                         static_cast<uint32_t>(tensorMemDescs.size()),
        //                                         tensorMemDescs.data());
        // CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        

        // cl_ml_tensor_memory_desc_qcom AMemDesc = {};
        // AMemDesc.tensor = A;
        // ret = allocateTensorMemory(h_ClmlIntf, context, &AMemDesc);
        // CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        // tensorMemDescs.push_back(AMemDesc);

        // std::vector<cl_half> AData;
        // AData.resize(M * K);
        // for (size_t i = 0; i < AData.size(); ++i)
        // {
        //         AData[i] = to_half(static_cast<float>(i % 5));
        // }
        // cl_event evt = NULL;
        // ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(
        //     queue,
        //     AData.data(),
        //     CL_TENSOR_LAYOUT_NHWC_QCOM,
        //     A,
        //     Adesc.memory,
        //     0, // n waitlist
        //     NULL, // waitlist
        //     &evt); // event
        // CLML_SDK_TEST_AND_EXIT((evt != NULL) && result == CL_SUCCESS);



        

        // for (size_t i = 0; i < tensorMemInfos.size(); ++i)
        // {
        //         // Allocate memory for this tensor
        //         cl_ml_tensor_memory_desc_qcom tensormemdesc = {};
        //         tensormemdesc.tensor = tensorMemInfos[i].tensor;
        //         ret = allocateTensorMemory(h_ClmlIntf, context, &tensormemdesc);
        //         CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        //         tensorMemDescs.push_back(tensormemdesc);

        //         if (tuningRun)
        //         {
        //         continue;
        //         }

        //         // Store the Desccriptor of the final tensor for later reading
        //         if (finalOutput == tensorMemInfos[i].tensor)
        //         {
        //         finalOutputDescSet = tensormemdesc;
        //         }
        //         if (tensorMemInfos[i].filename.empty())
        //         {
        //         continue;
        //         }

        //         // If this tensor has an associated file, load it using writetensordata into the created memory
        //         size_t len = tensorMemInfos[i].filename.size();
        //         if (std::strcmp(&tensorMemInfos[i].filename[len - 5], "qfp16") == 0)
        //         {
        //         preTrainedWeights.push_back(readFP16FromFile(weightsFilePath + tensorMemInfos[i].filename));

        //         cl_event evt = NULL;
        //         ret = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(
        //                 queue,
        //                 preTrainedWeights.back().data(),
        //                 CL_TENSOR_LAYOUT_NHWC_QCOM,
        //                 tensorMemInfos[i].tensor,
        //                 tensormemdesc.memory,
        //                 0, // n waitlist
        //                 NULL, // waitlist
        //                 &evt); // event
        //         CLML_SDK_TEST_AND_EXIT(evt && ret == CL_SUCCESS);
        //         writeEvents.push_back(evt);
        //         }
        //         else
        //         {
        //         CLML_SDK_TEST_AND_EXIT(!"Unexpected file type");
        //         }
        // }



        // if (tuningRun)
        // {
        //         // Run Tuning Operation
        //         for (size_t i = 0; i < operations.size(); ++i)
        //         {
        //         ret = h_ClmlIntf->clTuneMLOpQCOM(queue, operations[i], descriptorSet, tuning_cache, NULL);
        //         CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);
        //         }
        // }


        // if (tuningRun)
        // {
        //         size_t                  cacheLenBytes       = 0;
        //         size_t                  lenRet              = 0;
        //         CLML_SDK_TEST_AND_EXIT(cacheFileName);
        //         ret = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, NULL, &cacheLenBytes);
        //         CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        //         std::vector<unsigned char> savedCache(cacheLenBytes, 0);
        //         ret = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, savedCache.size(), savedCache.data(), &lenRet);
        //         CLML_SDK_TEST_AND_EXIT(ret == CL_SUCCESS);

        //         std::ofstream cache_out(cacheFileName, std::ios_base::binary);
        //         if (cache_out){
        //         cache_out.write(reinterpret_cast<char*>(savedCache.data()), savedCache.size());
        //         cache_out.close();
        //         }
        //         h_ClmlIntf->clReleaseMLTuningCacheQCOM(tuning_cache);
        // }

        clReleaseCommandQueue(queue);
        clReleaseContext(context);
        clReleaseDevice(device_id);

        return 0;
}

Please ignore the “end to complete” times — they look bogus. The real question: why does the GEMM take ~750 ms from start to end?

A size: 2097152
B size: 2097152
C size: 2097152
enqueue write A 
        queue -> submit: 0.198912 ms
        submit -> start: 0.039168 ms
        start -> end: 0.028928 ms
        end -> complete: 16723141762754.050781 ms
enqueue write B 
        queue -> submit: 0.133888 ms
        submit -> start: 0.026112 ms
        start -> end: 0.022016 ms
        end -> complete: 16723141762753.015625 ms
enqueue write C 
        queue -> submit: 0.540160 ms
        submit -> start: 0.038912 ms
        start -> end: 0.023040 ms
        end -> complete: 16723141762750.626953 ms
gemm time 1
gemm 2
        queue -> submit: 2.874880 ms
        submit -> start: 3.655936 ms
        start -> end: 767.124992 ms
        end -> complete: 16723141761976.927734 ms
gemm time 2
gemm 2
        queue -> submit: 0.000000 ms
        submit -> start: 0.337152 ms
        start -> end: 777.488896 ms
        end -> complete: 16723141761198.480469 ms