Commit af8fc3e2 authored by Vadim Pisarevsky

Merge pull request #9604 from alalek:ocl_kernel_profiling

parents 22467599 e5890fc9
@@ -333,8 +333,12 @@ public:
void* ptr() const;
static Queue& getDefault();
/// @brief Returns an OpenCL command queue with profiling mode enabled
const Queue& getProfilingQueue() const;
struct Impl; friend struct Impl;
inline Impl* getImpl() const { return p; }
protected:
struct Impl;
Impl* p;
};
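For readers new to this API, a minimal usage sketch (not part of the patch, assuming the usual cv::ocl setup): the default queue is created without CL_QUEUE_PROFILING_ENABLE, so getProfilingQueue() lazily creates a second queue on the same context and device with profiling enabled, or returns the queue itself if it already has the flag.

#include <opencv2/core/ocl.hpp>

void profilingQueueDemo()
{
    cv::ocl::Queue& q = cv::ocl::Queue::getDefault();
    // Shares the context/device of q; can be passed to Kernel::run()
    // or used implicitly via Kernel::runProfiling().
    const cv::ocl::Queue& profQ = q.getProfilingQueue();
    (void)profQ;
}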
@@ -569,6 +573,12 @@ public:
size_t localsize[], bool sync, const Queue& q=Queue());
bool runTask(bool sync, const Queue& q=Queue());
/** @brief Similar to a synchronized run() call, but additionally returns the kernel execution time.
* A separate OpenCL command queue may be used (created with CL_QUEUE_PROFILING_ENABLE).
* @return Execution time in nanoseconds, or a negative number on error
*/
int64 runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q=Queue());
size_t workGroupSize() const;
size_t preferedWorkGroupSizeMultiple() const;
bool compileWorkGroupSize(size_t wsz[]) const;
......
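As a usage illustration (a sketch only; the Kernel object and sizes below are placeholders, not part of this change), timing a 2D kernel with the new call could look like this:

#include <opencv2/core/ocl.hpp>
#include <cstdio>

void timeKernel(cv::ocl::Kernel& k, size_t width, size_t height)
{
    size_t globalsize[2] = { width, height };
    // Synchronous launch on a profiling-enabled queue; passing NULL for
    // localsize lets the implementation choose the work-group size.
    int64 timeNs = k.runProfiling(2, globalsize, NULL);
    if (timeNs >= 0)
        printf("kernel time: %.3f ms\n", timeNs * 1e-6);
    else
        printf("kernel launch failed or profiling unavailable\n");
}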
@@ -1840,9 +1840,35 @@ void initializeContextFromHandle(Context& ctx, void* platform, void* _context, v
struct Queue::Impl
{
Impl(const Context& c, const Device& d)
inline void __init()
{
refcount = 1;
handle = 0;
isProfilingQueue_ = false;
}
Impl(cl_command_queue q)
{
__init();
handle = q;
cl_command_queue_properties props = 0;
cl_int result = clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL);
CV_Assert(result == CL_SUCCESS && "clGetCommandQueueInfo(CL_QUEUE_PROPERTIES)");
isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
}
Impl(cl_command_queue q, bool isProfilingQueue)
{
__init();
handle = q;
isProfilingQueue_ = isProfilingQueue;
}
Impl(const Context& c, const Device& d, bool withProfiling = false)
{
__init();
const Context* pc = &c;
cl_context ch = (cl_context)pc->ptr();
if( !ch )
@@ -1854,8 +1880,10 @@ struct Queue::Impl
if( !dh )
dh = (cl_device_id)pc->device(0).ptr();
cl_int retval = 0;
handle = clCreateCommandQueue(ch, dh, 0, &retval);
cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
handle = clCreateCommandQueue(ch, dh, props, &retval);
CV_OclDbgAssert(retval == CL_SUCCESS);
isProfilingQueue_ = withProfiling;
}
~Impl()
@@ -1873,9 +1901,37 @@ struct Queue::Impl
}
}
const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
{
if (isProfilingQueue_)
return self;
if (profiling_queue_.ptr())
return profiling_queue_;
cl_context ctx = 0;
CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
cl_device_id device = 0;
CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
cl_int result = CL_SUCCESS;
cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
CV_Assert(result == CL_SUCCESS && "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
Queue queue;
queue.p = new Impl(q, true);
profiling_queue_ = queue;
return profiling_queue_;
}
IMPLEMENT_REFCOUNTABLE();
cl_command_queue handle;
bool isProfilingQueue_;
cv::ocl::Queue profiling_queue_;
};
Queue::Queue()
@@ -1929,6 +1985,12 @@ void Queue::finish()
}
}
const Queue& Queue::getProfilingQueue() const
{
CV_Assert(p);
return p->getProfilingQueue(*this);
}
void* Queue::ptr() const
{
return p ? p->handle : 0;
@@ -2032,6 +2094,9 @@ struct Kernel::Impl
release();
}
bool run(int dims, size_t _globalsize[], size_t _localsize[],
bool sync, int64* timeNS, const Queue& q);
~Impl()
{
if(handle)
@@ -2259,19 +2324,15 @@ int Kernel::set(int i, const KernelArg& arg)
return i+1;
}
bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
bool sync, const Queue& q)
{
CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str());
if(!p || !p->handle || p->isInProgress)
if (!p)
return false;
cl_command_queue qq = getQueue(q);
size_t globalsize[CV_MAX_DIM] = {1,1,1};
size_t total = 1;
CV_Assert(_globalsize != 0);
CV_Assert(_globalsize != NULL);
for (int i = 0; i < dims; i++)
{
size_t val = _localsize ? _localsize[i] :
@@ -2283,12 +2344,28 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
}
CV_Assert(total > 0);
if( p->haveTempDstUMats )
return p->run(dims, globalsize, _localsize, sync, NULL, q);
}
bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
bool sync, int64* timeNS, const Queue& q)
{
CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
if (!handle || isInProgress)
return false;
cl_command_queue qq = getQueue(q);
if (haveTempDstUMats)
sync = true;
if (timeNS)
sync = true;
cl_event asyncEvent = 0;
cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
NULL, globalsize, _localsize, 0, 0,
sync ? 0 : &asyncEvent);
cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
NULL, globalsize, localsize, 0, 0,
(sync && !timeNS) ? 0 : &asyncEvent);
#if CV_OPENCL_SHOW_RUN_ERRORS
if (retval != CL_SUCCESS)
{
@@ -2296,16 +2373,31 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
fflush(stdout);
}
#endif
if( sync || retval != CL_SUCCESS )
if (sync || retval != CL_SUCCESS)
{
CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
p->cleanupUMats();
if (timeNS)
{
if (retval == CL_SUCCESS)
{
clWaitForEvents(1, &asyncEvent);
cl_ulong startTime, stopTime;
CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
*timeNS = (int64)(stopTime - startTime);
}
else
{
p->addref();
p->isInProgress = true;
CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
*timeNS = -1;
}
}
cleanupUMats();
}
else
{
addref();
isInProgress = true;
CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this) == CL_SUCCESS);
}
if (asyncEvent)
clReleaseEvent(asyncEvent);
@@ -2336,6 +2428,17 @@ bool Kernel::runTask(bool sync, const Queue& q)
return retval == CL_SUCCESS;
}
int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
{
CV_Assert(p && p->handle && !p->isInProgress);
Queue q = q_.ptr() ? q_ : Queue::getDefault();
CV_Assert(q.ptr());
q.finish(); // call clFinish() on base queue
Queue profilingQueue = q.getProfilingQueue();
int64 timeNs = -1;
bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
return res ? timeNs : -1;
}
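Design note: the q.finish() on the caller's queue before switching to the profiling queue presumably guarantees that work already enqueued on the base queue (for example, uploads producing this kernel's inputs) has completed, so the measured interval covers only the profiled launch. runProfiling() is always synchronous, since the start/end timestamps are read from the completed event.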
size_t Kernel::workGroupSize() const
{
......
@@ -759,7 +759,7 @@ OCL_FUNC_P(cl_mem, clCreateBuffer,
/*
OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
*/
OCL_FUNC(cl_int, clGetCommandQueueInfo,
(cl_command_queue command_queue,
cl_command_queue_info param_name,
@@ -767,7 +767,7 @@ OCL_FUNC(cl_int, clGetCommandQueueInfo,
void * param_value,
size_t * param_value_size_ret),
(command_queue, param_name, param_value_size, param_value, param_value_size_ret))
/*
OCL_FUNC_P(cl_mem, clCreateSubBuffer,
(cl_mem buffer,
cl_mem_flags flags,
@@ -1202,6 +1202,19 @@ OCL_FUNC(cl_int, clSetEventCallback,
OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
OCL_FUNC(cl_int, clWaitForEvents,
(cl_uint num_events, const cl_event *event_list),
(num_events, event_list))
OCL_FUNC(cl_int, clGetEventProfilingInfo, (
cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret),
(event, param_name, param_value_size, param_value, param_value_size_ret))
}
#endif
......