Merge pull request #1189 from pengx17:2.4_sort_by_key

95bdd4b6 · Roman Donchenko · OpenCV Buildbot · a7da1299 · 8b90cb37 · 95bdd4b6
Commit 95bdd4b6 authored Aug 07, 2013 by Roman Donchenko Committed by OpenCV Buildbot Aug 07, 2013
7 changed files
--- a/modules/ocl/doc/operations_on_matrices.rst
+++ b/modules/ocl/doc/operations_on_matrices.rst
@@ -481,4 +481,40 @@ Performs generalized matrix multiplication.
            * **GEMM_1_T** transpose  ``src1``
            * **GEMM_2_T** transpose  ``src2``

-.. seealso:: :ocv:func:`gemm`
\ No newline at end of file
+.. seealso:: :ocv:func:`gemm`
+
+ocl::sortByKey
+------------------
+Sorts the values according to their keys; the function itself returns void.
+
+.. ocv:function:: void ocl::sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false)
+
+    :param keys:   The keys to be used as sorting indices.
+
+    :param values: The array of values.
+
+    :param isGreaterThan: Determine sorting order.
+
+    :param method: supported sorting methods:
+            * **SORT_BITONIC**   bitonic sort, only supports power-of-2 buffer sizes
+            * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys
+            * **SORT_MERGE**     merge sort
+            * **SORT_RADIX**     radix sort, only supports signed int/float keys (``CV_32S``/``CV_32F``)
+            
+Returns the sorted result of all the elements in values based on equivalent keys.
+
+The element unit in the values to be sorted is determined from the data type,
+i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless of its matrix dimension.
+
+Both keys and values will be sorted in place.
+
+Keys need to be a **single**-channel ``oclMat``.
+
+Example::
+    input -
+    keys   = {2,    3,   1}   (CV_8UC1)
+    values = {10,5, 4,3, 6,2} (CV_8UC2)
+    sortByKey(keys, values, SORT_SELECTION, false);
+    output -
+    keys   = {1,    2,   3}   (CV_8UC1)
+    values = {6,2, 10,5, 4,3} (CV_8UC2)
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -1673,6 +1673,31 @@ namespace cv
            oclMat diff_buf;
            oclMat norm_buf;
        };
+        // Sorting methods currently supported by sortByKey (passed as its `method` argument).
+        enum
+        {
+            SORT_BITONIC,   // bitonic sort; only supports power-of-2 buffer sizes
+            SORT_SELECTION, // selection sort; cannot sort duplicate keys
+            SORT_MERGE,     // merge sort
+            SORT_RADIX      // radix sort; only supports signed int/float keys (CV_32S/CV_32F)
+        };
+        //! Sorts all the elements in values according to their corresponding keys.
+        //
+        //  The element unit in the values to be sorted is determined from the data type,
+        //  i.e., a CV_32FC2 input {a1a2, b1b2} will be considered as two elements, regardless of its
+        //  matrix dimension.
+        //  Both keys and values are sorted in place; method is one of the SORT_* constants.
+        //  keys needs to be a single-channel oclMat.
+        //
+        //  Example (isGreaterThan = false yields ascending keys):
+        //  input -
+        //    keys   = {2,    3,   1}   (CV_8UC1)
+        //    values = {10,5, 4,3, 6,2} (CV_8UC2)
+        //  sortByKey(keys, values, SORT_SELECTION, false);
+        //  output -
+        //    keys   = {1,    2,   3}   (CV_8UC1)
+        //    values = {6,2, 10,5, 4,3} (CV_8UC2)
+        void CV_EXPORTS sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false);
    }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200

--- a/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 
+
+#ifndef N   // number of key bits handled per radix pass (the kernels use 1 << N radices)
+#define N 4
+#endif
+
+#ifndef K_T
+#define K_T float
+#endif
+
+#ifndef V_T
+#define V_T float
+#endif
+
+#ifndef IS_GT
+#define IS_GT 0
+#endif
+
+
+// from Thrust::b40c, link:
+// https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h
+__inline uint convertKey(uint converted_key)
+{
+    // Rewrites the raw bit pattern of a key, depending on which key-type macro
+    // (K_FLT / K_INT) the program was built with, before radix digits are extracted.
+#if defined(K_FLT)
+    // Float keys: flip every bit when the sign bit is set, otherwise flip only the sign bit.
+    const uint flipBits = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000;
+    return converted_key ^ flipBits;
+#elif defined(K_INT)
+    // Signed int keys: toggle the sign bit.
+    const uint signBit = 1u << ((sizeof(int) * 8) - 1);
+    return converted_key ^ signBit;
+#else
+    // No key-type macro defined: the bits are used unchanged.
+    return converted_key;
+#endif
+}
+
+//FIXME(pengx17): 
+// exclusive scan, need to be optimized as this is too naive...
+kernel
+    void naiveScanAddition(
+    __global int * input,
+    __global int * output,
+    int size
+    )
+{
+    // Exclusive prefix sum: output[i] = input[0] + ... + input[i - 1], output[0] = 0.
+    // The whole scan is done serially by the single work item with global id 0;
+    // every other work item returns immediately.
+    if(get_global_id(0) != 0)
+    {
+        return;
+    }
+    // Nothing to write for an empty buffer (the unguarded store to output[0]
+    // would otherwise be out of bounds).
+    if(size <= 0)
+    {
+        return;
+    }
+
+    int running = 0;
+    output[0] = 0;
+    for(int i = 1; i < size; i ++)
+    {
+        running  += input[i - 1];
+        output[i] = running;
+    }
+}
+
+// following is ported from
+// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl
+kernel
+    void histogramRadixN (
+    __global K_T* unsortedKeys,
+    __global int * buckets,
+    uint shiftCount
+    )
+{
+    // Builds, per work item, a histogram of the N-bit digit found at bit offset
+    // shiftCount in each of the keys the work item owns, then stores it to buckets.
+    const int RADIX_T     = N;
+    const int RADICES_T   = (1 << RADIX_T);
+    const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T;
+    const int MASK_T      = (1 << RADIX_T) - 1;
+
+    // One counter per radix. Sized from N (instead of a literal 16) so the
+    // array cannot overflow when the program is built with N > 4.
+    int localBuckets[1 << N];
+    for(int i = 0; i < RADICES_T; ++i)
+    {
+        localBuckets[i] = 0;
+    }
+
+    int globalId    = get_global_id(0);
+    int numOfGroups = get_num_groups(0);
+
+    /* Calculate thread-histograms */
+    for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
+    {
+        uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)]));
+        value = (value >> shiftCount) & MASK_T;
+#if IS_GT
+        // Descending order: mirror the digit so larger digits land in lower buckets.
+        localBuckets[RADICES_T - value - 1]++;
+#else
+        localBuckets[value]++;
+#endif
+    }
+
+    // Store radix-major: all work items' counts for radix i are contiguous.
+    // NOTE(review): the row stride RADICES_T * numOfGroups equals the global size
+    // only if the work-group size is RADICES_T -- confirm against the host code.
+    for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
+    {
+        buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i];
+    }
+}
+
+kernel
+    void permuteRadixN (
+    __global K_T*  unsortedKeys,
+    __global V_T*  unsortedVals,
+    __global int* scanedBuckets,
+    uint shiftCount,
+    __global K_T*  sortedKeys,
+    __global V_T*  sortedVals
+    )
+{
+    // Scatters each key/value pair owned by this work item to the output slot
+    // given by the scanned bucket offsets for the current N-bit digit.
+    const int RADIX_T     = N;
+    const int RADICES_T   = (1 << RADIX_T);
+    const int MASK_T = (1<<RADIX_T)  -1;
+
+    int globalId  = get_global_id(0);
+    int numOfGroups = get_num_groups(0);
+    // Row stride of the bucket table; same value as RADICES_T * numOfGroups.
+    const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N;
+
+    // Next free output slot per radix. Sized from N (instead of a literal 16)
+    // so the array cannot overflow when the program is built with N > 4.
+    int  localIndex[1 << N];
+
+    /* Load this work item's start offsets into private memory */
+    for(int i = 0; i < RADICES_T; ++i)
+    {
+#if IS_GT
+        localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
+#else
+        localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
+#endif
+    }
+    /* Permute elements to appropriate location */
+    for(int i = 0; i < RADICES_T; ++i)
+    {
+        int old_idx = mad24(globalId, RADICES_T, i);
+        K_T  ovalue = unsortedKeys[old_idx];
+        uint value = convertKey(as_uint(ovalue));
+        uint maskedValue = (value >> shiftCount) & MASK_T;
+        uint index = localIndex[maskedValue];
+        sortedKeys[index] = ovalue;
+        sortedVals[index] = unsortedVals[old_idx];
+        // The slot is consumed; the next key with this digit goes right after it.
+        localIndex[maskedValue] = index + 1;
+    }
+}
--- a/modules/ocl/src/opencl/kernel_sort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_sort_by_key.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef K_T
+#define K_T float
+#endif
+
+#ifndef V_T
+#define V_T float
+#endif
+
+// IS_GT selects the sort direction at build time. The default is the integer 0
+// (not the identifier `false`, which the preprocessor only treats as 0 because
+// it is an unknown identifier inside #if).
+#ifndef IS_GT
+#define IS_GT 0
+#endif
+
+// my_comp(x, y): strict "x comes before y" test used by every kernel below.
+#if IS_GT
+#define my_comp(x,y) ((x) > (y))
+#else
+#define my_comp(x,y) ((x) < (y))
+#endif
+
+/////////////////////// Bitonic sort ////////////////////////////
+// ported from 
+// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl
+__kernel
+    void bitonicSort
+    (
+        __global K_T * keys,
+        __global V_T * vals,
+        int count,
+        int stage,
+        int passOfStage
+    )
+{
+    // One compare-exchange step of a bitonic sorting network: each work item
+    // handles one pair of elements that are pairDistance apart.
+    const int threadId = get_global_id(0);
+    if(threadId >= count / 2)
+    {
+        return;
+    }
+    const int pairDistance = 1 << (stage - passOfStage);
+    const int blockWidth   = 2 * pairDistance;
+
+    // Positions the pair is read from.
+    const int srcLeft  = min( (threadId % pairDistance)
+                            + (threadId / pairDistance) * blockWidth, count );
+    const int srcRight = min( srcLeft + pairDistance, count );
+
+    const V_T lval = vals[srcLeft];
+    const V_T rval = vals[srcRight];
+
+    const K_T lkey = keys[srcLeft];
+    const K_T rkey = keys[srcRight];
+
+    // Every other block of (1 << stage) work items sorts in the opposite
+    // direction, which is expressed by exchanging the two destinations.
+    const int sameDirectionBlockWidth = 1 << stage;
+    int leftId  = srcLeft;
+    int rightId = srcRight;
+    if((threadId/sameDirectionBlockWidth) % 2 == 1)
+    {
+        leftId  = srcRight;
+        rightId = srcLeft;
+    }
+
+    // Decide which element goes to which destination, then store both.
+    K_T keyForLeft  = lkey;
+    K_T keyForRight = rkey;
+    V_T valForLeft  = lval;
+    V_T valForRight = rval;
+    if(!my_comp(lkey, rkey))
+    {
+        keyForLeft  = rkey;
+        keyForRight = lkey;
+        valForLeft  = rval;
+        valForRight = lval;
+    }
+
+    keys[rightId] = keyForRight;
+    keys[leftId]  = keyForLeft;
+    vals[rightId] = valForRight;
+    vals[leftId]  = valForLeft;
+}
+
+/////////////////////// Selection sort ////////////////////////////
+//kernel is ported from Bolt library:
+//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
+__kernel
+    void selectionSortLocal
+    (
+        __global K_T * keys,
+        __global V_T * vals,
+        const int count,
+        __local  K_T * scratch
+    )
+{
+    // Rank sort inside one work group: every work item counts how many keys of
+    // its group's chunk must precede its own key and writes its key/value pair
+    // to that rank.
+    const int lid         = get_local_id(0);   // index in workgroup
+    const int numOfGroups = get_num_groups(0); // number of workgroups
+    const int groupID     = get_group_id(0);
+    const int wg          = get_local_size(0); // workgroup size = block size
+
+    // Rebase both arrays onto the chunk owned by this work group.
+    const int offset = groupID * wg;
+    vals += offset;
+    keys += offset;
+
+    // Number of elements in this chunk; the last work group may own fewer than wg.
+    const int n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
+
+    // Work items beyond the chunk still load (a clamped element) and reach the
+    // barrier, so that all work items of the group execute it.
+    const int clamped_i = min(lid, n - 1);
+
+    const K_T key1 = keys[clamped_i];
+    const V_T val1 = vals[clamped_i];
+    scratch[lid] = key1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(lid >= n)
+    {
+        return;
+    }
+
+    // Rank = number of keys ordered before key1, plus the number of equal keys
+    // held by lower-indexed work items. The index tie-break gives every work
+    // item a distinct slot, so equal keys no longer overwrite each other's values.
+    int pos = 0;
+    for (int j = 0; j < n; ++j)
+    {
+        const K_T key2 = scratch[j];
+        if(my_comp(key2, key1))
+        {
+            pos++;
+        }
+        else if(!my_comp(key1, key2) && j < lid)
+        {
+            // key1 and key2 are same; the earlier element keeps the earlier slot
+            pos++;
+        }
+    }
+
+    vals[pos] = val1;
+    keys[pos] = key1;
+}
+__kernel
+    void selectionSortFinal
+    (
+        __global K_T * keys,
+        __global V_T * vals,
+        const int count
+    )
+{
+    // Computes the global rank of each element by scanning every
+    // work-group-sized chunk of keys, then writes the key/value pair to that rank.
+    // The early `break` below presumably relies on each chunk having already been
+    // sorted (e.g. by selectionSortLocal) -- verify against the host code.
+    // NOTE(review): keys/vals are read and written in place; reads by one work
+    // group can observe writes of another. Confirm how the host avoids this.
+    const int lid         = get_local_id(0);   // index in workgroup
+    const int numOfGroups = get_num_groups(0); // number of workgroups
+    const int wg          = get_local_size(0); // workgroup size = block size
+    const int offset      = get_group_id(0) * wg;
+    const int self        = offset + lid;      // global index of this element
+    const int remainder   = count - wg*(numOfGroups-1); // size of the last chunk
+
+    if(self >= count)
+        return;
+    const V_T val1 = vals[self];
+    const K_T key1 = keys[self];
+
+    int pos = 0;
+    for(int j = 0; j < numOfGroups; j++)
+    {
+        const int base = j * wg;
+        const int len  = (j == numOfGroups - 1) ? remainder : wg;
+        for(int k = 0; k < len; k++)
+        {
+            const int idx  = base + k;
+            const K_T key2 = keys[idx];
+            if(my_comp(key1, key2))
+                break; // everything further in this chunk is ordered after key1
+
+            // Count keys ordered before key1, plus equal keys that sit at a
+            // lower global index. The tie-break gives duplicates distinct
+            // ranks instead of letting them overwrite each other's values.
+            if(my_comp(key2, key1) || idx < self)
+                pos++;
+        }
+    }
+
+    vals[pos] = val1;
+    keys[pos] = key1;
+}
--- a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_stablesort_by_key.cl
--- a/modules/ocl/src/sort_by_key.cpp
+++ b/modules/ocl/src/sort_by_key.cpp
--- a/modules/ocl/test/test_sort.cpp
+++ b/modules/ocl/test/test_sort.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <map>
+#include <functional>
+#include "precomp.hpp"
+
+using namespace std;
+using namespace cvtest;
+using namespace testing;
+using namespace cv;
+
+
+namespace
+{
+IMPLEMENT_PARAM_CLASS(IsGreaterThan, bool)
+IMPLEMENT_PARAM_CLASS(InputSize, int)
+IMPLEMENT_PARAM_CLASS(SortMethod, int)
+
+
+// Maps a C++ element type to the OpenCV matrix type constant used to index the
+// sorter table. The primary template returns 0 for types without a specialization.
+template<class T>
+struct KV_CVTYPE
+{
+    static int toType() { return 0; }
+};
+
+template<>
+struct KV_CVTYPE<int>
+{
+    static int toType() { return CV_32SC1; }
+};
+
+template<>
+struct KV_CVTYPE<float>
+{
+    static int toType() { return CV_32FC1; }
+};
+
+template<>
+struct KV_CVTYPE<Vec2i>
+{
+    static int toType() { return CV_32SC2; }
+};
+
+template<>
+struct KV_CVTYPE<Vec2f>
+{
+    static int toType() { return CV_32FC2; }
+};
+
+// Ordering predicate for descending sorts: compares the keys only, values are ignored.
+template<class key_type, class val_type>
+bool kvgreater(pair<key_type, val_type> p1, pair<key_type, val_type> p2)
+{
+    const bool firstKeyIsLarger = p1.first > p2.first;
+    return firstKeyIsLarger;
+}
+
+// Ordering predicate for ascending sorts: compares the keys only, values are ignored.
+template<class key_type, class val_type>
+bool kvless(pair<key_type, val_type> p1, pair<key_type, val_type> p2)
+{
+    const bool firstKeyIsSmaller = p1.first < p2.first;
+    return firstKeyIsSmaller;
+}
+
+// Replaces the contents of kvres with vecSize (key, value) pairs read by
+// advancing the two matrix iterators in lock-step.
+template<class key_type, class val_type>
+void toKVPair(
+    MatConstIterator_<key_type> kit,
+    MatConstIterator_<val_type> vit,
+    int vecSize,
+    vector<pair<key_type, val_type> >& kvres
+    )
+{
+    kvres.clear();
+    int remaining = vecSize;
+    while(remaining-- > 0)
+    {
+        kvres.push_back(pair<key_type, val_type>(*kit, *vit));
+        ++kit;
+        ++vit;
+    }
+}
+
+// Host reference sorter: sorts the first row of keys and vals in place by key,
+// using std::sort on (key, value) pairs. isGreater selects descending order.
+template<class key_type, class val_type>
+void kvquicksort(Mat& keys, Mat& vals, bool isGreater = false)
+{
+    typedef pair<key_type, val_type> kv_pair;
+    const int n = keys.cols;
+
+    // Gather the two rows into one vector of pairs.
+    vector<kv_pair> kvres;
+    toKVPair(keys.begin<key_type>(), vals.begin<val_type>(), n, kvres);
+
+    // Pick the key comparator for the requested direction, then sort.
+    bool (*compare)(kv_pair, kv_pair) = kvless<key_type, val_type>;
+    if(isGreater)
+    {
+        compare = kvgreater<key_type, val_type>;
+    }
+    std::sort(kvres.begin(), kvres.end(), compare);
+
+    // Scatter the sorted pairs back into the matrices.
+    key_type * const kptr = keys.ptr<key_type>();
+    val_type * const vptr = vals.ptr<val_type>();
+    for(int i = 0; i < n; ++i)
+    {
+        const kv_pair& kv = kvres[i];
+        kptr[i] = kv.first;
+        vptr[i] = kv.second;
+    }
+}
+
+// Dispatches to the kvquicksort instantiation matching the (key type, value type)
+// of the matrices. A single private instance owns the dispatch table.
+class SortByKey_STL
+{
+public:
+    // Sorts keys and vals in place on the host; is_gt selects descending order.
+    static void sort(cv::Mat&, cv::Mat&, bool is_gt);
+private:
+    typedef void (*quick_sorter)(cv::Mat&, cv::Mat&, bool);
+    SortByKey_STL();
+    // Indexed by [key matrix type][value matrix type]; unregistered entries are null.
+    quick_sorter quick_sorters[CV_64FC4][CV_64FC4];
+    static SortByKey_STL instance;
+};
+
+// Constructed directly by the private constructor (no temporary, no copy).
+SortByKey_STL SortByKey_STL::instance;
+
+SortByKey_STL::SortByKey_STL()
+{
+    // Fill the table of the object under construction. Writing through
+    // `instance` here was only correct when the copy from a temporary was elided.
+    memset(quick_sorters, 0, sizeof(quick_sorters));
+#define NEW_SORTER(KT, VT) \
+    quick_sorters[KV_CVTYPE<KT>::toType()][KV_CVTYPE<VT>::toType()] = kvquicksort<KT, VT>;
+
+    NEW_SORTER(int, int);
+    NEW_SORTER(int, Vec2i);
+    NEW_SORTER(int, float);
+    NEW_SORTER(int, Vec2f);
+
+    NEW_SORTER(float, int);
+    NEW_SORTER(float, Vec2i);
+    NEW_SORTER(float, float);
+    NEW_SORTER(float, Vec2f);
+#undef NEW_SORTER
+}
+
+void SortByKey_STL::sort(cv::Mat& keys, cv::Mat& vals, bool is_gt)
+{
+    // Only the type combinations registered in the constructor have a sorter;
+    // any other combination would call through a null pointer.
+    instance.quick_sorters[keys.type()][vals.type()](keys, vals, is_gt);
+}
+
+// Checks the values produced by an unstable sort against the reference values.
+//
+// gkeys_/gvals_ are the reference (already sorted) keys and values, dvals_ the
+// values to verify; all are read as a single row of gkeys_.cols elements.
+// Within a run of identical reference keys the order of values is not defined,
+// so for every such run the scalar components of both value sets are sorted
+// and must then match exactly. Returns true when every run matches.
+bool checkUnstableSorterResult(const Mat& gkeys_, const Mat& gvals_,
+                               const Mat& /*dkeys_*/, const Mat& dvals_)
+{
+    const int cn_val = gvals_.channels();
+    const int count  = gkeys_.cols;
+
+    //for convenience we convert depth to float and channels to 1
+    Mat gkeys, gvals, dvals;
+    gkeys_.reshape(1).convertTo(gkeys, CV_32F);
+    gvals_.reshape(1).convertTo(gvals, CV_32F);
+    dvals_.reshape(1).convertTo(dvals, CV_32F);
+    const float * gkptr = gkeys.ptr<float>();
+    float * gvptr = gvals.ptr<float>();
+    float * dvptr = dvals.ptr<float>();
+
+    // Walk the keys run by run; the loop also reaches the last element.
+    for(int i = 0; i < count; )
+    {
+        // length of the run of identical keys starting at i (bounded by count)
+        int run = 1;
+        while(i + run < count && gkptr[i + run - 1] == gkptr[i + run])
+        {
+            ++run;
+        }
+
+        // sort the scalar components belonging to this run in both value sets
+        const int num_of_val = run * cn_val;
+        float * const gbegin = gvptr + i * cn_val;
+        float * const dbegin = dvptr + i * cn_val;
+        std::sort(gbegin, gbegin + num_of_val);
+        std::sort(dbegin, dbegin + num_of_val);
+
+        // then compare exactly the range that was just sorted
+        for(int j = 0; j < num_of_val; ++j)
+        {
+            if(gbegin[j] != dbegin[j])
+            {
+                return false;
+            }
+        }
+        i += run;
+    }
+    return true;
+}
+}
+
+#define INPUT_SIZES  Values(InputSize(0x10), InputSize(0x100), InputSize(0x10000)) //2^4, 2^8, 2^16
+#define KEY_TYPES    Values(MatType(CV_32SC1), MatType(CV_32FC1))
+#define VAL_TYPES    Values(MatType(CV_32SC1), MatType(CV_32SC2), MatType(CV_32FC1), MatType(CV_32FC2))
+#define SORT_METHODS Values(SortMethod(cv::ocl::SORT_BITONIC),SortMethod(cv::ocl::SORT_MERGE),SortMethod(cv::ocl::SORT_RADIX)/*,SortMethod(cv::ocl::SORT_SELECTION)*/)
+#define F_OR_T       Values(IsGreaterThan(false), IsGreaterThan(true))
+
+// Fixture parameterised on input length, key type, value type, sort method and
+// sort direction. SetUp fills one-row key and value matrices with random data.
+PARAM_TEST_CASE(SortByKey, InputSize, MatType, MatType, SortMethod, IsGreaterThan)
+{
+    InputSize input_size;
+    MatType key_type;
+    MatType val_type;
+    SortMethod method;
+    IsGreaterThan is_gt;
+
+    Mat mat_key;
+    Mat mat_val;
+
+    virtual void SetUp()
+    {
+        using namespace cv;
+
+        input_size = GET_PARAM(0);
+        key_type   = GET_PARAM(1);
+        val_type   = GET_PARAM(2);
+        method     = GET_PARAM(3);
+        is_gt      = GET_PARAM(4);
+
+        // keys are generated first, then values, both as 1 x input_size rows
+        const Size rowSize(input_size, 1);
+        mat_key = randomMat(rowSize, key_type, INT_MIN, INT_MAX);
+        mat_val = randomMat(rowSize, val_type, INT_MIN, INT_MAX);
+    }
+};
+
+// Sorts the same data on the device and on the host and compares the results:
+// keys must match exactly, values must match up to reordering within equal keys.
+TEST_P(SortByKey, Accuracy)
+{
+    using namespace cv;
+
+    // Device copies are taken before the host matrices are sorted in place.
+    ocl::oclMat oclmat_key(mat_key);
+    ocl::oclMat oclmat_val(mat_val);
+
+    // Host reference result.
+    SortByKey_STL::sort(mat_key, mat_val, is_gt);
+
+    // Device result.
+    ocl::sortByKey(oclmat_key, oclmat_val, method, is_gt);
+
+    EXPECT_MAT_NEAR(mat_key, oclmat_key, 0.0);
+    EXPECT_TRUE(checkUnstableSorterResult(mat_key, mat_val, oclmat_key, oclmat_val));
+}
+INSTANTIATE_TEST_CASE_P(OCL_SORT, SortByKey, Combine(INPUT_SIZES, KEY_TYPES, VAL_TYPES, SORT_METHODS, F_OR_T));