Merge pull request #1663 from vpisarev:ocl_experiments3

616db74e · Andrey Pavlenko · OpenCV Buildbot · 31f0ab6c · 485d36d3 · 616db74e
Commit 616db74e authored Oct 29, 2013 by Andrey Pavlenko Committed by OpenCV Buildbot Oct 29, 2013
25 changed files
--- a/cmake/OpenCVDetectPython.cmake
+++ b/cmake/OpenCVDetectPython.cmake
@@ -12,7 +12,10 @@ if(WIN32 AND NOT PYTHON_EXECUTABLE)
    )
  endforeach()
 endif()
+find_host_package(PythonInterp 2.7)
+if(NOT PYTHONINTERP_FOUND)
 find_host_package(PythonInterp "${MIN_VER_PYTHON}")
+endif()
 unset(HAVE_SPHINX CACHE)

--- a/modules/core/doc/operations_on_arrays.rst
+++ b/modules/core/doc/operations_on_arrays.rst
@@ -378,7 +378,7 @@ Calculates the covariance matrix of a set of vectors.
 .. ocv:function:: void calcCovarMatrix( const Mat* samples, int nsamples, Mat& covar, Mat& mean, int flags, int ctype=CV_64F)
-.. ocv:function:: void calcCovarMatrix( InputArray samples, OutputArray covar, OutputArray mean, int flags, int ctype=CV_64F)
+.. ocv:function:: void calcCovarMatrix( InputArray samples, OutputArray covar, InputOutputArray mean, int flags, int ctype=CV_64F)
 .. ocv:pyfunction:: cv2.calcCovarMatrix(samples, flags[, covar[, mean[, ctype]]]) -> covar, mean

--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -158,6 +158,9 @@ enum { REDUCE_SUM = 0,
 //! swaps two matrices
 CV_EXPORTS void swap(Mat& a, Mat& b);
+//! swaps two umatrices
+CV_EXPORTS void swap( UMat& a, UMat& b );
 //! 1D interpolation function: returns coordinate of the "donor" pixel for the specified location p.
 CV_EXPORTS_W int borderInterpolate(int p, int len, int borderType);
@@ -439,7 +442,7 @@ CV_EXPORTS void calcCovarMatrix( const Mat* samples, int nsamples, Mat& covar, M
 //! computes covariation matrix of a set of samples
 CV_EXPORTS_W void calcCovarMatrix( InputArray samples, OutputArray covar,
-                                   OutputArray mean, int flags, int ctype = CV_64F);
+                                   InputOutputArray mean, int flags, int ctype = CV_64F);
 CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
                             OutputArray eigenvectors, int maxComponents = 0);

--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -472,6 +472,9 @@ class CV_EXPORTS RNG;
 class CV_EXPORTS Mat;
 class CV_EXPORTS MatExpr;
+class CV_EXPORTS UMat;
+class CV_EXPORTS UMatExpr;
 class CV_EXPORTS SparseMat;
 typedef Mat MatND;

--- a/modules/core/include/opencv2/core/cuda.inl.hpp
+++ b/modules/core/include/opencv2/core/cuda.inl.hpp
@@ -595,7 +595,7 @@ namespace cv {
 inline
 Mat::Mat(const cuda::GpuMat& m)
-    : flags(0), dims(0), rows(0), cols(0), data(0), refcount(0), datastart(0), dataend(0), datalimit(0), allocator(0), size(&rows)
+    : flags(0), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows)
 {
    m.download(*this);
 }

--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -271,7 +271,7 @@ void cv::split(InputArray _m, OutputArrayOfArrays _mv)
        _mv.release();
        return;
    }
-    CV_Assert( !_mv.fixedType() || CV_MAT_TYPE(_mv.flags) == m.depth() );
+    CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );
    _mv.create(m.channels(), 1, m.depth());
    Mat* dst = &_mv.getMatRef(0);
    split(m, dst);

--- a/modules/core/src/matop.cpp
+++ b/modules/core/src/matop.cpp
@@ -1610,7 +1610,7 @@ MatExpr Mat::mul(InputArray m, double scale) const
    MatExpr e;
    if(m.kind() == _InputArray::EXPR)
    {
-        const MatExpr& me = *(const MatExpr*)m.obj;
+        const MatExpr& me = *(const MatExpr*)m.getObj();
        me.op->multiply(MatExpr(*this), me, e, scale);
    }
    else

--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -50,6 +50,7 @@
 #include "opencv2/core/private.hpp"
 #include "opencv2/core/private.cuda.hpp"
+#include "opencv2/core/ocl.hpp"
 #include <assert.h>
 #include <ctype.h>
@@ -105,7 +106,7 @@ extern const uchar g_Saturate8u[];
 #if defined WIN32 || defined _WIN32
 void deleteThreadAllocData();
-void deleteThreadRNGData();
+void deleteThreadData();
 #endif
 template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
@@ -215,6 +216,19 @@ inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize );
+// Per-thread library state. A thread obtains its own instance through
+// TLSData::get(), which allocates it on first use.
+struct TLSData
+{
+    // Sets device to 0 and useOpenCL to -1.
+    TLSData();
+    RNG rng; // random number generator returned by cv::theRNG()
+    int device; // NOTE(review): presumably the selected device index - confirm
+    ocl::Queue oclQueue; // NOTE(review): presumably this thread's OpenCL queue - confirm
+    int useOpenCL; // 1 - use, 0 - do not use, -1 - auto/not initialized
+    // Returns the calling thread's instance, creating it if it does not exist yet.
+    static TLSData* get();
+};
+namespace ocl { MatAllocator* getOpenCLAllocator(); }
 }
 #endif /*_CXCORE_INTERNAL_H_*/
--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -727,85 +727,11 @@ void RNG::fill( InputOutputArray _mat, int disttype,
    }
 }
-#ifdef WIN32
-#ifdef HAVE_WINRT
-// using C++11 thread attribute for local thread data
-__declspec( thread ) RNG* rng = NULL;
- void deleteThreadRNGData()
- {
-    if (rng)
-        delete rng;
 }
-RNG& theRNG()
+cv::RNG& cv::theRNG()
 {
-    if (!rng)
+    return TLSData::get()->rng;
-    {
-        rng =  new RNG;
-    }
-    return *rng;
-}
-#else
-#ifdef WINCE
-#	define TLS_OUT_OF_INDEXES ((DWORD)0xFFFFFFFF)
-#endif
-static DWORD tlsRNGKey = TLS_OUT_OF_INDEXES;
- void deleteThreadRNGData()
- {
-     if( tlsRNGKey != TLS_OUT_OF_INDEXES )
-         delete (RNG*)TlsGetValue( tlsRNGKey );
-}
-RNG& theRNG()
-{
-    if( tlsRNGKey == TLS_OUT_OF_INDEXES )
-    {
-       tlsRNGKey = TlsAlloc();
-       CV_Assert(tlsRNGKey != TLS_OUT_OF_INDEXES);
-    }
-    RNG* rng = (RNG*)TlsGetValue( tlsRNGKey );
-    if( !rng )
-    {
-       rng = new RNG;
-       TlsSetValue( tlsRNGKey, rng );
-    }
-    return *rng;
-}
-#endif //HAVE_WINRT
-#else
-static pthread_key_t tlsRNGKey = 0;
-static pthread_once_t tlsRNGKeyOnce = PTHREAD_ONCE_INIT;
-static void deleteRNG(void* data)
-{
-    delete (RNG*)data;
-}
-static void makeRNGKey()
-{
-    int errcode = pthread_key_create(&tlsRNGKey, deleteRNG);
-    CV_Assert(errcode == 0);
-}
-RNG& theRNG()
-{
-    pthread_once(&tlsRNGKeyOnce, makeRNGKey);
-    RNG* rng = (RNG*)pthread_getspecific(tlsRNGKey);
-    if( !rng )
-    {
-        rng = new RNG;
-        pthread_setspecific(tlsRNGKey, rng);
-    }
-    return *rng;
-}
-#endif
 }
 void cv::randu(InputOutputArray dst, InputArray low, InputArray high)

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -716,7 +716,7 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD  fdwReason, LPVOID )
    if( fdwReason == DLL_THREAD_DETACH || fdwReason == DLL_PROCESS_DETACH )
    {
        cv::deleteThreadAllocData();
-        cv::deleteThreadRNGData();
+        cv::deleteThreadData();
    }
    return TRUE;
 }
@@ -830,4 +830,92 @@ bool Mutex::trylock() { return impl->trylock(); }
 }
+//////////////////////////////// thread-local storage ////////////////////////////////
+namespace cv
+{
+// Initial state: device 0, OpenCL usage not decided yet (-1).
+TLSData::TLSData()
+    : device(0), useOpenCL(-1)
+{
+}
+#ifdef WIN32
+#ifdef HAVE_WINRT
+    // using C++11 thread attribute for local thread data
+    static __declspec( thread ) TLSData* g_tlsdata = NULL;
+    // Frees the calling thread's TLSData. precomp.hpp declares this function
+    // as cv::deleteThreadData() and DllMain calls it on thread/process
+    // detach, so it needs external linkage and exactly this name.
+    void deleteThreadData()
+    {
+        if (g_tlsdata)
+        {
+            delete g_tlsdata;
+            // Reset so that a later TLSData::get() on this thread re-creates
+            // the object instead of returning a dangling pointer.
+            g_tlsdata = NULL;
+        }
+    }
+    // Returns the calling thread's TLSData, allocating it on first use.
+    TLSData* TLSData::get()
+    {
+        if (g_tlsdata == NULL)
+            g_tlsdata = new TLSData;
+        return g_tlsdata;
+    }
+#else
+#ifdef WINCE
+#   define TLS_OUT_OF_INDEXES ((DWORD)0xFFFFFFFF)
+#endif
+    static DWORD tlsKey = TLS_OUT_OF_INDEXES;
+    // Frees the calling thread's TLSData, provided the TLS slot was allocated.
+    void deleteThreadData()
+    {
+        if( tlsKey == TLS_OUT_OF_INDEXES )
+            return;
+        delete (TLSData*)TlsGetValue( tlsKey );
+        // Clear the slot so that a later TLSData::get() on this thread
+        // allocates a fresh object rather than returning the freed pointer.
+        TlsSetValue( tlsKey, NULL );
+    }
+    // Returns the calling thread's TLSData. The TLS slot is allocated on the
+    // first call; the per-thread object is allocated the first time a given
+    // thread asks for it.
+    TLSData* TLSData::get()
+    {
+        if( tlsKey == TLS_OUT_OF_INDEXES )
+        {
+            tlsKey = TlsAlloc();
+            CV_Assert(tlsKey != TLS_OUT_OF_INDEXES);
+        }
+        TLSData* data = (TLSData*)TlsGetValue( tlsKey );
+        if( data )
+            return data;
+        data = new TLSData;
+        TlsSetValue( tlsKey, data );
+        return data;
+    }
+#endif //HAVE_WINRT
+#else
+    static pthread_key_t tlsKey = 0;
+    static pthread_once_t tlsKeyOnce = PTHREAD_ONCE_INIT;
+    // Destructor callback passed to pthread_key_create(): frees one TLSData.
+    static void deleteTLSData(void* data)
+    {
+        TLSData* tls = static_cast<TLSData*>(data);
+        delete tls;
+    }
+    // One-time initializer, run through pthread_once() from TLSData::get():
+    // creates the TLS key and registers deleteTLSData as its destructor.
+    static void makeKey()
+    {
+        int errcode = pthread_key_create(&tlsKey, deleteTLSData);
+        CV_Assert(errcode == 0);
+    }
+    // Returns the calling thread's TLSData. The key is created once via
+    // pthread_once(); the object is allocated the first time a given thread
+    // asks for it and stored under that key.
+    TLSData* TLSData::get()
+    {
+        pthread_once(&tlsKeyOnce, makeKey);
+        TLSData* data = static_cast<TLSData*>(pthread_getspecific(tlsKey));
+        if( data )
+            return data;
+        data = new TLSData;
+        pthread_setspecific(tlsKey, data);
+        return data;
+    }
+#endif
+}
 /* End of file. */
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "test_precomp.hpp"
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include "opencv2/core/ocl.hpp"
+using namespace cv;
+using namespace std;
+// Test fixture for basic UMat functionality.
+class CV_UMatTest : public cvtest::BaseTest
+{
+public:
+    CV_UMatTest() {}
+    ~CV_UMatTest() {}
+protected:
+    void run(int);
+    // Thrown by the check helpers; s holds the failure description.
+    struct test_excep
+    {
+        test_excep(const string& _s=string("")) : s(_s) {}
+        string s;
+    };
+    bool TestUMat();
+    // Throws test_excep(msg) unless the NORM_INF distance between the two
+    // matrices is exactly zero.
+    void checkDiff(const Mat& lhs, const Mat& rhs, const string& msg)
+    {
+        const double dist = norm(lhs, rhs, NORM_INF);
+        if (dist != 0)
+            throw test_excep(msg);
+    }
+    // Throws test_excep(msg) when the NORM_INF distance between the two
+    // matrices exceeds 1e-5.
+    void checkDiffF(const Mat& lhs, const Mat& rhs, const string& msg)
+    {
+        const double dist = norm(lhs, rhs, NORM_INF);
+        if (dist > 1e-5)
+            throw test_excep(msg);
+    }
+};
+// STR(a) goes through STR2 so that its argument is macro-expanded before
+// being stringized; STR(__LINE__) therefore yields the line number as text.
+#define STR(a) STR2(a)
+#define STR2(a) #a
+// Compare two matrices with checkDiff (exact) or checkDiffF (tolerance);
+// the failure message names both expressions and the line of the check.
+#define CHECK_DIFF(a, b) checkDiff(a, b, "(" #a ")  !=  (" #b ")  at l." STR(__LINE__))
+#define CHECK_DIFF_FLT(a, b) checkDiffF(a, b, "(" #a ")  !=(eps)  (" #b ")  at l." STR(__LINE__))
+// Checks Mat <-> UMat copies on a whole matrix and on a ROI, then checks
+// that a modification made through a Mat obtained from UMat::getMat() is
+// reflected in the UMat. Returns false, after logging the failed check and
+// marking the test as FAIL_MISMATCH, when any comparison fails.
+bool CV_UMatTest::TestUMat()
+{
+    try
+    {
+        Mat a(100, 100, CV_16S), b;
+        randu(a, Scalar::all(-100), Scalar::all(100));
+        Rect roi(1, 3, 10, 20);
+        Mat ra(a, roi), rb;
+        UMat ua, ura;
+        // Whole-matrix round trip: Mat -> UMat -> Mat.
+        a.copyTo(ua);
+        ua.copyTo(b);
+        CHECK_DIFF(a, b);
+        // The UMat ROI must hold the same data as the matching Mat ROI.
+        ura = ua(roi);
+        ura.copyTo(rb);
+        CHECK_DIFF(ra, rb);
+        // Apply the same increment to the Mat ROI and, through a Mat
+        // obtained with getMat(ACCESS_RW), to the UMat ROI.
+        ra += Scalar::all(1.f);
+        {
+        Mat temp = ura.getMat(ACCESS_RW);
+        temp += Scalar::all(1.f);
+        }
+        // Read back from the UMat ROI; copying from ra would only compare ra
+        // with itself and could never detect a lost update.
+        ura.copyTo(rb);
+        CHECK_DIFF(ra, rb);
+    }
+    catch (const test_excep& e)
+    {
+        ts->printf(cvtest::TS::LOG, "%s\n", e.s.c_str());
+        ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
+        return false;
+    }
+    return true;
+}
+// Test entry point: prints the OpenCL status, runs TestUMat() and marks the
+// test as OK when it succeeds (TestUMat() records the failure itself).
+void CV_UMatTest::run( int /* start_from */)
+{
+    const char* useStr = ocl::useOpenCL() ? "TRUE" : "FALSE";
+    const char* haveStr = ocl::haveOpenCL() ? "TRUE" : "FALSE";
+    printf("Use OpenCL: %s\nHave OpenCL: %s\n", useStr, haveStr);
+    if (TestUMat())
+        ts->set_failed_test_info(cvtest::TS::OK);
+}
+// Registers the test case: constructs CV_UMatTest and runs it via safe_run().
+TEST(Core_UMat, base) { CV_UMatTest test; test.safe_run(); }
--- a/modules/imgproc/perf/perf_histogram.cpp
+++ b/modules/imgproc/perf/perf_histogram.cpp
@@ -9,7 +9,7 @@ using std::tr1::get;
 typedef tr1::tuple<Size, MatType> Size_Source_t;
 typedef TestBaseWithParam<Size_Source_t> Size_Source;
-typedef TestBaseWithParam<Size> MatSize;
+typedef TestBaseWithParam<Size> TestMatSize;
 static const float rangeHight = 256.0f;
 static const float rangeLow = 0.0f;
@@ -99,6 +99,7 @@ PERF_TEST_P(Size_Source, calcHist3d,
    SANITY_CHECK(hist);
 }
+#define MatSize TestMatSize
 PERF_TEST_P(MatSize, equalizeHist,
            testing::Values(TYPICAL_MAT_SIZES)
            )
@@ -115,6 +116,7 @@ PERF_TEST_P(MatSize, equalizeHist,
    SANITY_CHECK(destination);
 }
+#undef MatSize
 typedef tr1::tuple<Size, double> Sz_ClipLimit_t;
 typedef TestBaseWithParam<Sz_ClipLimit_t> Sz_ClipLimit;

--- a/modules/legacy/src/em.cpp
+++ b/modules/legacy/src/em.cpp
@@ -102,7 +102,8 @@ float
 CvEM::predict( const CvMat* _sample, CvMat* _probs ) const
 {
    Mat prbs0 = cvarrToMat(_probs), prbs = prbs0, sample = cvarrToMat(_sample);
-    int cls = static_cast<int>(emObj.predict(sample, _probs ? _OutputArray(prbs) : cv::noArray())[1]);
+    int cls = static_cast<int>(emObj.predict(sample, _probs ? _OutputArray(prbs) :
+                                             (OutputArray)cv::noArray())[1]);
    if(_probs)
    {
        if( prbs.data != prbs0.data )
@@ -208,13 +209,16 @@ bool CvEM::train( const Mat& _samples, const Mat& _sample_idx,
    bool isOk = false;
    if( _params.start_step == EM::START_AUTO_STEP )
        isOk = emObj.train(_samples,
-                           logLikelihoods, _labels ? _OutputArray(*_labels) : cv::noArray(), probs);
+                           logLikelihoods, _labels ? _OutputArray(*_labels) :
+                           (OutputArray)cv::noArray(), probs);
    else if( _params.start_step == EM::START_E_STEP )
        isOk = emObj.trainE(_samples, means, covshdrs, weights,
-                            logLikelihoods, _labels ? _OutputArray(*_labels) : cv::noArray(), probs);
+                            logLikelihoods, _labels ? _OutputArray(*_labels) :
+                            (OutputArray)cv::noArray(), probs);
    else if( _params.start_step == EM::START_M_STEP )
        isOk = emObj.trainM(_samples, prbs,
-                            logLikelihoods, _labels ? _OutputArray(*_labels) : cv::noArray(), probs);
+                            logLikelihoods, _labels ? _OutputArray(*_labels) :
+                            (OutputArray)cv::noArray(), probs);
    else
        CV_Error(CV_StsBadArg, "Bad start type of EM algorithm");
@@ -230,7 +234,9 @@ bool CvEM::train( const Mat& _samples, const Mat& _sample_idx,
 float
 CvEM::predict( const Mat& _sample, Mat* _probs ) const
 {
-    return static_cast<float>(emObj.predict(_sample, _probs ? _OutputArray(*_probs) : cv::noArray())[1]);
+    return static_cast<float>(emObj.predict(_sample, _probs ?
+                                            _OutputArray(*_probs) :
+                                            (OutputArray)cv::noArray())[1]);
 }
 int CvEM::getNClusters() const

--- a/modules/legacy/src/features2d.cpp
+++ b/modules/legacy/src/features2d.cpp
@@ -82,7 +82,7 @@ cvExtractSURF( const CvArr* _img, const CvArr* _mask,
    surf->set("upright", params.upright != 0);
    surf->set("extended", params.extended != 0);
-    surf->operator()(img, mask, kpt, _descriptors ? _OutputArray(descr) : noArray(),
+    surf->operator()(img, mask, kpt, _descriptors ? _OutputArray(descr) : (OutputArray)noArray(),
                     useProvidedKeyPts != 0);
    if( _keypoints )

--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -154,30 +154,24 @@ void cv::ocl::oclMat::upload(const Mat &m)
 cv::ocl::oclMat::operator cv::_InputArray()
 {
-    _InputArray newInputArray;
+    return _InputArray(cv::_InputArray::OCL_MAT, this);
-    newInputArray.flags = cv::_InputArray::OCL_MAT;
-    newInputArray.obj   = reinterpret_cast<void *>(this);
-    return newInputArray;
 }
 cv::ocl::oclMat::operator cv::_OutputArray()
 {
-    _OutputArray newOutputArray;
+    return _OutputArray(cv::_InputArray::OCL_MAT, this);
-    newOutputArray.flags = cv::_InputArray::OCL_MAT;
-    newOutputArray.obj   = reinterpret_cast<void *>(this);
-    return newOutputArray;
 }
 cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src)
 {
-    CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
+    CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
-    return *reinterpret_cast<oclMat*>(src.obj);
+    return *(oclMat*)src.getObj();
 }
 cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
 {
-    CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
+    CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
-    return *reinterpret_cast<oclMat*>(src.obj);
+    return *(oclMat*)src.getObj();
 }
 void cv::ocl::oclMat::download(cv::Mat &m) const

--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -175,27 +175,27 @@ static PyObject* failmsgp(const char *fmt, ...)
  return 0;
 }
-static size_t REFCOUNT_OFFSET = (size_t)&(((PyObject*)0)->ob_refcnt) +
-    (0x12345678 != *(const size_t*)"\x78\x56\x34\x12\0\0\0\0\0")*sizeof(int);
-static inline PyObject* pyObjectFromRefcount(const int* refcount)
-{
-    return (PyObject*)((size_t)refcount - REFCOUNT_OFFSET);
-}
-static inline int* refcountFromPyObject(const PyObject* obj)
-{
-    return (int*)((size_t)obj + REFCOUNT_OFFSET);
-}
 class NumpyAllocator : public MatAllocator
 {
 public:
-    NumpyAllocator() {}
+    NumpyAllocator() { stdAllocator = Mat::getStdAllocator(); }
    ~NumpyAllocator() {}
-    void allocate(int dims, const int* sizes, int type, int*& refcount,
+    UMatData* allocate(PyObject* o, int dims, const int* sizes, int type, size_t* step) const
-                  uchar*& datastart, uchar*& data, size_t* step)
+    {
+        UMatData* u = new UMatData(this);
+        u->refcount = 1;
+        u->data = u->origdata = (uchar*)PyArray_DATA((PyArrayObject*) o);
+        npy_intp* _strides = PyArray_STRIDES((PyArrayObject*) o);
+        for( int i = 0; i < dims - 1; i++ )
+            step[i] = (size_t)_strides[i];
+        step[dims-1] = CV_ELEM_SIZE(type);
+        u->size = sizes[0]*step[0];
+        u->userdata = o;
+        return u;
+    }
+    UMatData* allocate(int dims0, const int* sizes, int type, size_t* step) const
    {
        PyEnsureGIL gil;
@@ -203,10 +203,10 @@ public:
        int cn = CV_MAT_CN(type);
        const int f = (int)(sizeof(size_t)/8);
        int typenum = depth == CV_8U ? NPY_UBYTE : depth == CV_8S ? NPY_BYTE :
-                      depth == CV_16U ? NPY_USHORT : depth == CV_16S ? NPY_SHORT :
+        depth == CV_16U ? NPY_USHORT : depth == CV_16S ? NPY_SHORT :
-                      depth == CV_32S ? NPY_INT : depth == CV_32F ? NPY_FLOAT :
+        depth == CV_32S ? NPY_INT : depth == CV_32F ? NPY_FLOAT :
-                      depth == CV_64F ? NPY_DOUBLE : f*NPY_ULONGLONG + (f^1)*NPY_UINT;
+        depth == CV_64F ? NPY_DOUBLE : f*NPY_ULONGLONG + (f^1)*NPY_UINT;
-        int i;
+        int i, dims = dims0;
        cv::AutoBuffer<npy_intp> _sizes(dims + 1);
        for( i = 0; i < dims; i++ )
            _sizes[i] = sizes[i];
@@ -215,22 +215,58 @@ public:
        PyObject* o = PyArray_SimpleNew(dims, _sizes, typenum);
        if(!o)
            CV_Error_(Error::StsError, ("The numpy array of typenum=%d, ndims=%d can not be created", typenum, dims));
-        refcount = refcountFromPyObject(o);
+        return allocate(o, dims0, sizes, type, step);
-        npy_intp* _strides = PyArray_STRIDES((PyArrayObject*) o);
-        for( i = 0; i < dims - (cn > 1); i++ )
-            step[i] = (size_t)_strides[i];
-        datastart = data = (uchar*)PyArray_DATA((PyArrayObject*) o);
    }
-    void deallocate(int* refcount, uchar*, uchar*)
+    bool allocate(UMatData* u, int accessFlags) const
    {
-        PyEnsureGIL gil;
+        return stdAllocator->allocate(u, accessFlags);
-        if( !refcount )
+    }
-            return;
-        PyObject* o = pyObjectFromRefcount(refcount);
+    void deallocate(UMatData* u) const
-        Py_INCREF(o);
+    {
-        Py_DECREF(o);
+        if(u)
+        {
+            PyEnsureGIL gil;
+            PyObject* o = (PyObject*)u->userdata;
+            Py_DECREF(o);
+            delete u;
+        }
    }
+    void map(UMatData*, int) const
+    {
+    }
+    void unmap(UMatData* u) const
+    {
+        if(u->urefcount == 0)
+            deallocate(u);
+    }
+    void download(UMatData* u, void* dstptr,
+                  int dims, const size_t sz[],
+                  const size_t srcofs[], const size_t srcstep[],
+                  const size_t dststep[]) const
+    {
+        stdAllocator->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
+    }
+    void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
+                const size_t dstofs[], const size_t dststep[],
+                const size_t srcstep[]) const
+    {
+        stdAllocator->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
+    }
+    void copy(UMatData* usrc, UMatData* udst, int dims, const size_t sz[],
+              const size_t srcofs[], const size_t srcstep[],
+              const size_t dstofs[], const size_t dststep[], bool sync) const
+    {
+        stdAllocator->copy(usrc, udst, dims, sz, srcofs, srcstep, dstofs, dststep, sync);
+    }
+    const MatAllocator* stdAllocator;
 };
 NumpyAllocator g_numpyAllocator;
@@ -400,16 +436,12 @@ static bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo info)
    }
    m = Mat(ndims, size, type, PyArray_DATA(oarr), step);
+    m.u = g_numpyAllocator.allocate(o, ndims, size, type, step);
-    if( m.data )
+    if( !needcopy )
    {
-        m.refcount = refcountFromPyObject(o);
+        Py_INCREF(o);
-        if (!needcopy)
+    }
-        {
-            m.addref(); // protect the original numpy array from deallocation
-                        // (since Mat destructor will decrement the reference counter)
-        }
-    };
    m.allocator = &g_numpyAllocator;
    return true;
@@ -421,14 +453,15 @@ PyObject* pyopencv_from(const Mat& m)
    if( !m.data )
        Py_RETURN_NONE;
    Mat temp, *p = (Mat*)&m;
-    if(!p->refcount || p->allocator != &g_numpyAllocator)
+    if(!p->u || p->allocator != &g_numpyAllocator)
    {
        temp.allocator = &g_numpyAllocator;
        ERRWRAP2(m.copyTo(temp));
        p = &temp;
    }
-    p->addref();
+    PyObject* o = (PyObject*)p->u->userdata;
-    return pyObjectFromRefcount(p->refcount);
+    Py_INCREF(o);
+    return o;
 }
 template<>

--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@@ -163,7 +163,9 @@ namespace
    void Farneback::impl(const Mat& input0, const Mat& input1, OutputArray dst)
    {
-        calcOpticalFlowFarneback(input0, input1, dst, pyrScale_, numLevels_, winSize_, numIters_, polyN_, polySigma_, flags_);
+        calcOpticalFlowFarneback(input0, input1, (InputOutputArray)dst, pyrScale_,
+                                 numLevels_, winSize_, numIters_,
+                                 polyN_, polySigma_, flags_);
    }
 }
@@ -325,7 +327,7 @@ namespace
        alg_->set("iterations", iterations_);
        alg_->set("useInitialFlow", useInitialFlow_);
-        alg_->calc(input0, input1, dst);
+        alg_->calc(input0, input1, (InputOutputArray)dst);
    }
    void DualTVL1::collectGarbage()

--- a/modules/video/src/compat_video.cpp
+++ b/modules/video/src/compat_video.cpp
@@ -352,7 +352,7 @@ cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
    if( error )
        err = cv::Mat(count, 1, CV_32F, (void*)error);
    cv::calcOpticalFlowPyrLK( A, B, ptA, ptB, st,
-                              error ? cv::_OutputArray(err) : cv::noArray(),
+                              error ? cv::_OutputArray(err) : (cv::_OutputArray)cv::noArray(),
                              winSize, level, criteria, flags);
 }

--- a/modules/video/src/optflowgf.cpp
+++ b/modules/video/src/optflowgf.cpp
@@ -564,7 +564,7 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
 }
 void cv::calcOpticalFlowFarneback( InputArray _prev0, InputArray _next0,
-                               OutputArray _flow0, double pyr_scale, int levels, int winsize,
+                               InputOutputArray _flow0, double pyr_scale, int levels, int winsize,
                               int iterations, int poly_n, double poly_sigma, int flags )
 {
    Mat prev0 = _prev0.getMat(), next0 = _next0.getMat();