Commit f071a48e authored by Alexander Alekhin

Merge pull request #10143 from pengli:ocl4dnn

parents a1479cc2 636d6368
......@@ -410,13 +410,13 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
* @param outputName name of the layer whose output is needed
* @details If @p outputName is empty, runs forward pass for the whole network.
*/
CV_WRAP void forward(std::vector<Mat>& outputBlobs, const String& outputName = String());
CV_WRAP void forward(OutputArrayOfArrays outputBlobs, const String& outputName = String());
/** @brief Runs forward pass to compute outputs of layers listed in @p outBlobNames.
* @param outputBlobs contains blobs for the first outputs of the specified layers.
* @param outBlobNames names of the layers whose outputs are needed
*/
CV_WRAP void forward(std::vector<Mat>& outputBlobs,
CV_WRAP void forward(OutputArrayOfArrays outputBlobs,
const std::vector<String>& outBlobNames);
/** @brief Runs forward pass to compute outputs of layers listed in @p outBlobNames.
......
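For orientation, here is a minimal usage sketch of the two overloads declared above. It is hypothetical and not part of the patch: the model files, input image, and layer names ("prob", "conv1") are placeholders.

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

void forward_example()   // hypothetical driver
{
    // Placeholder model and input; any Caffe model works the same way.
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
    net.setInput(cv::dnn::blobFromImage(cv::imread("image.jpg")));

    std::vector<cv::Mat> outs;
    net.forward(outs, "prob");                // all output blobs of the layer "prob"

    std::vector<cv::String> names;
    names.push_back("conv1");                 // placeholder layer names
    names.push_back("prob");
    net.forward(outs, names);                 // first output of each listed layer
}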
......@@ -1638,7 +1638,7 @@ struct Net::Impl
CV_Error(Error::StsOutOfRange, "Layer \"" + ld.name + "\" produces only " + toString(ld.outputBlobs.size()) +
" outputs, the #" + toString(pin.oid) + " was requested");
}
if (preferableBackend != DNN_TARGET_CPU)
if (preferableBackend != DNN_BACKEND_DEFAULT)
{
// Transfer data to CPU if it's required.
ld.outputBlobsWrappers[pin.oid]->copyToHost();
......@@ -1654,10 +1654,35 @@ struct Net::Impl
return ld.outputBlobs[pin.oid];
}
void getBlob(UMat& umat, const LayerPin& pin)
{
CV_TRACE_FUNCTION();
if (!pin.valid())
CV_Error(Error::StsObjectNotFound, "Requested blob not found");
LayerData &ld = layers[pin.lid];
if ((size_t)pin.oid >= ld.outputBlobs.size())
{
CV_Error(Error::StsOutOfRange, "Layer \"" + ld.name + "\" produces only " + toString(ld.outputBlobs.size()) +
" outputs, the #" + toString(pin.oid) + " was requested");
}
if (ld.umat_outputBlobs.size() > 0 && !ld.umat_outputBlobs[pin.oid].empty())
umat = ld.umat_outputBlobs[pin.oid];
else
umat = UMat();
}
Mat getBlob(String outputName)
{
return getBlob(getPinByAlias(outputName));
}
void getBlob(UMat& umat, String outputName)
{
getBlob(umat, getPinByAlias(outputName));
}
};
Net::Net() : impl(new Net::Impl)
......@@ -1735,7 +1760,7 @@ Mat Net::forward(const String& outputName)
return impl->getBlob(layerName);
}
void Net::forward(std::vector<Mat>& outputBlobs, const String& outputName)
void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
{
CV_TRACE_FUNCTION();
......@@ -1751,16 +1776,40 @@ void Net::forward(std::vector<Mat>& outputBlobs, const String& outputName)
LayerPin pin = impl->getPinByAlias(layerName);
LayerData &ld = impl->layers[pin.lid];
if (ld.umat_outputBlobs.size() > 0)
if (outputBlobs.isUMat())
{
for (int i = 0; i < ld.umat_outputBlobs.size(); i++)
ld.umat_outputBlobs[i].copyTo(ld.outputBlobs[i]);
if (ld.umat_outputBlobs.size() > 0)
{
UMat umat;
impl->getBlob(umat, layerName);
outputBlobs.assign(umat);
}
}
else if (outputBlobs.isMat())
{
outputBlobs.assign(impl->getBlob(layerName));
}
else if (outputBlobs.isMatVector())
{
if (ld.umat_outputBlobs.size() > 0)
{
for (int i = 0; i < ld.umat_outputBlobs.size(); i++)
ld.umat_outputBlobs[i].copyTo(ld.outputBlobs[i]);
}
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec = ld.outputBlobs;
}
else if (outputBlobs.isUMatVector())
{
if (ld.umat_outputBlobs.size() > 0)
{
std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
outputvec = ld.umat_outputBlobs;
}
}
outputBlobs = ld.outputBlobs;
}
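Given the dispatch above, a caller that passes UMat containers can keep results in device memory and skip the copyToHost() round trip. A brief sketch, assuming a loaded network on an OpenCL-capable build (so umat_outputBlobs is populated); the layer name "prob" is a placeholder:

#include <opencv2/dnn.hpp>

void forward_umat_example(cv::dnn::Net& net)   // net: a loaded network, as in the earlier sketch
{
    std::vector<cv::UMat> umats;
    net.forward(umats, "prob");    // isUMatVector(): all blobs of "prob", no host copy

    cv::UMat first;
    net.forward(first, "prob");    // isUMat(): just the first output blob
}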
void Net::forward(std::vector<Mat>& outputBlobs,
void Net::forward(OutputArrayOfArrays outputBlobs,
const std::vector<String>& outBlobNames)
{
CV_TRACE_FUNCTION();
......@@ -1768,7 +1817,7 @@ void Net::forward(std::vector<Mat>& outputBlobs,
std::vector<LayerPin> pins;
for (int i = 0; i < outBlobNames.size(); i++)
{
pins.push_back(impl->getPinByAlias(outBlobNames[i]));
}
impl->setUpNet(pins);
......@@ -1777,11 +1826,14 @@ void Net::forward(std::vector<Mat>& outputBlobs,
impl->forwardToLayer(impl->getLayerData(out.lid));
outputBlobs.clear();
std::vector<Mat> matvec;
for (int i = 0; i < pins.size(); i++)
{
outputBlobs.push_back(impl->getBlob(pins[i]));
matvec.push_back(impl->getBlob(pins[i]));
}
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec = matvec;
}
void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
......
......@@ -286,8 +286,13 @@ public:
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
for (size_t i = 0; i < inputs.size(); i++)
{
UMat& srcMat = inputs[i];
UMat& dstMat = outputs[i];
MatShape inshape, outshape;
inshape = shape(outerSize, innerSize);
outshape = shape(outerSize, numOutput);
UMat srcMat, dstMat;
srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
dstMat.setTo(0.0f);
if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat))
......
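The rewritten loop above views each 4-D blob as a 2-D matrix before handing it to the GEMM. A self-contained sketch of that reshape trick, with made-up sizes rather than the layer's actual shapes:

#include <opencv2/core.hpp>

void reshape_example()   // hypothetical; mirrors the srcMat/dstMat setup above
{
    int sz[] = { 2, 8, 4, 4 };                  // made-up (N, C, H, W) blob
    cv::UMat blob(4, sz, CV_32F);

    int outerSize = 2, innerSize = 8 * 4 * 4;   // N rows, C*H*W columns
    int shape2d[] = { outerSize, innerSize };
    cv::UMat m2d = blob.reshape(1, 2, shape2d); // same data, viewed as 2 x 128
}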
......@@ -65,8 +65,6 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
int padded_width, int height,
int width, int ld)
{
ocl::Context ctx = ocl::Context::getDefault();
ocl::Queue queue = ocl::Queue::getDefault();
ocl::Image2D image;
if (!is_matrix_a && transpose)
......@@ -192,9 +190,6 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
// just padding one line is enough as the sub group block read
// will clamp to the edge according to the spec.
ocl::Context ctx = ocl::Context::getDefault();
ocl::Queue queue = ocl::Queue::getDefault();
ocl::Image2D ImA;
ocl::Image2D ImB;
......@@ -446,7 +441,6 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
const int32_t offx, const float beta, UMat y,
const int32_t offy)
{
ocl::Queue queue = ocl::Queue::getDefault();
bool ret = false;
if (TransA == CblasNoTrans)
......@@ -507,8 +501,6 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
const UMat X, const int32_t offX, UMat Y,
const int32_t offY)
{
ocl::Context ctx = ocl::Context::getDefault();
ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc);
if (oclk_axpy.empty())
return false;
......
......@@ -198,8 +198,6 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
addDef("as_Dtype2", "as_float2");
addDef("as_Dtype4", "as_float4");
addDef("as_Dtype8", "as_float8");
addDef("Dtype_ID", (int)CV_32F);
addDef("Dtype_SIZE", (int)sizeof(Dtype));
}
typedef enum {
......
......@@ -92,7 +92,6 @@ bool OCL4DNNLRN<Dtype>::Forward(const UMat& bottom, UMat& top)
template<typename Dtype>
bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
{
ocl::Queue queue = ocl::Queue::getDefault();
CHECK_EQ(phase_test_, true) << "Only forward inference is supported.";
cl_uint argIdx = 0;
......
......@@ -97,7 +97,6 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
UMat& top_mask)
{
bool ret = true;
ocl::Queue queue = ocl::Queue::getDefault();
size_t global[] = { 128 * 128 };
size_t local[] = { 128 };
cl_uint argIdx = 0;
......
......@@ -83,7 +83,6 @@ template<typename Dtype>
bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
{
bool ret = false;
ocl::Queue queue = ocl::Queue::getDefault();
bool intel_subgroup = ocl::Device::getDefault().intelSubgroupsSupport();
if (intel_subgroup && inner_num_ < 128)
{
......
......@@ -91,7 +91,6 @@
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#if defined(convolve_simd) || defined(Conv_Interleaved)
#if Dtype_SIZE == 4
#define INT_TYPE uint
#define INT_TYPE2 uint2
#define INT_TYPE4 uint4
......@@ -100,9 +99,6 @@
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#else
#error "Unsupported type"
#endif
#endif
#ifdef KERNEL_BASIC
......@@ -186,11 +182,7 @@ __kernel void ConvolveBasic(
#elif defined KERNEL_IDLF
#if TYPE == TYPE_HALF
#define VLOAD4(_v, _p) do { (_v).s0 = *(_p); (_v).s1 = *(_p + 1); (_v).s2 = *(_p + 2); (_v).s3 = *(_p + 3); } while(0)
#else
#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0)
#endif
// Each work-item computes an OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
......