Commit 86e8a105 authored by Alexander Alekhin

Merge pull request #9090 from vpisarev:dnn_optim_scale_concat

parents a586ef72 0488d9bd
......@@ -152,6 +152,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
class CV_EXPORTS ActivationLayer;
class CV_EXPORTS BatchNormLayer;
class CV_EXPORTS ScaleLayer;
/** @brief This interface class allows to build new Layers - are building blocks of networks.
*
......@@ -269,6 +270,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
/**
* @brief Tries to attach the subsequent scaling layer to this layer, i.e. performs layer fusion in this particular case.
* @param[in] layer The subsequent scaling layer.
*
* Returns true if the scaling layer has been attached successfully.
*/
virtual bool setScale(const Ptr<ScaleLayer>& layer);
/**
* @brief Detaches all the layers attached to the particular layer.
*/
virtual void unsetAttached();
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
......@@ -495,9 +509,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
/** @overload */
CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
/** @brief Computes FLOP for whole loaded model with specified input shapes.
* @param netInputShapes vector of shapes for all net inputs.
* @returns computed FLOP.
......@@ -507,10 +522,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const std::vector<MatShape>& netInputShapes) const;
const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const MatShape& netInputShape) const;
const MatShape& netInputShape) const;
/** @brief Returns list of types for layer used in model.
* @param layersTypes output parameter for returning types.
......@@ -557,8 +572,13 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
private:
/** @brief Enables or disables layer fusion in the network.
* @param fusion true to enable the fusion, false to disable. The fusion is enabled by default.
*/
CV_WRAP void enableFusion(bool fusion);
private:
struct Impl;
Ptr<Impl> impl;
};
......
This diff is collapsed.
......@@ -94,6 +94,78 @@ public:
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; // By channels
}
// Parallel worker that concatenates 4D CV_32F blobs along the channel axis
// (the cAxis == 1 fast path). Instead of copying per-input sub-ranges of the
// output, run() builds one flat table of plane pointers — one entry per
// (sample, output-channel) pair, in output storage order — so operator()
// can service any stripe of the continuous output buffer with plain memcpy.
class ChannelConcatInvoker : public ParallelLoopBody
{
public:
// Non-owning pointers; valid only for the duration of run().
std::vector<Mat*>* inputs;
Mat* output;
// Number of parallel stripes the total copy is split into.
int nstripes;
// chptrs[j*nchannels + k] = source plane for sample j, output channel k;
// size is batchsz*nchannels (see run()).
std::vector<const float*> chptrs;
// Validates the inputs, fills the plane-pointer table and launches the
// parallel copy. All inputs and the output must be continuous CV_32F 4D
// blobs agreeing in every dimension except the channel one, and the input
// channel counts must sum to the output channel count (asserted below).
static void run(std::vector<Mat*>& inputs, Mat& output, int nstripes)
{
ChannelConcatInvoker cc;
cc.inputs = &inputs;
cc.output = &output;
cc.nstripes = nstripes;
size_t i, ninputs = inputs.size();
int nchannels = 0, batchsz = output.size[0];
for( i = 0; i < ninputs; i++ )
{
Mat& inp = *inputs[i];
// Continuity is required because operator() memcpy's whole planes
// straight out of each input's buffer.
CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F );
// One table entry per (sample, channel) plane of the output.
cc.chptrs.resize(nchannels*batchsz);
// ofs = index of the first output channel contributed by input i;
// the table is filled sample-major, channel-minor, matching the
// memory order of the continuous output blob.
int ofs = 0;
for( i = 0; i < ninputs; i++)
{
Mat& inp = *inputs[i];
for( int j = 0; j < batchsz; j++ )
for( int k = 0; k < inp.size[1]; k++ )
{
const float* ptr = inp.ptr<float>(j, k);
cc.chptrs[ofs + j*nchannels + k] = ptr;
}
ofs += inp.size[1];
}
parallel_for_(Range(0, nstripes), cc, nstripes);
}
ChannelConcatInvoker() {}
// Copies this worker's share of the output. The total work is
// chptrs.size() planes of planeSize floats each, split into nstripes
// roughly equal stripes; a stripe may start/end mid-plane, so each
// iteration copies at most the remainder of the current plane, further
// capped at blockSize0 floats.
void operator()(const Range& r) const
{
size_t planeSize = (size_t)output->size[2]*output->size[3];
size_t nch = chptrs.size();
size_t total = nch*planeSize;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(total, r.end*stripeSize);
const float** ptrs = (const float**)&chptrs[0];
float* outptr = output->ptr<float>();
// Cap on floats copied per memcpy (64K elements).
size_t blockSize0 = 1 << 16;
for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
{
// ch = which (sample, channel) plane ofs0 falls in; ofs = offset
// within that plane. Valid because the output is continuous and
// chptrs mirrors its plane order.
size_t ch = ofs0/planeSize;
size_t ofs = ofs0 - ch*planeSize;
size_t blockSize = std::min(blockSize0, planeSize - ofs);
memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
ofs0 += blockSize;
}
}
};
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
......@@ -101,14 +173,23 @@ public:
int cAxis = clamp(axis, inputs[0]->dims);
Mat& outMat = outputs[0];
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
if( cAxis == 1 && outMat.dims == 4 )
{
int nstripes = getNumThreads();
ChannelConcatInvoker::run(inputs, outMat, nstripes);
}
else
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
inputs[i]->copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
inputs[i]->copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
}
}
}
......
......@@ -148,6 +148,7 @@ public:
std::vector<float> reluslope;
Ptr<ActivationLayer> activ;
Ptr<BatchNormLayer> bnorm;
Ptr<ScaleLayer> scaleLayer;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
......@@ -202,6 +203,9 @@ public:
bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
{
// for now the scale layer followed by the batch norm cannot be fused, only vice versa.
if( !scaleLayer.empty() )
return false;
bnorm = layer;
// we will need to re-compute the weights with the batch
// norm coefficients taken into account
......@@ -209,6 +213,15 @@ public:
return !bnorm.empty();
}
// Attaches a subsequent scaling layer for fusion into this convolution.
// Storing the layer invalidates the cached fused weights, which are then
// rebuilt with the scaling coefficients folded in on the next finalize.
// Returns true when a non-empty scaling layer was attached.
bool setScale(const Ptr<ScaleLayer>& layer)
{
    scaleLayer = layer;
    // Drop the precomputed weights so they get regenerated with the
    // scaling coefficients taken into account.
    weightsMat.release();
    const bool attached = !scaleLayer.empty();
    return attached;
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
......@@ -678,32 +691,56 @@ public:
biasvec[k] = biasMat.at<float>(k);
}
if( !bnorm.empty() )
if( !bnorm.empty() || !scaleLayer.empty() )
{
Mat scale, shift;
bnorm->getScaleShift(scale, shift);
Mat scale, shift, scale2, shift2;
const float *scaleptr = 0, *shiftptr = 0;
const float *scaleptr2 = 0, *shiftptr2 = 0;
CV_Assert( scale.isContinuous() && shift.isContinuous() &&
scale.type() == CV_32F && shift.type() == CV_32F &&
scale.total() == (size_t)outCn &&
shift.total() == (size_t)outCn );
if( !bnorm.empty() )
{
bnorm->getScaleShift(scale, shift);
CV_Assert( scale.isContinuous() && shift.isContinuous() &&
scale.type() == CV_32F && shift.type() == CV_32F &&
scale.total() == (size_t)outCn &&
shift.total() == (size_t)outCn );
scaleptr = scale.ptr<float>();
shiftptr = shift.ptr<float>();
}
if( !scaleLayer.empty() )
{
scale2 = scaleLayer->blobs[0];
CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
scale2.total() == (size_t)outCn );
scaleptr2 = scale2.ptr<float>();
if( scaleLayer->hasBias )
{
shift2 = scaleLayer->blobs[1];
CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
shift2.total() == (size_t)outCn );
shiftptr2 = shift2.ptr<float>();
}
}
for( int i = 0; i < outCn; i++ )
{
float s = scale.at<float>(i);
float delta = shift.at<float>(i);
float s1 = scaleptr ? scaleptr[i] : 1.f;
float delta1 = shiftptr ? shiftptr[i] : 0.f;
float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
float* w_i = weightsMat.ptr<float>(i);
int j, wcols = weightsMat.cols;
for( j = 0; j < wcols; j++ )
w_i[j] *= s;
w_i[j] *= (s1*s2);
biasvec[i] = biasvec[i]*s + delta;
biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
}
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
}
reluslope.clear();
if( activ )
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
......
......@@ -517,7 +517,8 @@ TEST_P(Concat, Accuracy)
Net net;
std::vector<int> convLayerIds(numChannels.channels);
std::vector<int> convLayerIds;
convLayerIds.reserve(numChannels.channels);
for (int i = 0, n = numChannels.channels; i < n; ++i)
{
if (!numChannels[i])
......@@ -537,8 +538,9 @@ TEST_P(Concat, Accuracy)
convParam.name = ss.str();
convParam.blobs.push_back(weights);
convLayerIds[i] = net.addLayer(convParam.name, convParam.type, convParam);
net.connect(0, 0, convLayerIds[i], 0);
int layerId = net.addLayer(convParam.name, convParam.type, convParam);
convLayerIds.push_back(layerId);
net.connect(0, 0, layerId, 0);
}
LayerParams concatParam;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment