Commit 0488d9bd authored by Vadim Pisarevsky's avatar Vadim Pisarevsky

optimize out scaleLayer & concatLayer whenever possible

fixed problem in concat layer by disabling memory re-use in layers with multiple inputs

trying to fix the tests when Halide is used to run deep nets

another attempt to fix Halide tests

see if the Halide tests will pass with concat layer fusion turned off

trying to fix failures in halide tests; another try

one more experiment to make halide_concat & halide_enet tests pass

continue attempts to fix halide tests

moving on

uncomment parallel concat layer

seemingly fixed failures in Halide tests and re-enabled concat layer fusion; thanks to dkurt for the patch
parent 431e2e6d
......@@ -152,6 +152,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
class CV_EXPORTS ActivationLayer;
class CV_EXPORTS BatchNormLayer;
class CV_EXPORTS ScaleLayer;
/** @brief This interface class allows building new Layers — the building blocks of networks.
*
......@@ -269,6 +270,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
/**
* @brief Tries to attach the subsequent scaling layer to this layer, i.e. performs layer fusion in this particular case.
* @param[in] layer The subsequent scaling layer.
*
* Returns true if the scaling layer has been attached successfully.
*/
virtual bool setScale(const Ptr<ScaleLayer>& layer);
/**
* @brief Detaches all the layers attached to the particular layer.
*/
virtual void unsetAttached();
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
......@@ -495,9 +509,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
/** @overload */
CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
/** @brief Computes FLOP for whole loaded model with specified input shapes.
* @param netInputShapes vector of shapes for all net inputs.
* @returns computed FLOP.
......@@ -507,10 +522,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const std::vector<MatShape>& netInputShapes) const;
const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const MatShape& netInputShape) const;
const MatShape& netInputShape) const;
/** @brief Returns list of types for layer used in model.
* @param layersTypes output parameter for returning types.
......@@ -557,8 +572,13 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
private:
/** @brief Enables or disables layer fusion in the network.
* @param fusion true to enable the fusion, false to disable. The fusion is enabled by default.
*/
CV_WRAP void enableFusion(bool fusion);
private:
struct Impl;
Ptr<Impl> impl;
};
......
This diff is collapsed.
......@@ -94,6 +94,78 @@ public:
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; // By channels
}
class ChannelConcatInvoker : public ParallelLoopBody
{
public:
std::vector<Mat*>* inputs;
Mat* output;
int nstripes;
std::vector<const float*> chptrs;
static void run(std::vector<Mat*>& inputs, Mat& output, int nstripes)
{
ChannelConcatInvoker cc;
cc.inputs = &inputs;
cc.output = &output;
cc.nstripes = nstripes;
size_t i, ninputs = inputs.size();
int nchannels = 0, batchsz = output.size[0];
for( i = 0; i < ninputs; i++ )
{
Mat& inp = *inputs[i];
CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && output.type() == CV_32F );
cc.chptrs.resize(nchannels*batchsz);
int ofs = 0;
for( i = 0; i < ninputs; i++)
{
Mat& inp = *inputs[i];
for( int j = 0; j < batchsz; j++ )
for( int k = 0; k < inp.size[1]; k++ )
{
const float* ptr = inp.ptr<float>(j, k);
cc.chptrs[ofs + j*nchannels + k] = ptr;
}
ofs += inp.size[1];
}
parallel_for_(Range(0, nstripes), cc, nstripes);
}
ChannelConcatInvoker() {}
void operator()(const Range& r) const
{
size_t planeSize = (size_t)output->size[2]*output->size[3];
size_t nch = chptrs.size();
size_t total = nch*planeSize;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(total, r.end*stripeSize);
const float** ptrs = (const float**)&chptrs[0];
float* outptr = output->ptr<float>();
size_t blockSize0 = 1 << 16;
for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
{
size_t ch = ofs0/planeSize;
size_t ofs = ofs0 - ch*planeSize;
size_t blockSize = std::min(blockSize0, planeSize - ofs);
memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
ofs0 += blockSize;
}
}
};
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
......@@ -101,14 +173,23 @@ public:
int cAxis = clamp(axis, inputs[0]->dims);
Mat& outMat = outputs[0];
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
if( cAxis == 1 && outMat.dims == 4 )
{
int nstripes = getNumThreads();
ChannelConcatInvoker::run(inputs, outMat, nstripes);
}
else
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
inputs[i]->copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
inputs[i]->copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
}
}
}
......
......@@ -148,6 +148,7 @@ public:
std::vector<float> reluslope;
Ptr<ActivationLayer> activ;
Ptr<BatchNormLayer> bnorm;
Ptr<ScaleLayer> scaleLayer;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
......@@ -202,6 +203,9 @@ public:
bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
{
// for now the scale layer followed by the batch norm cannot be fused, only vice versa.
if( !scaleLayer.empty() )
return false;
bnorm = layer;
// we will need to re-compute the weights with the batch
// norm coefficients taken into account
......@@ -209,6 +213,15 @@ public:
return !bnorm.empty();
}
// Attaches a subsequent scaling layer so its per-channel coefficients can
// be folded into the convolution weights. Returns true if the layer was
// accepted for fusion.
bool setScale(const Ptr<ScaleLayer>& layer)
{
    scaleLayer = layer;
    // Invalidate the cached weights: they must be rebuilt with the
    // scaling coefficients applied at the next finalization.
    weightsMat.release();
    return !scaleLayer.empty();
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
......@@ -678,32 +691,56 @@ public:
biasvec[k] = biasMat.at<float>(k);
}
if( !bnorm.empty() )
if( !bnorm.empty() || !scaleLayer.empty() )
{
Mat scale, shift;
bnorm->getScaleShift(scale, shift);
Mat scale, shift, scale2, shift2;
const float *scaleptr = 0, *shiftptr = 0;
const float *scaleptr2 = 0, *shiftptr2 = 0;
CV_Assert( scale.isContinuous() && shift.isContinuous() &&
scale.type() == CV_32F && shift.type() == CV_32F &&
scale.total() == (size_t)outCn &&
shift.total() == (size_t)outCn );
if( !bnorm.empty() )
{
bnorm->getScaleShift(scale, shift);
CV_Assert( scale.isContinuous() && shift.isContinuous() &&
scale.type() == CV_32F && shift.type() == CV_32F &&
scale.total() == (size_t)outCn &&
shift.total() == (size_t)outCn );
scaleptr = scale.ptr<float>();
shiftptr = shift.ptr<float>();
}
if( !scaleLayer.empty() )
{
scale2 = scaleLayer->blobs[0];
CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
scale2.total() == (size_t)outCn );
scaleptr2 = scale2.ptr<float>();
if( scaleLayer->hasBias )
{
shift2 = scaleLayer->blobs[1];
CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
shift2.total() == (size_t)outCn );
shiftptr2 = shift2.ptr<float>();
}
}
for( int i = 0; i < outCn; i++ )
{
float s = scale.at<float>(i);
float delta = shift.at<float>(i);
float s1 = scaleptr ? scaleptr[i] : 1.f;
float delta1 = shiftptr ? shiftptr[i] : 0.f;
float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
float* w_i = weightsMat.ptr<float>(i);
int j, wcols = weightsMat.cols;
for( j = 0; j < wcols; j++ )
w_i[j] *= s;
w_i[j] *= (s1*s2);
biasvec[i] = biasvec[i]*s + delta;
biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
}
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
}
reluslope.clear();
if( activ )
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
......
......@@ -517,7 +517,8 @@ TEST_P(Concat, Accuracy)
Net net;
std::vector<int> convLayerIds(numChannels.channels);
std::vector<int> convLayerIds;
convLayerIds.reserve(numChannels.channels);
for (int i = 0, n = numChannels.channels; i < n; ++i)
{
if (!numChannels[i])
......@@ -537,8 +538,9 @@ TEST_P(Concat, Accuracy)
convParam.name = ss.str();
convParam.blobs.push_back(weights);
convLayerIds[i] = net.addLayer(convParam.name, convParam.type, convParam);
net.connect(0, 0, convLayerIds[i], 0);
int layerId = net.addLayer(convParam.name, convParam.type, convParam);
convLayerIds.push_back(layerId);
net.connect(0, 0, layerId, 0);
}
LayerParams concatParam;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment