Commit 62ba5d75 authored by Dmitry Kurtaev's avatar Dmitry Kurtaev Committed by Vadim Pisarevsky

Added Halide OpenCL target for deep learning networks (#1246)

parent a4a8b84e
......@@ -69,7 +69,8 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
*/
enum Target
{
DNN_TARGET_CPU
DNN_TARGET_CPU,
DNN_TARGET_OPENCL
};
/** @brief Initialize dnn module and built-in layers.
......@@ -138,6 +139,11 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
virtual ~BackendWrapper(); //!< Virtual destructor to make polymorphism.
/**
* @brief Transfer data to CPU host memory.
*/
virtual void copyToHost() = 0;
int backendId; //!< Backend identifier.
int targetId; //!< Target identifier.
};
......@@ -220,14 +226,16 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
* @param[in] node Backend node with Halide functions.
* @param[in] inputs Blobs that will be used in forward invocations.
* @param[in] outputs Blobs that will be used in forward invocations.
* @see BackendNode
* @param[in] targetId Target identifier
* @see BackendNode, Target
*
* Layers don't use their own Halide::Func members because layer fusing may
* have been applied; in that case it is the fused function that should be scheduled.
*/
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const;
const std::vector<Mat> &outputs,
int targetId) const;
/**
* @brief Implement layers fusing.
......@@ -394,6 +402,13 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
*/
void setPreferableBackend(int backendId);
/**
* @brief Ask the network to perform computations on a specific target device.
* @param[in] targetId target identifier.
* @see Target
*/
void setPreferableTarget(int targetId);
/** @brief Sets the new value for the layer output blob
* @param name descriptor of the updating layer output blob.
* @param blob new blob.
......
......@@ -41,114 +41,131 @@ static void loadNet(std::string weights, std::string proto, std::string schedule
net->setInput(blobFromImage(input, 1.0, false));
net->setPreferableBackend(DNN_BACKEND_HALIDE);
net->setPreferableTarget(targetId);
net->setHalideScheduler(scheduler);
net->forward(outputLayer);
}
////////////////////////////////////////////////////////////////////////////////
// CPU target
////////////////////////////////////////////////////////////////////////////////
PERF_TEST(GoogLeNet, HalidePerfTest)
{
try {
Net net;
loadNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
loadNet("dnn/bvlc_googlenet2.caffemodel", "dnn/bvlc_googlenet.prototxt",
"", 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net);
TEST_CYCLE_N(10)
{
net.forward();
}
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
}
PERF_TEST(AlexNet, HalidePerfTest)
{
try {
Net net;
loadNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
"dnn/halide_scheduler_alexnet.yml", 227, 227, "prob", "caffe",
DNN_TARGET_CPU, &net);
TEST_CYCLE_N(10)
{
net.forward();
}
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
}
PERF_TEST(ResNet50, HalidePerfTest)
{
try {
Net net;
loadNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
"dnn/halide_scheduler_resnet_50.yml", 224, 224, "prob", "caffe",
DNN_TARGET_CPU, &net);
TEST_CYCLE_N(10)
{
net.forward();
}
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
}
PERF_TEST(SqueezeNet_v1_1, HalidePerfTest)
{
try {
Net net;
loadNet("dnn/squeezenet_v1_1.caffemodel", "dnn/squeezenet_v1_1.prototxt",
"dnn/halide_scheduler_squeezenet_v1_1.yml", 227, 227, "prob",
"caffe", DNN_TARGET_CPU, &net);
TEST_CYCLE_N(10)
{
net.forward();
}
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
}
PERF_TEST(Inception_5h, HalidePerfTest)
{
try {
Net net;
loadNet("dnn/tensorflow_inception_graph.pb", "",
"dnn/halide_scheduler_inception_5h.yml",
224, 224, "softmax2", "tensorflow", DNN_TARGET_CPU, &net);
TEST_CYCLE_N(10)
{
net.forward("softmax2");
}
TEST_CYCLE() net.forward("softmax2");
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
}
PERF_TEST(ENet, HalidePerfTest)
{
try {
Net net;
loadNet("dnn/Enet-model-best.net", "", "dnn/halide_scheduler_enet.yml",
512, 256, "l367_Deconvolution", "torch", DNN_TARGET_CPU, &net);
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
}
////////////////////////////////////////////////////////////////////////////////
// OpenCL target
////////////////////////////////////////////////////////////////////////////////
// Perf test: GoogLeNet with DNN_TARGET_OPENCL passed to loadNet().
// loadNet() receives the model paths, the target and a pointer to `net`;
// TEST_CYCLE() wraps the net.forward() call being measured.
// An empty scheduler path is passed here (presumably this selects the
// layers' automatic scheduling -- confirm against setHalideScheduler).
PERF_TEST(GoogLeNet_opencl, HalidePerfTest)
{
Net net;
loadNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
"", 227, 227, "prob", "caffe", DNN_TARGET_OPENCL, &net);
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
}
TEST_CYCLE_N(10)
{
net.forward("l367_Deconvolution");
}
// Perf test: AlexNet with DNN_TARGET_OPENCL passed to loadNet(), using the
// OpenCL-specific scheduler file halide_scheduler_opencl_alexnet.yml.
// TEST_CYCLE() wraps the net.forward() call being measured.
PERF_TEST(AlexNet_opencl, HalidePerfTest)
{
Net net;
loadNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
"dnn/halide_scheduler_opencl_alexnet.yml", 227, 227, "prob", "caffe",
DNN_TARGET_OPENCL, &net);
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
}
// Perf test: ResNet-50 with DNN_TARGET_OPENCL passed to loadNet(), using the
// OpenCL-specific scheduler file halide_scheduler_opencl_resnet_50.yml.
// TEST_CYCLE() wraps the net.forward() call being measured.
PERF_TEST(ResNet50_opencl, HalidePerfTest)
{
Net net;
loadNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
"dnn/halide_scheduler_opencl_resnet_50.yml", 224, 224, "prob", "caffe",
DNN_TARGET_OPENCL, &net);
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
}
// Perf test: SqueezeNet v1.1 with DNN_TARGET_OPENCL passed to loadNet(), using
// the OpenCL-specific scheduler file halide_scheduler_opencl_squeezenet_v1_1.yml.
// TEST_CYCLE() wraps the net.forward() call being measured.
PERF_TEST(SqueezeNet_v1_1_opencl, HalidePerfTest)
{
Net net;
loadNet("dnn/squeezenet_v1_1.caffemodel", "dnn/squeezenet_v1_1.prototxt",
"dnn/halide_scheduler_opencl_squeezenet_v1_1.yml", 227, 227, "prob",
"caffe", DNN_TARGET_OPENCL, &net);
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
}
// Perf test: Inception-5h (TensorFlow graph) with DNN_TARGET_OPENCL passed to
// loadNet(). The proto argument is empty for this model, and the measured
// call requests the "softmax2" output explicitly.
PERF_TEST(Inception_5h_opencl, HalidePerfTest)
{
Net net;
loadNet("dnn/tensorflow_inception_graph.pb", "",
"dnn/halide_scheduler_opencl_inception_5h.yml",
224, 224, "softmax2", "tensorflow", DNN_TARGET_OPENCL, &net);
TEST_CYCLE() net.forward("softmax2");
SANITY_CHECK_NOTHING();
}
PERF_TEST(ENet_opencl, HalidePerfTest)
{
Net net;
loadNet("dnn/Enet-model-best.net", "", "dnn/halide_scheduler_opencl_enet.yml",
512, 256, "l367_Deconvolution", "torch", DNN_TARGET_OPENCL, &net);
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
}
#endif // HAVE_HALIDE
......
......@@ -205,7 +205,7 @@ struct LayerPin
class BackendWrapManager
{
public:
Ptr<BackendWrapper> wrap(const Mat& m, int backendId, int targetId = DNN_TARGET_CPU)
Ptr<BackendWrapper> wrap(const Mat& m, int backendId, int targetId)
{
CV_Assert(backendId != DNN_BACKEND_DEFAULT);
......@@ -236,7 +236,7 @@ public:
}
std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat*>& mats,
int backendId, int targetId = DNN_TARGET_CPU)
int backendId, int targetId)
{
const int num = mats.size();
std::vector<Ptr<BackendWrapper> > dst(num);
......@@ -248,7 +248,7 @@ public:
}
std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat>& mats,
int backendId, int targetId = DNN_TARGET_CPU)
int backendId, int targetId)
{
const int num = mats.size();
std::vector<Ptr<BackendWrapper> > dst(num);
......@@ -617,6 +617,7 @@ struct Net::Impl
lastLayerId = 1;
netWasAllocated = false;
preferableBackend = DNN_BACKEND_DEFAULT;
preferableTarget = DNN_TARGET_CPU;
}
Ptr<DataLayer> netInputLayer;
......@@ -626,6 +627,7 @@ struct Net::Impl
std::map<String, int> layerNameToId;
BlobManager blobManager;
int preferableBackend;
int preferableTarget;
String halideConfigFile;
// Backend-specific wrapping manager.
BackendWrapManager backendWrapper;
......@@ -652,10 +654,11 @@ struct Net::Impl
{
// Use automatic scheduling provided by layer.
layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
ld.inputBlobs, ld.outputBlobs);
ld.inputBlobs, ld.outputBlobs,
preferableTarget);
}
dnn::compileHalide(ld.outputBlobs, ld.backendNodes[DNN_BACKEND_HALIDE],
DNN_TARGET_CPU);
preferableTarget);
}
}
}
......@@ -859,7 +862,10 @@ struct Net::Impl
{
backendWrapper.reset();
if (preferableBackend == DNN_BACKEND_DEFAULT)
{
CV_Assert(preferableTarget == DNN_TARGET_CPU);
return;
}
// Iterator to current layer.
MapIdToLayerData::iterator it = layers.begin();
......@@ -905,7 +911,8 @@ struct Net::Impl
// No layers fusion.
ldTop.skipFlags[preferableBackend] = false;
std::vector<Ptr<BackendWrapper> > inputs =
backendWrapper.wrap(ldTop.inputBlobs, preferableBackend);
backendWrapper.wrap(ldTop.inputBlobs, preferableBackend,
preferableTarget);
if (preferableBackend == DNN_BACKEND_HALIDE)
{
ldTop.backendNodes[DNN_BACKEND_HALIDE] = layerTop->initHalide(inputs);
......@@ -1040,7 +1047,7 @@ struct Net::Impl
else if (!ld.skipFlags[preferableBackend])
{
std::vector<Ptr<BackendWrapper> > outputs =
backendWrapper.wrap(ld.outputBlobs, preferableBackend);
backendWrapper.wrap(ld.outputBlobs, preferableBackend, preferableTarget);
Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
if (preferableBackend == DNN_BACKEND_HALIDE)
{
......@@ -1154,6 +1161,16 @@ struct Net::Impl
CV_Error(Error::StsOutOfRange, "Layer \"" + ld.name + "\" produce only " + toString(ld.outputBlobs.size()) +
" outputs, the #" + toString(pin.oid) + " was requsted");
}
if (preferableBackend != DNN_BACKEND_DEFAULT)
{
// Transfer data to the CPU if required.
backendWrapper.wrap(ld.outputBlobs[pin.oid], preferableBackend,
preferableTarget)->copyToHost();
}
else
{
CV_Assert(preferableTarget == DNN_TARGET_CPU);
}
return ld.outputBlobs[pin.oid];
}
......@@ -1314,6 +1331,13 @@ void Net::setPreferableBackend(int backendId)
impl->preferableBackend = backendId;
}
// Records the target device the network should compute on.
// Choosing a target different from the current one clears the
// netWasAllocated flag (presumably forcing re-allocation on the next
// run -- confirm); re-selecting the current target changes nothing.
void Net::setPreferableTarget(int targetId)
{
    if (impl->preferableTarget != targetId)
    {
        impl->netWasAllocated = false;
        impl->preferableTarget = targetId;
    }
}
void Net::setInputsNames(const std::vector<String> &inputBlobNames)
{
impl->netInputLayer->setNames(inputBlobNames);
......@@ -1702,10 +1726,70 @@ Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
}
void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
const std::vector<Mat> &outputs, int targetId) const
{
CV_Error(Error::StsNotImplemented, "Scheduling of " + type +
" layers is not implemented.");
#ifdef HAVE_HALIDE
Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
if (targetId == DNN_TARGET_CPU)
{
if (outW == 1 && outH == 1)
{
if (outC + outN == 1)
return;
if (outC > 8)
top.split(c, co, ci, 8)
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
.parallel(tile)
.vectorize(ci, 8);
else
top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
.parallel(tile);
}
else
{
if (outH > 2)
{
top.reorder(x, c, y)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, outW >= 16 ? 16 : outW);
}
}
}
else if (targetId == DNN_TARGET_OPENCL)
{
int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
if (outW == 1 && outH == 1)
{
top.split(c, co, ci, c_split)
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
.gpu_blocks(tile)
.gpu_threads(ci);
}
else
{
int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
.split(c, co, ci, c_split)
.gpu_blocks(xo, yo, co)
.gpu_threads(xi, yi)
.reorder(xi, yi, ci, xo, yo, co)
.vectorize(ci);
}
}
else
CV_Error(Error::StsNotImplemented, "Unknown target identifier");
#endif // HAVE_HALIDE
}
Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
......
......@@ -143,6 +143,26 @@ static void applyComputeRoot(const FileNode& directive, Halide::Func& func)
func.compute_root();
}
static void applyGpuBlocks(const FileNode& directive, Halide::Func& func)
{
std::string varName;
for (int i = 0, n = directive.size(); i < n; ++i)
{
directive[i] >> varName;
func.gpu_blocks(Halide::Var(varName));
}
}
static void applyGpuThreads(const FileNode& directive, Halide::Func& func)
{
std::string varName;
for (int i = 0, n = directive.size(); i < n; ++i)
{
directive[i] >> varName;
func.gpu_threads(Halide::Var(varName));
}
}
static void apply(const FileNode& directives, Halide::Func& func,
std::map<std::string, Halide::Func>& funcsMap,
const FileNode& params)
......@@ -167,6 +187,10 @@ static void apply(const FileNode& directives, Halide::Func& func,
applyComputeAt(directive, func, funcsMap);
else if (directive.name() == "compute_root")
applyComputeRoot(directive, func);
else if (directive.name() == "gpu_blocks")
applyGpuBlocks(directive, func);
else if (directive.name() == "gpu_threads")
applyGpuThreads(directive, func);
else
CV_Error(Error::StsNotImplemented, "Scheduling directive " +
directive.name() + " is not implemented.");
......
......@@ -157,6 +157,8 @@ public:
bias(i) = (hasBias ? biasData[i] : 0.0f) -
weights(i) * meanData[i] * varMeanScale;
}
weights.set_host_dirty();
bias.set_host_dirty();
top(x, y, c, n) = input * weights(c) + bias(c);
return top;
}
......
......@@ -130,29 +130,6 @@ public:
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
if (outW == 1 || outH <= 2)
return;
top.reorder(x, c, y)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, outW >= 16 ? 16 : outW);
#endif // HAVE_HALIDE
}
};
Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)
......
......@@ -99,9 +99,15 @@ public:
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
const std::vector<Mat> &outputs,
int targetId) const
{
#ifdef HAVE_HALIDE
if (targetId != DNN_TARGET_CPU)
{
Layer::applyHalideScheduler(node, inputs, outputs, targetId);
return;
}
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo"), co("co"), ci("ci");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs[1];
Halide::Func& padded_input = node.dynamicCast<HalideBackendNode>()->funcs[0];
......
......@@ -422,7 +422,7 @@ struct ChannelsPReLUFunctor
{
Halide::Var x("x"), y("y"), c("c"), n("n");
auto weights = wrapToHalideBuffer(scale, {(int)scale.total()});
top(x, y, c, n) = select(input > 0.0f, input, weights(c) * input);
top(x, y, c, n) = select(input >= 0.0f, input, weights(c) * input);
}
#endif // HAVE_HALIDE
......
......@@ -198,29 +198,6 @@ public:
return Ptr<BackendNode>();
}
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
if (outW == 1 || outH <= 2)
return;
top.reorder(x, c, y)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, outW >= 16 ? 16 : outW);
#endif // HAVE_HALIDE
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const
{
......
......@@ -252,31 +252,6 @@ public:
return Ptr<BackendNode>();
}
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), tile("tile");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
if (outC + outN == 1)
return;
if (outC > 8)
top.split(c, co, ci, 8)
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
.parallel(tile)
.vectorize(ci, 8);
else
top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
.parallel(tile);
#endif // HAVE_HALIDE
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const
{
......
......@@ -272,9 +272,15 @@ public:
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
const std::vector<Mat> &outputs,
int targetId) const
{
#ifdef HAVE_HALIDE
if (targetId != DNN_TARGET_CPU)
{
Layer::applyHalideScheduler(node, inputs, outputs, targetId);
return;
}
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
......
......@@ -117,26 +117,6 @@ public:
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
top.reorder(x, c, y)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, outW >= 16 ? 16 : outW);
#endif // HAVE_HALIDE
}
};
Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params)
......
......@@ -10,6 +10,7 @@ Implementation of padding layer, which adds paddings to input blob.
*/
#include "../precomp.hpp"
#include "op_halide.hpp"
#include <vector>
namespace cv
......@@ -52,6 +53,12 @@ public:
return false;
}
// Reports whether this layer can run on the given backend: the default
// backend is always accepted, the Halide backend only when haveHalide()
// returns true.
virtual bool supportBackend(int backendId)
{
    if (backendId == DNN_BACKEND_DEFAULT)
        return true;
    return (backendId == DNN_BACKEND_HALIDE) && haveHalide();
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
for(int i = 0; i < inputs.size(); i++)
......@@ -94,6 +101,23 @@ public:
return inputDims > 0 && (int)shape.size() > inputDims ? paddingDim + 1 : paddingDim;
}
// Builds the Halide backend node for this layer.
// With HAVE_HALIDE defined, the first input is wrapped in a constant-exterior
// boundary condition filled with paddingValue, and the output function reads
// that wrapped input at the same (x, y, c, n) coordinates.
// Without HAVE_HALIDE an empty Ptr<BackendNode> is returned.
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
// NOTE(review): inW/inH/inC/inN are filled by getCanonicalSize() but are not
// read again in this block -- confirm whether the call is still needed.
int inW, inH, inC, inN;
Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
Halide::Var x("x"), y("y"), c("c"), n("n");
// Name the function after the layer when a name is available.
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// Reads outside the input buffer's bounds yield paddingValue.
Halide::Func padded =
Halide::BoundaryConditions::constant_exterior(inputBuffer, paddingValue);
// NOTE(review): no coordinate offset is applied here; how the padding amount
// maps to output coordinates is not visible in this block -- confirm.
top(x, y, c, n) = padded(x, y, c, n);
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
int paddingDim, padding, inputDims, index;
float paddingValue;
};
......
......@@ -388,9 +388,15 @@ public:
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
const std::vector<Mat> &outputs,
int targetId) const
{
#ifdef HAVE_HALIDE
if (targetId != DNN_TARGET_CPU)
{
Layer::applyHalideScheduler(node, inputs, outputs, targetId);
return;
}
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
......
......@@ -187,33 +187,6 @@ public:
return Ptr<BackendNode>();
}
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
// Most common case when SoftMax is a layer after fully-connected.
// So we just schedule it in the same way.
Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), tile("tile");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
if (outC + outN == 1)
return;
if (outC > 8)
top.split(c, co, ci, 8)
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
.parallel(tile)
.vectorize(ci, 8);
else
top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
.parallel(tile);
#endif // HAVE_HALIDE
}
int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const
{
......
......@@ -7,6 +7,10 @@
#include "op_halide.hpp"
#ifdef HAVE_HALIDE
#include <HalideRuntimeOpenCL.h>
#endif // HAVE_HALIDE
namespace cv
{
namespace dnn
......@@ -72,7 +76,15 @@ HalideBackendWrapper::HalideBackendWrapper(int targetId, const cv::Mat& m)
: BackendWrapper(DNN_BACKEND_HALIDE, targetId)
{
buffer = wrapToHalideBuffer(m);
if (targetId != DNN_TARGET_CPU)
if (targetId == DNN_TARGET_CPU)
{
return;
}
else if (targetId == DNN_TARGET_OPENCL)
{
buffer.copy_to_device(halide_opencl_device_interface());
}
else
CV_Error(Error::StsNotImplemented, "Unknown target identifier");
}
......@@ -80,15 +92,32 @@ HalideBackendWrapper::HalideBackendWrapper(const Ptr<BackendWrapper>& base,
const MatShape& shape)
: BackendWrapper(DNN_BACKEND_HALIDE, base->targetId)
{
if (base->targetId != DNN_TARGET_CPU)
CV_Error(Error::StsNotImplemented, "Unknown target identifier");
int w, h, c, n;
getCanonicalSize(shape, &w, &h, &c, &n);
Halide::Buffer<float> baseBuffer = halideBuffer(base);
buffer = Halide::Buffer<float>((float*)baseBuffer.raw_buffer()->host,
{w, h, c, n});
if (baseBuffer.has_device_allocation())
{
buffer.raw_buffer()->device = baseBuffer.raw_buffer()->device;
buffer.raw_buffer()->device_interface = baseBuffer.raw_buffer()->device_interface;
buffer.set_device_dirty();
}
else
{
buffer.set_host_dirty(); // Indicate that data is on CPU.
CV_Assert(targetId == DNN_TARGET_CPU);
}
}
// Brings the wrapped buffer's contents back to host memory when the device
// copy is marked dirty; otherwise does nothing beyond the sanity check.
// NOTE(review): for a non-CPU target the assertion requires the device copy
// to be dirty on every call -- confirm that repeated calls on the same
// wrapper are not expected.
void HalideBackendWrapper::copyToHost()
{
    const bool deviceHasNewerData = buffer.device_dirty();
    CV_Assert(targetId == DNN_TARGET_CPU || deviceHasNewerData);
    if (!deviceHasNewerData)
        return;
    buffer.device_sync();
    buffer.copy_to_host();
}
#endif // HAVE_HALIDE
......@@ -144,6 +173,11 @@ void compileHalide(std::vector<Mat> &outputs, Ptr<BackendNode>& node, int target
Halide::Target target = Halide::get_host_target();
target.set_feature(Halide::Target::NoAsserts);
if (targetId == DNN_TARGET_OPENCL)
{
target.set_feature(Halide::Target::OpenCL);
}
CV_Assert(target.supported());
top.compile_jit(target);
#endif // HAVE_HALIDE
}
......
......@@ -57,6 +57,8 @@ namespace dnn
HalideBackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape);
virtual void copyToHost();
Halide::Buffer<float> buffer;
};
#endif // HAVE_HALIDE
......
......@@ -48,6 +48,7 @@ static void test(const std::string& weights, const std::string& proto,
netHalide.setInput(blobFromImage(input.clone(), 1.0f, false));
netHalide.setPreferableBackend(DNN_BACKEND_HALIDE);
netHalide.setPreferableTarget(targetId);
netHalide.setHalideScheduler(scheduler);
outputHalide = netHalide.forward(outputLayer).clone();
......@@ -62,15 +63,20 @@ static void test(const std::string& weights, const std::string& proto,
// Swap backends.
netHalide.setPreferableBackend(DNN_BACKEND_DEFAULT);
netHalide.setPreferableTarget(DNN_TARGET_CPU);
outputDefault = netHalide.forward(outputLayer).clone();
netDefault.setPreferableBackend(DNN_BACKEND_HALIDE);
netDefault.setPreferableTarget(targetId);
netDefault.setHalideScheduler(scheduler);
outputHalide = netDefault.forward(outputLayer).clone();
normAssert(outputDefault, outputHalide);
}
////////////////////////////////////////////////////////////////////////////////
// CPU target
////////////////////////////////////////////////////////////////////////////////
TEST(Reproducibility_GoogLeNet_Halide, Accuracy)
{
test(findDataFile("dnn/bvlc_googlenet.caffemodel", false),
......@@ -115,6 +121,53 @@ TEST(Reproducibility_ENet_Halide, Accuracy)
findDataFile("dnn/halide_scheduler_enet.yml", false),
512, 512, "l367_Deconvolution", "torch", DNN_TARGET_CPU);
};
////////////////////////////////////////////////////////////////////////////////
// OpenCL target
////////////////////////////////////////////////////////////////////////////////
// Accuracy test: runs the shared test() helper for GoogLeNet with
// DNN_TARGET_OPENCL. An empty scheduler path is passed (presumably selecting
// automatic scheduling -- confirm against setHalideScheduler).
TEST(Reproducibility_GoogLeNet_Halide_opencl, Accuracy)
{
test(findDataFile("dnn/bvlc_googlenet.caffemodel", false),
findDataFile("dnn/bvlc_googlenet.prototxt", false),
"", 227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
};
// Accuracy test: runs the shared test() helper for AlexNet with
// DNN_TARGET_OPENCL and the OpenCL-specific scheduler file.
TEST(Reproducibility_AlexNet_Halide_opencl, Accuracy)
{
test(findDataFile("dnn/bvlc_alexnet.caffemodel", false),
findDataFile("dnn/bvlc_alexnet.prototxt", false),
findDataFile("dnn/halide_scheduler_opencl_alexnet.yml", false),
227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
};
// Accuracy test: runs the shared test() helper for ResNet-50 with
// DNN_TARGET_OPENCL and the OpenCL-specific scheduler file.
TEST(Reproducibility_ResNet_50_Halide_opencl, Accuracy)
{
test(findDataFile("dnn/ResNet-50-model.caffemodel", false),
findDataFile("dnn/ResNet-50-deploy.prototxt", false),
findDataFile("dnn/halide_scheduler_opencl_resnet_50.yml", false),
224, 224, "prob", "caffe", DNN_TARGET_OPENCL);
};
// Accuracy test: runs the shared test() helper for SqueezeNet v1.1 with
// DNN_TARGET_OPENCL and the OpenCL-specific scheduler file.
TEST(Reproducibility_SqueezeNet_v1_1_Halide_opencl, Accuracy)
{
test(findDataFile("dnn/squeezenet_v1_1.caffemodel", false),
findDataFile("dnn/squeezenet_v1_1.prototxt", false),
findDataFile("dnn/halide_scheduler_opencl_squeezenet_v1_1.yml", false),
227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
};
// Accuracy test: runs the shared test() helper for the Inception-5h
// TensorFlow graph with DNN_TARGET_OPENCL. The proto argument is empty for
// this model and the compared output layer is "softmax2".
TEST(Reproducibility_Inception_5h_Halide_opencl, Accuracy)
{
test(findDataFile("dnn/tensorflow_inception_graph.pb", false), "",
findDataFile("dnn/halide_scheduler_opencl_inception_5h.yml", false),
224, 224, "softmax2", "tensorflow", DNN_TARGET_OPENCL);
};
// Accuracy test: runs the shared test() helper for the Torch ENet model with
// DNN_TARGET_OPENCL and the OpenCL-specific scheduler file.
// NOTE(review): the size arguments here are 512, 512 while the ENet perf
// tests pass 512, 256 -- confirm the difference is intended.
TEST(Reproducibility_ENet_Halide_opencl, Accuracy)
{
test(findDataFile("dnn/Enet-model-best.net", false), "",
findDataFile("dnn/halide_scheduler_opencl_enet.yml", false),
512, 512, "l367_Deconvolution", "torch", DNN_TARGET_OPENCL);
};
#endif // HAVE_HALIDE
} // namespace cvtest
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment