Commit 62ba5d75, authored by Dmitry Kurtaev, committed by Vadim Pisarevsky

Added Halide OpenCL target for deep learning networks (#1246)

parent a4a8b84e
...@@ -69,7 +69,8 @@ namespace dnn //! This namespace is used for dnn module functionlaity. ...@@ -69,7 +69,8 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
*/ */
enum Target enum Target
{ {
DNN_TARGET_CPU DNN_TARGET_CPU,
DNN_TARGET_OPENCL
}; };
/** @brief Initialize dnn module and built-in layers. /** @brief Initialize dnn module and built-in layers.
...@@ -138,6 +139,11 @@ namespace dnn //! This namespace is used for dnn module functionlaity. ...@@ -138,6 +139,11 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
virtual ~BackendWrapper(); //!< Virtual destructor to make polymorphism. virtual ~BackendWrapper(); //!< Virtual destructor to make polymorphism.
/**
* @brief Transfer data to CPU host memory.
*/
virtual void copyToHost() = 0;
int backendId; //!< Backend identifier. int backendId; //!< Backend identifier.
int targetId; //!< Target identifier. int targetId; //!< Target identifier.
}; };
...@@ -220,14 +226,16 @@ namespace dnn //! This namespace is used for dnn module functionlaity. ...@@ -220,14 +226,16 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
* @param[in] node Backend node with Halide functions. * @param[in] node Backend node with Halide functions.
* @param[in] inputs Blobs that will be used in forward invocations. * @param[in] inputs Blobs that will be used in forward invocations.
* @param[in] outputs Blobs that will be used in forward invocations. * @param[in] outputs Blobs that will be used in forward invocations.
* @see BackendNode * @param[in] targetId Target identifier
* @see BackendNode, Target
* *
* Layer don't use own Halide::Func members because we can have applied * Layer don't use own Halide::Func members because we can have applied
* layers fusing. In this way the fused function should be scheduled. * layers fusing. In this way the fused function should be scheduled.
*/ */
virtual void applyHalideScheduler(Ptr<BackendNode>& node, virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs, const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const; const std::vector<Mat> &outputs,
int targetId) const;
/** /**
* @brief Implement layers fusing. * @brief Implement layers fusing.
...@@ -394,6 +402,13 @@ namespace dnn //! This namespace is used for dnn module functionlaity. ...@@ -394,6 +402,13 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
*/ */
void setPreferableBackend(int backendId); void setPreferableBackend(int backendId);
/**
* @brief Ask network to make computations on specific target device.
* @param[in] targetId target identifier.
* @see Target
*/
void setPreferableTarget(int targetId);
/** @brief Sets the new value for the layer output blob /** @brief Sets the new value for the layer output blob
* @param name descriptor of the updating layer output blob. * @param name descriptor of the updating layer output blob.
* @param blob new blob. * @param blob new blob.
......
...@@ -41,114 +41,131 @@ static void loadNet(std::string weights, std::string proto, std::string schedule ...@@ -41,114 +41,131 @@ static void loadNet(std::string weights, std::string proto, std::string schedule
net->setInput(blobFromImage(input, 1.0, false)); net->setInput(blobFromImage(input, 1.0, false));
net->setPreferableBackend(DNN_BACKEND_HALIDE); net->setPreferableBackend(DNN_BACKEND_HALIDE);
net->setPreferableTarget(targetId);
net->setHalideScheduler(scheduler); net->setHalideScheduler(scheduler);
net->forward(outputLayer); net->forward(outputLayer);
} }
////////////////////////////////////////////////////////////////////////////////
// CPU target
////////////////////////////////////////////////////////////////////////////////
PERF_TEST(GoogLeNet, HalidePerfTest) PERF_TEST(GoogLeNet, HalidePerfTest)
{ {
try { Net net;
Net net; loadNet("dnn/bvlc_googlenet2.caffemodel", "dnn/bvlc_googlenet.prototxt",
loadNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", "", 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net);
"", 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net); TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
TEST_CYCLE_N(10)
{
net.forward();
}
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
} }
PERF_TEST(AlexNet, HalidePerfTest) PERF_TEST(AlexNet, HalidePerfTest)
{ {
try { Net net;
Net net; loadNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
loadNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", "dnn/halide_scheduler_alexnet.yml", 227, 227, "prob", "caffe",
"dnn/halide_scheduler_alexnet.yml", 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net);
DNN_TARGET_CPU, &net); TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
TEST_CYCLE_N(10)
{
net.forward();
}
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
} }
PERF_TEST(ResNet50, HalidePerfTest) PERF_TEST(ResNet50, HalidePerfTest)
{ {
try { Net net;
Net net; loadNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
loadNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", "dnn/halide_scheduler_resnet_50.yml", 224, 224, "prob", "caffe",
"dnn/halide_scheduler_resnet_50.yml", 224, 224, "prob", "caffe", DNN_TARGET_CPU, &net);
DNN_TARGET_CPU, &net); TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
TEST_CYCLE_N(10)
{
net.forward();
}
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
} }
PERF_TEST(SqueezeNet_v1_1, HalidePerfTest) PERF_TEST(SqueezeNet_v1_1, HalidePerfTest)
{ {
try { Net net;
Net net; loadNet("dnn/squeezenet_v1_1.caffemodel", "dnn/squeezenet_v1_1.prototxt",
loadNet("dnn/squeezenet_v1_1.caffemodel", "dnn/squeezenet_v1_1.prototxt", "dnn/halide_scheduler_squeezenet_v1_1.yml", 227, 227, "prob",
"dnn/halide_scheduler_squeezenet_v1_1.yml", 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net);
"caffe", DNN_TARGET_CPU, &net); TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
TEST_CYCLE_N(10)
{
net.forward();
}
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
} }
PERF_TEST(Inception_5h, HalidePerfTest) PERF_TEST(Inception_5h, HalidePerfTest)
{ {
try { Net net;
Net net; loadNet("dnn/tensorflow_inception_graph.pb", "",
loadNet("dnn/tensorflow_inception_graph.pb", "", "dnn/halide_scheduler_inception_5h.yml",
"dnn/halide_scheduler_inception_5h.yml", 224, 224, "softmax2", "tensorflow", DNN_TARGET_CPU, &net);
224, 224, "softmax2", "tensorflow", DNN_TARGET_CPU, &net); TEST_CYCLE() net.forward("softmax2");
SANITY_CHECK_NOTHING();
TEST_CYCLE_N(10)
{
net.forward("softmax2");
}
SANITY_CHECK_NOTHING();
} catch (SkipTestException& e) {
throw PerfSkipTestException();
}
} }
PERF_TEST(ENet, HalidePerfTest) PERF_TEST(ENet, HalidePerfTest)
{ {
try { Net net;
Net net; loadNet("dnn/Enet-model-best.net", "", "dnn/halide_scheduler_enet.yml",
loadNet("dnn/Enet-model-best.net", "", "dnn/halide_scheduler_enet.yml", 512, 256, "l367_Deconvolution", "torch", DNN_TARGET_CPU, &net);
512, 256, "l367_Deconvolution", "torch", DNN_TARGET_CPU, &net); TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
TEST_CYCLE_N(10) }
{ ////////////////////////////////////////////////////////////////////////////////
net.forward("l367_Deconvolution"); // OpenCL target
} ////////////////////////////////////////////////////////////////////////////////
SANITY_CHECK_NOTHING(); PERF_TEST(GoogLeNet_opencl, HalidePerfTest)
} catch (SkipTestException& e) { {
throw PerfSkipTestException(); Net net;
} loadNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
"", 227, 227, "prob", "caffe", DNN_TARGET_OPENCL, &net);
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
}
// Halide backend, OpenCL target: times Net::forward() on the Caffe AlexNet
// model using the OpenCL-specific scheduling file.
PERF_TEST(AlexNet_opencl, HalidePerfTest)
{
    const std::string weights = "dnn/bvlc_alexnet.caffemodel";
    const std::string proto = "dnn/bvlc_alexnet.prototxt";
    const std::string scheduler = "dnn/halide_scheduler_opencl_alexnet.yml";

    Net net;
    // loadNet() configures the network and already runs one forward pass.
    loadNet(weights, proto, scheduler, 227, 227, "prob", "caffe",
            DNN_TARGET_OPENCL, &net);
    TEST_CYCLE()
    {
        net.forward();
    }
    SANITY_CHECK_NOTHING();
}
// Halide backend, OpenCL target: times Net::forward() on the Caffe ResNet-50
// model using the OpenCL-specific scheduling file.
PERF_TEST(ResNet50_opencl, HalidePerfTest)
{
    const std::string weights = "dnn/ResNet-50-model.caffemodel";
    const std::string proto = "dnn/ResNet-50-deploy.prototxt";
    const std::string scheduler = "dnn/halide_scheduler_opencl_resnet_50.yml";

    Net net;
    // loadNet() configures the network and already runs one forward pass.
    loadNet(weights, proto, scheduler, 224, 224, "prob", "caffe",
            DNN_TARGET_OPENCL, &net);
    TEST_CYCLE()
    {
        net.forward();
    }
    SANITY_CHECK_NOTHING();
}
// Halide backend, OpenCL target: times Net::forward() on the Caffe
// SqueezeNet v1.1 model using the OpenCL-specific scheduling file.
PERF_TEST(SqueezeNet_v1_1_opencl, HalidePerfTest)
{
    const std::string weights = "dnn/squeezenet_v1_1.caffemodel";
    const std::string proto = "dnn/squeezenet_v1_1.prototxt";
    const std::string scheduler = "dnn/halide_scheduler_opencl_squeezenet_v1_1.yml";

    Net net;
    // loadNet() configures the network and already runs one forward pass.
    loadNet(weights, proto, scheduler, 227, 227, "prob", "caffe",
            DNN_TARGET_OPENCL, &net);
    TEST_CYCLE()
    {
        net.forward();
    }
    SANITY_CHECK_NOTHING();
}
// Halide backend, OpenCL target: times Net::forward("softmax2") on the
// TensorFlow Inception-5h graph using the OpenCL-specific scheduling file.
PERF_TEST(Inception_5h_opencl, HalidePerfTest)
{
    const std::string weights = "dnn/tensorflow_inception_graph.pb";
    const std::string proto = "";  // no separate network description is passed
    const std::string scheduler = "dnn/halide_scheduler_opencl_inception_5h.yml";

    Net net;
    // loadNet() configures the network and already runs one forward pass.
    loadNet(weights, proto, scheduler, 224, 224, "softmax2", "tensorflow",
            DNN_TARGET_OPENCL, &net);
    TEST_CYCLE()
    {
        net.forward("softmax2");
    }
    SANITY_CHECK_NOTHING();
}
PERF_TEST(ENet_opencl, HalidePerfTest)
{
Net net;
loadNet("dnn/Enet-model-best.net", "", "dnn/halide_scheduler_opencl_enet.yml",
512, 256, "l367_Deconvolution", "torch", DNN_TARGET_OPENCL, &net);
TEST_CYCLE() net.forward();
SANITY_CHECK_NOTHING();
} }
#endif // HAVE_HALIDE #endif // HAVE_HALIDE
......
...@@ -205,7 +205,7 @@ struct LayerPin ...@@ -205,7 +205,7 @@ struct LayerPin
class BackendWrapManager class BackendWrapManager
{ {
public: public:
Ptr<BackendWrapper> wrap(const Mat& m, int backendId, int targetId = DNN_TARGET_CPU) Ptr<BackendWrapper> wrap(const Mat& m, int backendId, int targetId)
{ {
CV_Assert(backendId != DNN_BACKEND_DEFAULT); CV_Assert(backendId != DNN_BACKEND_DEFAULT);
...@@ -236,7 +236,7 @@ public: ...@@ -236,7 +236,7 @@ public:
} }
std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat*>& mats, std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat*>& mats,
int backendId, int targetId = DNN_TARGET_CPU) int backendId, int targetId)
{ {
const int num = mats.size(); const int num = mats.size();
std::vector<Ptr<BackendWrapper> > dst(num); std::vector<Ptr<BackendWrapper> > dst(num);
...@@ -248,7 +248,7 @@ public: ...@@ -248,7 +248,7 @@ public:
} }
std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat>& mats, std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat>& mats,
int backendId, int targetId = DNN_TARGET_CPU) int backendId, int targetId)
{ {
const int num = mats.size(); const int num = mats.size();
std::vector<Ptr<BackendWrapper> > dst(num); std::vector<Ptr<BackendWrapper> > dst(num);
...@@ -617,6 +617,7 @@ struct Net::Impl ...@@ -617,6 +617,7 @@ struct Net::Impl
lastLayerId = 1; lastLayerId = 1;
netWasAllocated = false; netWasAllocated = false;
preferableBackend = DNN_BACKEND_DEFAULT; preferableBackend = DNN_BACKEND_DEFAULT;
preferableTarget = DNN_TARGET_CPU;
} }
Ptr<DataLayer> netInputLayer; Ptr<DataLayer> netInputLayer;
...@@ -626,6 +627,7 @@ struct Net::Impl ...@@ -626,6 +627,7 @@ struct Net::Impl
std::map<String, int> layerNameToId; std::map<String, int> layerNameToId;
BlobManager blobManager; BlobManager blobManager;
int preferableBackend; int preferableBackend;
int preferableTarget;
String halideConfigFile; String halideConfigFile;
// Backend-specific wrapping manager. // Backend-specific wrapping manager.
BackendWrapManager backendWrapper; BackendWrapManager backendWrapper;
...@@ -652,10 +654,11 @@ struct Net::Impl ...@@ -652,10 +654,11 @@ struct Net::Impl
{ {
// Use automatic scheduling provided by layer. // Use automatic scheduling provided by layer.
layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE], layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
ld.inputBlobs, ld.outputBlobs); ld.inputBlobs, ld.outputBlobs,
preferableTarget);
} }
dnn::compileHalide(ld.outputBlobs, ld.backendNodes[DNN_BACKEND_HALIDE], dnn::compileHalide(ld.outputBlobs, ld.backendNodes[DNN_BACKEND_HALIDE],
DNN_TARGET_CPU); preferableTarget);
} }
} }
} }
...@@ -859,7 +862,10 @@ struct Net::Impl ...@@ -859,7 +862,10 @@ struct Net::Impl
{ {
backendWrapper.reset(); backendWrapper.reset();
if (preferableBackend == DNN_BACKEND_DEFAULT) if (preferableBackend == DNN_BACKEND_DEFAULT)
{
CV_Assert(preferableTarget == DNN_TARGET_CPU);
return; return;
}
// Iterator to current layer. // Iterator to current layer.
MapIdToLayerData::iterator it = layers.begin(); MapIdToLayerData::iterator it = layers.begin();
...@@ -905,7 +911,8 @@ struct Net::Impl ...@@ -905,7 +911,8 @@ struct Net::Impl
// No layers fusion. // No layers fusion.
ldTop.skipFlags[preferableBackend] = false; ldTop.skipFlags[preferableBackend] = false;
std::vector<Ptr<BackendWrapper> > inputs = std::vector<Ptr<BackendWrapper> > inputs =
backendWrapper.wrap(ldTop.inputBlobs, preferableBackend); backendWrapper.wrap(ldTop.inputBlobs, preferableBackend,
preferableTarget);
if (preferableBackend == DNN_BACKEND_HALIDE) if (preferableBackend == DNN_BACKEND_HALIDE)
{ {
ldTop.backendNodes[DNN_BACKEND_HALIDE] = layerTop->initHalide(inputs); ldTop.backendNodes[DNN_BACKEND_HALIDE] = layerTop->initHalide(inputs);
...@@ -1040,7 +1047,7 @@ struct Net::Impl ...@@ -1040,7 +1047,7 @@ struct Net::Impl
else if (!ld.skipFlags[preferableBackend]) else if (!ld.skipFlags[preferableBackend])
{ {
std::vector<Ptr<BackendWrapper> > outputs = std::vector<Ptr<BackendWrapper> > outputs =
backendWrapper.wrap(ld.outputBlobs, preferableBackend); backendWrapper.wrap(ld.outputBlobs, preferableBackend, preferableTarget);
Ptr<BackendNode> node = ld.backendNodes[preferableBackend]; Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
if (preferableBackend == DNN_BACKEND_HALIDE) if (preferableBackend == DNN_BACKEND_HALIDE)
{ {
...@@ -1154,6 +1161,16 @@ struct Net::Impl ...@@ -1154,6 +1161,16 @@ struct Net::Impl
CV_Error(Error::StsOutOfRange, "Layer \"" + ld.name + "\" produce only " + toString(ld.outputBlobs.size()) + CV_Error(Error::StsOutOfRange, "Layer \"" + ld.name + "\" produce only " + toString(ld.outputBlobs.size()) +
" outputs, the #" + toString(pin.oid) + " was requsted"); " outputs, the #" + toString(pin.oid) + " was requsted");
} }
if (preferableBackend != DNN_BACKEND_DEFAULT)
{
// Transfer data to the CPU if required.
backendWrapper.wrap(ld.outputBlobs[pin.oid], preferableBackend,
preferableTarget)->copyToHost();
}
else
{
CV_Assert(preferableTarget == DNN_TARGET_CPU);
}
return ld.outputBlobs[pin.oid]; return ld.outputBlobs[pin.oid];
} }
...@@ -1314,6 +1331,13 @@ void Net::setPreferableBackend(int backendId) ...@@ -1314,6 +1331,13 @@ void Net::setPreferableBackend(int backendId)
impl->preferableBackend = backendId; impl->preferableBackend = backendId;
} }
// Selects the target device for subsequent computations.
// Switching to a different target clears netWasAllocated; passing the
// current target again leaves the flag as it was.
void Net::setPreferableTarget(int targetId)
{
    if (impl->preferableTarget != targetId)
    {
        impl->netWasAllocated = false;
    }
    impl->preferableTarget = targetId;
}
void Net::setInputsNames(const std::vector<String> &inputBlobNames) void Net::setInputsNames(const std::vector<String> &inputBlobNames)
{ {
impl->netInputLayer->setNames(inputBlobNames); impl->netInputLayer->setNames(inputBlobNames);
...@@ -1702,10 +1726,70 @@ Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &) ...@@ -1702,10 +1726,70 @@ Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
} }
void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs, void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const const std::vector<Mat> &outputs, int targetId) const
{ {
CV_Error(Error::StsNotImplemented, "Scheduling of " + type + #ifdef HAVE_HALIDE
" layers is not implemented."); Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
if (targetId == DNN_TARGET_CPU)
{
if (outW == 1 && outH == 1)
{
if (outC + outN == 1)
return;
if (outC > 8)
top.split(c, co, ci, 8)
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
.parallel(tile)
.vectorize(ci, 8);
else
top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
.parallel(tile);
}
else
{
if (outH > 2)
{
top.reorder(x, c, y)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, outW >= 16 ? 16 : outW);
}
}
}
else if (targetId == DNN_TARGET_OPENCL)
{
int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
if (outW == 1 && outH == 1)
{
top.split(c, co, ci, c_split)
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
.gpu_blocks(tile)
.gpu_threads(ci);
}
else
{
int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
.split(c, co, ci, c_split)
.gpu_blocks(xo, yo, co)
.gpu_threads(xi, yi)
.reorder(xi, yi, ci, xo, yo, co)
.vectorize(ci);
}
}
else
CV_Error(Error::StsNotImplemented, "Unknown target identifier");
#endif // HAVE_HALIDE
} }
Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node) Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
......
...@@ -143,6 +143,26 @@ static void applyComputeRoot(const FileNode& directive, Halide::Func& func) ...@@ -143,6 +143,26 @@ static void applyComputeRoot(const FileNode& directive, Halide::Func& func)
func.compute_root(); func.compute_root();
} }
// Handles the "gpu_blocks" scheduling directive: reads each entry of the
// directive as a variable name and calls func.gpu_blocks() on it, in the
// order the entries are listed.
static void applyGpuBlocks(const FileNode& directive, Halide::Func& func)
{
    const int numVars = static_cast<int>(directive.size());
    for (int idx = 0; idx < numVars; ++idx)
    {
        std::string blockVar;
        directive[idx] >> blockVar;
        func.gpu_blocks(Halide::Var(blockVar));
    }
}
// Handles the "gpu_threads" scheduling directive: reads each entry of the
// directive as a variable name and calls func.gpu_threads() on it, in the
// order the entries are listed.
static void applyGpuThreads(const FileNode& directive, Halide::Func& func)
{
    const int numVars = static_cast<int>(directive.size());
    for (int idx = 0; idx < numVars; ++idx)
    {
        std::string threadVar;
        directive[idx] >> threadVar;
        func.gpu_threads(Halide::Var(threadVar));
    }
}
static void apply(const FileNode& directives, Halide::Func& func, static void apply(const FileNode& directives, Halide::Func& func,
std::map<std::string, Halide::Func>& funcsMap, std::map<std::string, Halide::Func>& funcsMap,
const FileNode& params) const FileNode& params)
...@@ -167,6 +187,10 @@ static void apply(const FileNode& directives, Halide::Func& func, ...@@ -167,6 +187,10 @@ static void apply(const FileNode& directives, Halide::Func& func,
applyComputeAt(directive, func, funcsMap); applyComputeAt(directive, func, funcsMap);
else if (directive.name() == "compute_root") else if (directive.name() == "compute_root")
applyComputeRoot(directive, func); applyComputeRoot(directive, func);
else if (directive.name() == "gpu_blocks")
applyGpuBlocks(directive, func);
else if (directive.name() == "gpu_threads")
applyGpuThreads(directive, func);
else else
CV_Error(Error::StsNotImplemented, "Scheduling directive " + CV_Error(Error::StsNotImplemented, "Scheduling directive " +
directive.name() + " is not implemented."); directive.name() + " is not implemented.");
......
...@@ -157,6 +157,8 @@ public: ...@@ -157,6 +157,8 @@ public:
bias(i) = (hasBias ? biasData[i] : 0.0f) - bias(i) = (hasBias ? biasData[i] : 0.0f) -
weights(i) * meanData[i] * varMeanScale; weights(i) * meanData[i] * varMeanScale;
} }
weights.set_host_dirty();
bias.set_host_dirty();
top(x, y, c, n) = input * weights(c) + bias(c); top(x, y, c, n) = input * weights(c) + bias(c);
return top; return top;
} }
......
...@@ -130,29 +130,6 @@ public: ...@@ -130,29 +130,6 @@ public:
#endif // HAVE_HALIDE #endif // HAVE_HALIDE
return Ptr<BackendNode>(); return Ptr<BackendNode>();
} }
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
if (outW == 1 || outH <= 2)
return;
top.reorder(x, c, y)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, outW >= 16 ? 16 : outW);
#endif // HAVE_HALIDE
}
}; };
Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params) Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)
......
...@@ -99,9 +99,15 @@ public: ...@@ -99,9 +99,15 @@ public:
virtual void applyHalideScheduler(Ptr<BackendNode>& node, virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs, const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const const std::vector<Mat> &outputs,
int targetId) const
{ {
#ifdef HAVE_HALIDE #ifdef HAVE_HALIDE
if (targetId != DNN_TARGET_CPU)
{
Layer::applyHalideScheduler(node, inputs, outputs, targetId);
return;
}
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo"), co("co"), ci("ci"); Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo"), co("co"), ci("ci");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs[1]; Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs[1];
Halide::Func& padded_input = node.dynamicCast<HalideBackendNode>()->funcs[0]; Halide::Func& padded_input = node.dynamicCast<HalideBackendNode>()->funcs[0];
......
...@@ -422,7 +422,7 @@ struct ChannelsPReLUFunctor ...@@ -422,7 +422,7 @@ struct ChannelsPReLUFunctor
{ {
Halide::Var x("x"), y("y"), c("c"), n("n"); Halide::Var x("x"), y("y"), c("c"), n("n");
auto weights = wrapToHalideBuffer(scale, {(int)scale.total()}); auto weights = wrapToHalideBuffer(scale, {(int)scale.total()});
top(x, y, c, n) = select(input > 0.0f, input, weights(c) * input); top(x, y, c, n) = select(input >= 0.0f, input, weights(c) * input);
} }
#endif // HAVE_HALIDE #endif // HAVE_HALIDE
......
...@@ -198,29 +198,6 @@ public: ...@@ -198,29 +198,6 @@ public:
return Ptr<BackendNode>(); return Ptr<BackendNode>();
} }
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
if (outW == 1 || outH <= 2)
return;
top.reorder(x, c, y)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, outW >= 16 ? 16 : outW);
#endif // HAVE_HALIDE
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs, virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const const std::vector<MatShape> &outputs) const
{ {
......
...@@ -252,31 +252,6 @@ public: ...@@ -252,31 +252,6 @@ public:
return Ptr<BackendNode>(); return Ptr<BackendNode>();
} }
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), tile("tile");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
if (outC + outN == 1)
return;
if (outC > 8)
top.split(c, co, ci, 8)
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
.parallel(tile)
.vectorize(ci, 8);
else
top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
.parallel(tile);
#endif // HAVE_HALIDE
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs, virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const const std::vector<MatShape> &outputs) const
{ {
......
...@@ -272,9 +272,15 @@ public: ...@@ -272,9 +272,15 @@ public:
virtual void applyHalideScheduler(Ptr<BackendNode>& node, virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs, const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const const std::vector<Mat> &outputs,
int targetId) const
{ {
#ifdef HAVE_HALIDE #ifdef HAVE_HALIDE
if (targetId != DNN_TARGET_CPU)
{
Layer::applyHalideScheduler(node, inputs, outputs, targetId);
return;
}
int outW, outH, outC, outN; int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
......
...@@ -117,26 +117,6 @@ public: ...@@ -117,26 +117,6 @@ public:
#endif // HAVE_HALIDE #endif // HAVE_HALIDE
return Ptr<BackendNode>(); return Ptr<BackendNode>();
} }
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
top.reorder(x, c, y)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, outW >= 16 ? 16 : outW);
#endif // HAVE_HALIDE
}
}; };
Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params) Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params)
......
...@@ -10,6 +10,7 @@ Implementation of padding layer, which adds paddings to input blob. ...@@ -10,6 +10,7 @@ Implementation of padding layer, which adds paddings to input blob.
*/ */
#include "../precomp.hpp" #include "../precomp.hpp"
#include "op_halide.hpp"
#include <vector> #include <vector>
namespace cv namespace cv
...@@ -52,6 +53,12 @@ public: ...@@ -52,6 +53,12 @@ public:
return false; return false;
} }
// Reports whether this layer can run on the given backend: always for the
// default backend, and for Halide only when haveHalide() returns true.
virtual bool supportBackend(int backendId)
{
    if (backendId == DNN_BACKEND_DEFAULT)
        return true;
    return (backendId == DNN_BACKEND_HALIDE) && haveHalide();
}
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{ {
for(int i = 0; i < inputs.size(); i++) for(int i = 0; i < inputs.size(); i++)
...@@ -94,6 +101,23 @@ public: ...@@ -94,6 +101,23 @@ public:
return inputDims > 0 && (int)shape.size() > inputDims ? paddingDim + 1 : paddingDim; return inputDims > 0 && (int)shape.size() > inputDims ? paddingDim + 1 : paddingDim;
} }
// Builds the Halide node for this layer from the first input wrapper.
// Returns a HalideBackendNode wrapping the defined function when HAVE_HALIDE
// is set; otherwise returns an empty Ptr<BackendNode>.
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
// Input extents are queried here but not referenced again in this function.
int inW, inH, inC, inN;
Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
Halide::Var x("x"), y("y"), c("c"), n("n");
// Give the function the layer's name when one is set.
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// Reads outside the input buffer's bounds yield paddingValue.
Halide::Func padded =
Halide::BoundaryConditions::constant_exterior(inputBuffer, paddingValue);
// NOTE(review): coordinates are passed through unshifted; padding, paddingDim
// and index are not referenced here -- confirm this matches forward() when the
// padding is not appended at the end of the padded dimension.
top(x, y, c, n) = padded(x, y, c, n);
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
int paddingDim, padding, inputDims, index; int paddingDim, padding, inputDims, index;
float paddingValue; float paddingValue;
}; };
......
...@@ -388,9 +388,15 @@ public: ...@@ -388,9 +388,15 @@ public:
virtual void applyHalideScheduler(Ptr<BackendNode>& node, virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs, const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const const std::vector<Mat> &outputs,
int targetId) const
{ {
#ifdef HAVE_HALIDE #ifdef HAVE_HALIDE
if (targetId != DNN_TARGET_CPU)
{
Layer::applyHalideScheduler(node, inputs, outputs, targetId);
return;
}
Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co"); xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back(); Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
......
...@@ -187,33 +187,6 @@ public: ...@@ -187,33 +187,6 @@ public:
return Ptr<BackendNode>(); return Ptr<BackendNode>();
} }
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs) const
{
#ifdef HAVE_HALIDE
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
// Most common case when SoftMax is a layer after fully-connected.
// So we just schedule it in the same way.
Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), tile("tile");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
if (outC + outN == 1)
return;
if (outC > 8)
top.split(c, co, ci, 8)
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
.parallel(tile)
.vectorize(ci, 8);
else
top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
.parallel(tile);
#endif // HAVE_HALIDE
}
int64 getFLOPS(const std::vector<MatShape> &inputs, int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const const std::vector<MatShape> &outputs) const
{ {
......
...@@ -7,6 +7,10 @@ ...@@ -7,6 +7,10 @@
#include "op_halide.hpp" #include "op_halide.hpp"
#ifdef HAVE_HALIDE
#include <HalideRuntimeOpenCL.h>
#endif // HAVE_HALIDE
namespace cv namespace cv
{ {
namespace dnn namespace dnn
...@@ -72,7 +76,15 @@ HalideBackendWrapper::HalideBackendWrapper(int targetId, const cv::Mat& m) ...@@ -72,7 +76,15 @@ HalideBackendWrapper::HalideBackendWrapper(int targetId, const cv::Mat& m)
: BackendWrapper(DNN_BACKEND_HALIDE, targetId) : BackendWrapper(DNN_BACKEND_HALIDE, targetId)
{ {
buffer = wrapToHalideBuffer(m); buffer = wrapToHalideBuffer(m);
if (targetId != DNN_TARGET_CPU) if (targetId == DNN_TARGET_CPU)
{
return;
}
else if (targetId == DNN_TARGET_OPENCL)
{
buffer.copy_to_device(halide_opencl_device_interface());
}
else
CV_Error(Error::StsNotImplemented, "Unknown target identifier"); CV_Error(Error::StsNotImplemented, "Unknown target identifier");
} }
...@@ -80,15 +92,32 @@ HalideBackendWrapper::HalideBackendWrapper(const Ptr<BackendWrapper>& base, ...@@ -80,15 +92,32 @@ HalideBackendWrapper::HalideBackendWrapper(const Ptr<BackendWrapper>& base,
const MatShape& shape) const MatShape& shape)
: BackendWrapper(DNN_BACKEND_HALIDE, base->targetId) : BackendWrapper(DNN_BACKEND_HALIDE, base->targetId)
{ {
if (base->targetId != DNN_TARGET_CPU)
CV_Error(Error::StsNotImplemented, "Unknown target identifier");
int w, h, c, n; int w, h, c, n;
getCanonicalSize(shape, &w, &h, &c, &n); getCanonicalSize(shape, &w, &h, &c, &n);
Halide::Buffer<float> baseBuffer = halideBuffer(base); Halide::Buffer<float> baseBuffer = halideBuffer(base);
buffer = Halide::Buffer<float>((float*)baseBuffer.raw_buffer()->host, buffer = Halide::Buffer<float>((float*)baseBuffer.raw_buffer()->host,
{w, h, c, n}); {w, h, c, n});
buffer.set_host_dirty(); // Indicate that data is on CPU. if (baseBuffer.has_device_allocation())
{
buffer.raw_buffer()->device = baseBuffer.raw_buffer()->device;
buffer.raw_buffer()->device_interface = baseBuffer.raw_buffer()->device_interface;
buffer.set_device_dirty();
}
else
{
buffer.set_host_dirty(); // Indicate that data is on CPU.
CV_Assert(targetId == DNN_TARGET_CPU);
}
}
/**
 * @brief Makes the host-side copy of the wrapped buffer current.
 *
 * If the buffer is marked device-dirty, waits for the device and copies its
 * contents to the host. Otherwise does nothing.
 *
 * No assertion is made about the dirty state for non-CPU targets: a buffer
 * that was already copied back, or that the device never wrote, is not
 * device-dirty, and callers may invoke this on every blob fetch.
 * Requiring device_dirty() there turned such calls into failures instead of
 * no-ops.
 */
void HalideBackendWrapper::copyToHost()
{
    if (buffer.device_dirty())
    {
        // Wait for outstanding device work before reading the data back.
        buffer.device_sync();
        buffer.copy_to_host();
    }
}
#endif // HAVE_HALIDE #endif // HAVE_HALIDE
...@@ -144,6 +173,11 @@ void compileHalide(std::vector<Mat> &outputs, Ptr<BackendNode>& node, int target ...@@ -144,6 +173,11 @@ void compileHalide(std::vector<Mat> &outputs, Ptr<BackendNode>& node, int target
Halide::Target target = Halide::get_host_target(); Halide::Target target = Halide::get_host_target();
target.set_feature(Halide::Target::NoAsserts); target.set_feature(Halide::Target::NoAsserts);
if (targetId == DNN_TARGET_OPENCL)
{
target.set_feature(Halide::Target::OpenCL);
}
CV_Assert(target.supported());
top.compile_jit(target); top.compile_jit(target);
#endif // HAVE_HALIDE #endif // HAVE_HALIDE
} }
......
...@@ -57,6 +57,8 @@ namespace dnn ...@@ -57,6 +57,8 @@ namespace dnn
HalideBackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape); HalideBackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape);
virtual void copyToHost();
Halide::Buffer<float> buffer; Halide::Buffer<float> buffer;
}; };
#endif // HAVE_HALIDE #endif // HAVE_HALIDE
......
...@@ -48,6 +48,7 @@ static void test(const std::string& weights, const std::string& proto, ...@@ -48,6 +48,7 @@ static void test(const std::string& weights, const std::string& proto,
netHalide.setInput(blobFromImage(input.clone(), 1.0f, false)); netHalide.setInput(blobFromImage(input.clone(), 1.0f, false));
netHalide.setPreferableBackend(DNN_BACKEND_HALIDE); netHalide.setPreferableBackend(DNN_BACKEND_HALIDE);
netHalide.setPreferableTarget(targetId);
netHalide.setHalideScheduler(scheduler); netHalide.setHalideScheduler(scheduler);
outputHalide = netHalide.forward(outputLayer).clone(); outputHalide = netHalide.forward(outputLayer).clone();
...@@ -62,15 +63,20 @@ static void test(const std::string& weights, const std::string& proto, ...@@ -62,15 +63,20 @@ static void test(const std::string& weights, const std::string& proto,
// Swap backends. // Swap backends.
netHalide.setPreferableBackend(DNN_BACKEND_DEFAULT); netHalide.setPreferableBackend(DNN_BACKEND_DEFAULT);
netHalide.setPreferableTarget(DNN_TARGET_CPU);
outputDefault = netHalide.forward(outputLayer).clone(); outputDefault = netHalide.forward(outputLayer).clone();
netDefault.setPreferableBackend(DNN_BACKEND_HALIDE); netDefault.setPreferableBackend(DNN_BACKEND_HALIDE);
netDefault.setPreferableTarget(targetId);
netDefault.setHalideScheduler(scheduler); netDefault.setHalideScheduler(scheduler);
outputHalide = netDefault.forward(outputLayer).clone(); outputHalide = netDefault.forward(outputLayer).clone();
normAssert(outputDefault, outputHalide); normAssert(outputDefault, outputHalide);
} }
////////////////////////////////////////////////////////////////////////////////
// CPU target
////////////////////////////////////////////////////////////////////////////////
TEST(Reproducibility_GoogLeNet_Halide, Accuracy) TEST(Reproducibility_GoogLeNet_Halide, Accuracy)
{ {
test(findDataFile("dnn/bvlc_googlenet.caffemodel", false), test(findDataFile("dnn/bvlc_googlenet.caffemodel", false),
...@@ -115,6 +121,53 @@ TEST(Reproducibility_ENet_Halide, Accuracy) ...@@ -115,6 +121,53 @@ TEST(Reproducibility_ENet_Halide, Accuracy)
findDataFile("dnn/halide_scheduler_enet.yml", false), findDataFile("dnn/halide_scheduler_enet.yml", false),
512, 512, "l367_Deconvolution", "torch", DNN_TARGET_CPU); 512, 512, "l367_Deconvolution", "torch", DNN_TARGET_CPU);
}; };
////////////////////////////////////////////////////////////////////////////////
// OpenCL target
////////////////////////////////////////////////////////////////////////////////
// GoogLeNet (Caffe model) through the shared test() helper on the OpenCL
// target. No Halide scheduler file is supplied: the scheduler argument is "".
TEST(Reproducibility_GoogLeNet_Halide_opencl, Accuracy)
{
    const std::string weights = findDataFile("dnn/bvlc_googlenet.caffemodel", false);
    const std::string proto = findDataFile("dnn/bvlc_googlenet.prototxt", false);
    test(weights, proto, "", 227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
};
// AlexNet (Caffe model) through the shared test() helper on the OpenCL
// target, using the OpenCL-specific Halide scheduler file.
TEST(Reproducibility_AlexNet_Halide_opencl, Accuracy)
{
    const std::string weights = findDataFile("dnn/bvlc_alexnet.caffemodel", false);
    const std::string proto = findDataFile("dnn/bvlc_alexnet.prototxt", false);
    const std::string scheduler = findDataFile("dnn/halide_scheduler_opencl_alexnet.yml", false);
    test(weights, proto, scheduler, 227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
};
// ResNet-50 (Caffe model) through the shared test() helper on the OpenCL
// target, using the OpenCL-specific Halide scheduler file.
TEST(Reproducibility_ResNet_50_Halide_opencl, Accuracy)
{
    const std::string weights = findDataFile("dnn/ResNet-50-model.caffemodel", false);
    const std::string proto = findDataFile("dnn/ResNet-50-deploy.prototxt", false);
    const std::string scheduler = findDataFile("dnn/halide_scheduler_opencl_resnet_50.yml", false);
    test(weights, proto, scheduler, 224, 224, "prob", "caffe", DNN_TARGET_OPENCL);
};
// SqueezeNet v1.1 (Caffe model) through the shared test() helper on the
// OpenCL target, using the OpenCL-specific Halide scheduler file.
TEST(Reproducibility_SqueezeNet_v1_1_Halide_opencl, Accuracy)
{
    const std::string weights = findDataFile("dnn/squeezenet_v1_1.caffemodel", false);
    const std::string proto = findDataFile("dnn/squeezenet_v1_1.prototxt", false);
    const std::string scheduler = findDataFile("dnn/halide_scheduler_opencl_squeezenet_v1_1.yml", false);
    test(weights, proto, scheduler, 227, 227, "prob", "caffe", DNN_TARGET_OPENCL);
};
// Inception-5h (TensorFlow graph) through the shared test() helper on the
// OpenCL target. The model is a single .pb file, so the proto argument is "".
TEST(Reproducibility_Inception_5h_Halide_opencl, Accuracy)
{
    const std::string weights = findDataFile("dnn/tensorflow_inception_graph.pb", false);
    const std::string scheduler = findDataFile("dnn/halide_scheduler_opencl_inception_5h.yml", false);
    test(weights, "", scheduler, 224, 224, "softmax2", "tensorflow", DNN_TARGET_OPENCL);
};
// ENet (Torch model) through the shared test() helper on the OpenCL target.
// The model is a single .net file, so the proto argument is "".
TEST(Reproducibility_ENet_Halide_opencl, Accuracy)
{
    const std::string weights = findDataFile("dnn/Enet-model-best.net", false);
    const std::string scheduler = findDataFile("dnn/halide_scheduler_opencl_enet.yml", false);
    test(weights, "", scheduler, 512, 512, "l367_Deconvolution", "torch", DNN_TARGET_OPENCL);
};
#endif // HAVE_HALIDE #endif // HAVE_HALIDE
} // namespace cvtest } // namespace cvtest
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment