Merge pull request #16039 from dmatveev:dm/gapi_tutorial_interactive_face_detection

* G-API-NG/Docs: Added a tutorial page on interactive face detection sample - Introduced a "--ser" option to run the pipeline serially for benchmarking purposes - Reorganized sample code to better fit the documentation; - Fixed a couple of issues (mainly typos) in the public headers * G-API-NG/Docs: Reflected meta-less compilation in new G-API tutorial * G-API-NG/Docs: Addressed review comments on Face Analytics Pipeline example

Merge pull request #16039 from dmatveev:dm/gapi_tutorial_interactive_face_detection
* G-API-NG/Docs: Added a tutorial page on interactive face detection sample - Introduced a "--ser" option to run the pipeline serially for benchmarking purposes - Reorganized sample code to better fit the documentation; - Fixed a couple of issues (mainly typos) in the public headers * G-API-NG/Docs: Reflected meta-less compilation in new G-API tutorial * G-API-NG/Docs: Addressed review comments on Face Analytics Pipeline example
c89780df · Dmitry Matveev · Alexander Alekhin · 3fddd3bf · c89780df · c89780df
Commit c89780df authored Dec 09, 2019 by Dmitry Matveev Committed by Alexander Alekhin Dec 09, 2019
5 changed files
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -287,7 +287,7 @@ CALLER_GRAPH           = NO
 GRAPHICAL_HIERARCHY    = YES
 DIRECTORY_GRAPH        = YES
 DOT_IMAGE_FORMAT       = svg
-INTERACTIVE_SVG        = YES
+INTERACTIVE_SVG        = NO
 DOT_PATH               =
 DOTFILE_DIRS           =
 MSCFILE_DIRS           =

--- a/doc/tutorials/gapi/interactive_face_detection/interactive_face_detection.markdown
+++ b/doc/tutorials/gapi/interactive_face_detection/interactive_face_detection.markdown
--- a/doc/tutorials/gapi/table_of_content_gapi.markdown
+++ b/doc/tutorials/gapi/table_of_content_gapi.markdown
@@ -3,6 +3,20 @@
 In this section you will learn about graph-based image processing and
 how G-API module can be used for that.

+- @subpage tutorial_gapi_interactive_face_detection
+
+    *Languages:* C++
+
+    *Compatibility:* \> OpenCV 4.2
+
+    *Author:* Dmitry Matveev
+
+    This tutorial illustrates how to build a hybrid video processing
+    pipeline with G-API where Deep Learning and image processing are
+    combined effectively to maximize the overall throughput. This
+    sample requires Intel® distribution of OpenVINO™ Toolkit version
+    2019R2 or later.
+
 - @subpage tutorial_gapi_anisotropic_segmentation

    *Languages:* C++

--- a/modules/gapi/include/opencv2/gapi/streaming/source.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/source.hpp
@@ -24,12 +24,13 @@ namespace wip {
 * Implement this interface if you want to customize the way data is
 * streamed into GStreamingCompiled.
 *
- * Objects implementing this interface can be passes to
- * GStreamingCompiled via setSource()/cv::gin(). Regular compiled
- * graphs (GCompiled) don't support input objects of this type.
+ * Objects implementing this interface can be passed to
+ * GStreamingCompiled using setSource() with cv::gin(). Regular
+ * compiled graphs (GCompiled) don't support input objects of this
+ * type.
 *
 * Default cv::VideoCapture-based implementation is available, see
- * cv::gapi::GCaptureSource.
+ * cv::gapi::wip::GCaptureSource.
 *
 * @note stream sources are passed to G-API via shared pointers, so
 *  please use ptr() when passing an IStreamSource implementation to

--- a/samples/cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp
+++ b/samples/cpp/tutorial_code/gapi/age_gender_emotion_recognition/age_gender_emotion_recognition.cpp
@@ -30,7 +30,8 @@ const std::string keys =
    "{ emom   |   | IE emotions recognition model IR }"
    "{ emow   |   | IE emotions recognition model weights }"
    "{ emod   |   | IE emotions recognition model device }"
-    "{ pure   |   | When set, no output is displayed. Useful for benchmarking }";
+    "{ pure   |   | When set, no output is displayed. Useful for benchmarking }"
+    "{ ser    |   | Run serially (no pipelining involved). Useful for benchmarking }";

 struct Avg {
    struct Elapsed {
@@ -73,6 +74,7 @@ namespace custom {
 // executed. The _how_ is defined at graph compilation stage (via parameters),
 // not on the graph construction stage.

+//! [G_API_NET]
 // Face detector: takes one Mat, returns another Mat
 G_API_NET(Faces, <cv::GMat(cv::GMat)>, "face-detector");

@@ -84,7 +86,9 @@ G_API_NET(AgeGender, <AGInfo(cv::GMat)>,   "age-gender-recoginition");

 // Emotion recognition - takes one Mat, returns another.
 G_API_NET(Emotions, <cv::GMat(cv::GMat)>, "emotions-recognition");
+//! [G_API_NET]

+//! [Postproc]
 // SSD Post-processing function - this is not a network but a kernel.
 // The kernel body is declared separately, this is just an interface.
 // This operation takes two Mats (detections and the source image),
@@ -101,6 +105,7 @@ G_API_OP(PostProc, <cv::GArray<cv::Rect>(cv::GMat, cv::GMat)>, "custom.fd_postpr
    }
 };

+// OpenCV-based implementation of the above kernel.
 GAPI_OCV_KERNEL(OCVPostProc, PostProc) {
    static void run(const cv::Mat &in_ssd_result,
                    const cv::Mat &in_frame,
@@ -124,10 +129,12 @@ GAPI_OCV_KERNEL(OCVPostProc, PostProc) {
            if (image_id < 0.f) {  // indicates end of detections
                break;
            }
-            if (confidence < 0.5f) { // fixme: hard-coded snapshot
+            if (confidence < 0.5f) { // a hard-coded confidence threshold
                continue;
            }

+            // Convert floating-point coordinates to the absolute image
+            // frame coordinates; clip by the source image boundaries.
            cv::Rect rc;
            rc.x      = static_cast<int>(rc_left   * upscale.width);
            rc.y      = static_cast<int>(rc_top    * upscale.height);
@@ -137,6 +144,8 @@ GAPI_OCV_KERNEL(OCVPostProc, PostProc) {
        }
    }
 };
+//! [Postproc]
+
 } // namespace custom

 namespace labels {
@@ -208,9 +217,11 @@ int main(int argc, char *argv[])
    }
    const std::string input = cmd.get<std::string>("input");
    const bool no_show = cmd.get<bool>("pure");
+    const bool be_serial = cmd.get<bool>("ser");

    // Express our processing pipeline. Lambda-based constructor
    // is used to keep all temporary objects in a dedicated scope.
+    //! [GComputation]
    cv::GComputation pp([]() {
            // Declare an empty GMat - the beginning of the pipeline.
            cv::GMat in;
@@ -256,6 +267,7 @@ int main(int argc, char *argv[])
            return cv::GComputation(cv::GIn(in),
                                    cv::GOut(frame, faces, ages, genders, emotions));
        });
+    //! [GComputation]

    // Note: it might be very useful to have dimensions loaded at this point!
    // After our computation is defined, specify how it should be executed.
@@ -269,7 +281,8 @@ int main(int argc, char *argv[])
    //
    // OpenCV DNN backend will have its own parameter structure with settings
    // relevant to OpenCV DNN module. Same applies to other possible inference
-    // backends, like cuDNN, etc (:-))
+    // backends...
+    //! [Param_Cfg]
    auto det_net = cv::gapi::ie::Params<custom::Faces> {
        cmd.get<std::string>("fdm"),   // read cmd args: path to topology IR
        cmd.get<std::string>("fdw"),   // read cmd args: path to weights
@@ -287,40 +300,54 @@ int main(int argc, char *argv[])
        cmd.get<std::string>("emow"),   // read cmd args: path to weights
        cmd.get<std::string>("emod"),   // read cmd args: device specifier
    };
+    //! [Param_Cfg]

+    //! [Compile]
    // Form a kernel package (with a single OpenCV-based implementation of our
-    // post-processing) and a network package (holding our three networks).x
+    // post-processing) and a network package (holding our three networks).
    auto kernels = cv::gapi::kernels<custom::OCVPostProc>();
    auto networks = cv::gapi::networks(det_net, age_net, emo_net);

-    // Compile our pipeline for a specific input image format (TBD - can be relaxed)
-    // and pass our kernels & networks as parameters.
-    // This is the place where G-API learns which networks & kernels we're actually
-    // operating with (the graph description itself known nothing about that).
-    auto cc = pp.compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size(1280,720)},
-                                  cv::compile_args(kernels, networks));
+    // Compile our pipeline and pass our kernels & networks as
+    // parameters.  This is the place where G-API learns which
+    // networks & kernels we're actually operating with (the graph
+    // description itself knows nothing about that).
+    auto cc = pp.compileStreaming(cv::compile_args(kernels, networks));
+    //! [Compile]
+
+    Avg avg;
+    std::size_t frames = 0u;            // Frame counter (not produced by the graph)

    std::cout << "Reading " << input << std::endl;
-    cc.setSource(cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(input));
+    // Duplicate huge portions of the code in if/else branches for the sake of
+    // better documentation snippets
+    if (!be_serial) {
+        //! [Source]
+        auto in_src = cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(input);
+        cc.setSource(cv::gin(in_src));
+        //! [Source]

-    Avg avg;
        avg.start();
+
+        //! [Run]
+        // After data source is specified, start the execution
        cc.start();

-    cv::Mat frame;
-    std::vector<cv::Rect> faces;
-    std::vector<cv::Mat> out_ages;
-    std::vector<cv::Mat> out_genders;
-    std::vector<cv::Mat> out_emotions;
-    std::size_t frames = 0u;
+        // Declare data objects we will be receiving from the pipeline.
+        cv::Mat frame;                      // The captured frame itself
+        std::vector<cv::Rect> faces;        // Array of detected faces
+        std::vector<cv::Mat> out_ages;      // Array of inferred ages (one blob per face)
+        std::vector<cv::Mat> out_genders;   // Array of inferred genders (one blob per face)
+        std::vector<cv::Mat> out_emotions;  // Array of classified emotions (one blob per face)

        // Implement different execution policies depending on the display option
        // for the best performance.
        while (cc.running()) {
            auto out_vector = cv::gout(frame, faces, out_ages, out_genders, out_emotions);
            if (no_show) {
-            // This is purely a video processing. No need to balance with UI rendering.
-            // Use a blocking pull() to obtain data. Break the loop if the stream is over.
+                // This is purely a video processing. No need to balance
+                // with UI rendering.  Use a blocking pull() to obtain
+                // data. Break the loop if the stream is over.
                if (!cc.pull(std::move(out_vector)))
                    break;
            } else if (!cc.try_pull(std::move(out_vector))) {
@@ -329,15 +356,46 @@ int main(int argc, char *argv[])
                if (cv::waitKey(1) >= 0) break;
                else continue;
            }
-        // At this point we have data for sure (obtained in either blocking or non-blocking way).
+            // At this point we have data for sure (obtained in either
+            // blocking or non-blocking way).
            frames++;
            labels::DrawResults(frame, faces, out_ages, out_genders, out_emotions);
            labels::DrawFPS(frame, frames, avg.fps(frames));
            if (!no_show) cv::imshow("Out", frame);
        }
-    cc.stop();
-    std::cout << "Processed " << frames << " frames in " << avg.elapsed() << std::endl;
-
+        //! [Run]
+    } else { // (serial flag)
+        //! [Run_Serial]
+        cv::VideoCapture cap(input);
+        cv::Mat in_frame, frame;            // The captured frame itself
+        std::vector<cv::Rect> faces;        // Array of detected faces
+        std::vector<cv::Mat> out_ages;      // Array of inferred ages (one blob per face)
+        std::vector<cv::Mat> out_genders;   // Array of inferred genders (one blob per face)
+        std::vector<cv::Mat> out_emotions;  // Array of classified emotions (one blob per face)
+
+        while (cap.read(in_frame)) {
+            pp.apply(cv::gin(in_frame),
+                     cv::gout(frame, faces, out_ages, out_genders, out_emotions),
+                     cv::compile_args(kernels, networks));
+            labels::DrawResults(frame, faces, out_ages, out_genders, out_emotions);
+            frames++;
+            if (frames == 1u) {
+                // Start timer only after 1st frame processed -- compilation
+                // happens on-the-fly here
+                avg.start();
+            } else {
+                // Measure & draw FPS for all other frames
+                labels::DrawFPS(frame, frames, avg.fps(frames-1));
+            }
+            if (!no_show) {
+                cv::imshow("Out", frame);
+                if (cv::waitKey(1) >= 0) break;
+            }
+        }
+        //! [Run_Serial]
+    }
+    std::cout << "Processed " << frames << " frames in " << avg.elapsed()
+              << " (" << avg.fps(frames) << " FPS)" << std::endl;
    return 0;
 }
 #else