Commit 40c80392 authored by Alexey Suhov's avatar Alexey Suhov

cleanup

parent adc0d278
@@ -159,26 +159,6 @@ if (ENABLE_GNA)
debug_message(STATUS "gna=" ${GNA})
endif()
if (ENABLE_ROCKHOPER)
set(rh_decoder_version "Rockhopper_1.0.0.682")
set(INCLUDE_RH_DECODER "include(\"\$\{IE_ROOT_DIR\}/share/ie_rh_decoder.cmake\")")
RESOLVE_DEPENDENCY(RH_Decoder
ARCHIVE_UNIFIED "${rh_decoder_version}.zip"
TARGET_PATH "${TEMP}/${rh_decoder_version}"
VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*")
configure_file(
"${IE_MAIN_SOURCE_DIR}/cmake/InitRHDecoder.cmake.in"
"${CMAKE_BINARY_DIR}/share/ie_rh_decoder.cmake"
@ONLY)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}/share)
# for inference engine in-tree build - let's include this finder
include(ie_rh_decoder)
endif()
configure_file(
"${IE_MAIN_SOURCE_DIR}/cmake/share/InferenceEngineConfig.cmake.in"
"${CMAKE_BINARY_DIR}/share/InferenceEngineConfig.cmake"
......
@@ -8,7 +8,6 @@ include (options)
#these options are aimed to optimize build time on development system
ie_option (ENABLE_GNA "GNA support for inference engine" ON)
ie_option (ENABLE_ROCKHOPER "use Rockhopper decoder for converting / output scores" ON)
ie_option (ENABLE_MKL_DNN "MKL-DNN plugin for inference engine" ON)
......
# Copyright (C) 2018-2019 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
if(COMMAND init_rh_decoder)
init_rh_decoder()
ie_add_sample(NAME speech_recognition_offline_demo
SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/main.cpp"
HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/speech_sample.hpp"
INCLUDE_DIRECTORIES "${libRH_Decoder_INCLUDE_DIRS}"
DEPENDENCIES "${libRH_Decoder_LIBRARIES}")
endif()
\ No newline at end of file
# Offline Automatic Speech Recognition C++ Demo
This topic shows how to run the offline speech recognition demo, which demonstrates acoustic model inference and Weighted Finite State Transducer (WFST) language model decoding based on Kaldi\* acoustic neural models, Intel® Rockhopper Trail language models, and speech feature vectors.
## How It Works
The workflow is as follows:
1. The application reads command-line parameters
and loads a Kaldi-trained neural network along with a Kaldi `.ark` speech feature vector file to the Inference Engine plugin.
2. The application performs inference and passes acoustic score vectors to the decoding stage, where the
Intel® Rockhopper Trail decoder translates them into a text transcription (see the sketch below).
3. The application prints the recognized text to the screen.
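The decoding stage (step 2) consumes one acoustic score vector per frame. The following fragment is a minimal sketch of that loop built on the `RhDecoderProcessFrame` and `RhDecoderGetResult` calls used by this demo; `DecodeUtterance` is a hypothetical helper for illustration, and decoder setup, batching, async inference, and error handling are omitted (see `main.cpp` for the full flow):
```cpp
#include <iostream>
#include <vector>
#include "rockhopper_decoder.h"

// Sketch: feed per-frame acoustic scores to an already-initialized decoder
// and print partial results as they stabilize, plus the final result.
void DecodeUtterance(RhDecoderInstanceHandle handle, const float* scores,
                     uint32_t numFrames, uint32_t numScoresPerFrame) {
    RhDecoderInfo info;
    std::vector<char> transcription(1024 * 1024);  // result buffer, as in main.cpp
    for (uint32_t frame = 0; frame < numFrames; ++frame) {
        // One WFST decoding step per frame of acoustic scores.
        RhDecoderProcessFrame(handle, scores, numScoresPerFrame, &info);
        if (info.is_result_stable || frame == numFrames - 1) {
            RhDecoderGetResult(handle, RhDecoderResultType::RH_DECODER_FINAL_RESULT,
                               transcription.data(), transcription.size());
            std::cout << transcription.data() << std::endl;
        }
        scores += numScoresPerFrame;  // advance to the next frame's score vector
    }
}
```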
### Acoustic and Language Model Setup
Pretrained models are available at [Intel® Open Source Technology Center](https://download.01.org/openvinotoolkit/models_contrib/speech/kaldi) and [Intel® OpenVINO™ Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader). This sample uses models from the `librispeech/s5_ext` folder.
To train models from scratch, refer to the shell-script Kaldi training recipe `lspeech_s5_ext_run.sh` and the corresponding documentation `lspeech_s5_ext.md`.
To convert a Kaldi acoustic model into the Intermediate Representation (IR) format accepted by this sample, use the following Model Optimizer command:
```sh
$ python3 mo.py --framework kaldi --input_model lspeech_s5_ext.nnet --counts lspeech_s5_ext.counts --remove_output_softmax
```
The command produces an IR network consisting of `lspeech_s5_ext.xml` and
`lspeech_s5_ext.bin`.
> **NOTE**: Model Optimizer (`mo.py`), Kaldi-trained neural network (`lspeech_s5_ext.nnet`)
and Kaldi class counts file (`lspeech_s5_ext.counts`) must be in your working directory.
### Speech Recognition
Once the IR is created or downloaded, you can use the following command to run
speech recognition on Intel® processors with a GNA coprocessor (or its
emulation library) and the Rockhopper Trail decoder library:
```sh
$ ./speech_recognition_offline_demo -d GNA_AUTO -bs 1 -i test_feat_1_10.ark -m lspeech_s5_ext.xml -hmm rht_language_model/rh.hmm -cl rht_language_model/cl.fst -g rht_language_model/g.fst -labels rht_language_model/labels.bin -amsf 0.08
```
## Sample Output
```
[ INFO ] InferenceEngine:
API version ............ 1.6
Build .................. R3
Description ....... API
[ INFO ] Parsing input parameters
[ INFO ] No extensions provided
[ INFO ] Loading Inference Engine
[ INFO ] Device info:
GNA
GNAPlugin version ......... 1.6
Build ........... GNAPlugin
[ INFO ] Loading network files
[ INFO ] Batch size is 1
[ INFO ] Using scale factor of 4079.14 calculated from first utterance.
[ INFO ] Loading model to the device
[ INFO ] Model loading time 301.864 ms
Utterance 0:
1272-128104-0012 ONLY UNFORTUNATELY HIS OWN WORK NEVER DOES GET GOOD
Total time in Infer (HW and SW): 1522.28 ms
Frames in utterance: 536 frames
Average Infer time per frame: 2.84008 ms
End of Utterance 0
Utterance 1:
174-84280-0011 BUT NOW IT DOESN'T SEEM TO MATTER VERY MUCH
Total time in Infer (HW and SW): 957.779 ms
Frames in utterance: 334 frames
Average Infer time per frame: 2.8676 ms
End of Utterance 1
Utterance 2:
1988-147956-0010 I REMEMBERED WHAT THE CONDUCTOR HAD SAID ABOUT HER EYES
Total time in Infer (HW and SW): 1082.91 ms
Frames in utterance: 384 frames
Average Infer time per frame: 2.82008 ms
End of Utterance 2
Utterance 3:
1988-147956-0026 WE WERE SO DEEP IN THE GRASS THAT WE COULD SEE NOTHING BUT THE BLUE SKY OVER US AND THE GOLD TREE IN FRONT OF US
Total time in Infer (HW and SW): 1963.4 ms
Frames in utterance: 690 frames
Average Infer time per frame: 2.84551 ms
End of Utterance 3
Utterance 4:
2086-149220-0045 FEWER WORDS THAN BEFORE BUT WITH THE SAME MYSTERIOUS MUSIC IN
Total time in Infer (HW and SW): 1283.32 ms
Frames in utterance: 453 frames
Average Infer time per frame: 2.83293 ms
End of Utterance 4
Utterance 5:
2277-149874-0011 HE SEEMED TO BE THINKING OF SOMETHING ELSE
Total time in Infer (HW and SW): 690.602 ms
Frames in utterance: 245 frames
Average Infer time per frame: 2.81878 ms
End of Utterance 5
Utterance 6:
2277-149896-0034 HE RANG AGAIN THIS TIME HARDER STILL NO ANSWER
Total time in Infer (HW and SW): 1128.91 ms
Frames in utterance: 399 frames
Average Infer time per frame: 2.82934 ms
End of Utterance 6
Utterance 7:
2277-149897-0015 IN ABOUT AN HOUR AND THREE QUARTERS THE BOY RETURNED
Total time in Infer (HW and SW): 857.916 ms
Frames in utterance: 302 frames
Average Infer time per frame: 2.84078 ms
End of Utterance 7
Utterance 8:
2412-153948-0005 I WAS DELIGHTED WITH THE COUNTRY AND THE MANNER OF LIFE
Total time in Infer (HW and SW): 897.309 ms
Frames in utterance: 312 frames
Average Infer time per frame: 2.87599 ms
End of Utterance 8
Utterance 9:
3081-166546-0044 HE WAS THE PLAIN FACE DETECTIVE WHO HAD SPOKEN TO GEORGE
Total time in Infer (HW and SW): 1280.3 ms
Frames in utterance: 448 frames
Average Infer time per frame: 2.8578 ms
End of Utterance 9
[ INFO ] Execution successful
```
## Input Preparation
The Speech Recognition Offline Demo application accepts Kaldi binary `.ark` files holding stacked feature frames.
To prepare such files, follow the steps described in `lspeech_s5_ext.md` in the `librispeech/s5_ext` folder of the Model Zoo. The exact byte layout the demo expects is sketched below.
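Each `.ark` entry read by the demo is a named binary float matrix. The following is a minimal sketch of a writer for that layout, mirroring `SaveKaldiArkArray` from `main.cpp`; `WriteArkMatrix` is a hypothetical helper, and error handling is omitted:
```cpp
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

// Layout: "<name>\0BFM \4<rows:uint32>\4<cols:uint32>" followed by
// rows*cols row-major float values.
void WriteArkMatrix(const std::string& fileName, const std::string& name,
                    uint32_t rows, uint32_t cols, const std::vector<float>& data) {
    std::ofstream out(fileName, std::ios::binary);
    out.write(name.c_str(), name.size());
    out.write("\0", 1);       // NUL terminator after the name
    out.write("BFM ", 4);     // binary float matrix marker
    out.write("\4", 1);
    out.write(reinterpret_cast<const char*>(&rows), sizeof(rows));
    out.write("\4", 1);
    out.write(reinterpret_cast<const char*>(&cols), sizeof(cols));
    out.write(reinterpret_cast<const char*>(data.data()),
              rows * cols * sizeof(float));
}
```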
## See Also
* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader)
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "speech_sample.hpp"
#include "rockhopper_decoder.h"
#include <gflags/gflags.h>
#include <functional>
#include <iostream>
#include <memory>
#include <map>
#include <fstream>
#include <random>
#include <string>
#include <vector>
#include <utility>
#include <time.h>
#include <thread>
#include <chrono>
#include <limits>
#include <iomanip>
#include <inference_engine.hpp>
#include <gna/gna_config.hpp>
#include <samples/common.hpp>
#include <samples/slog.hpp>
#include <samples/args_helper.hpp>
#ifndef ALIGN
#define ALIGN(memSize, pad) ((static_cast<int>((memSize) + (pad) - 1) / (pad)) * (pad))
#endif
#define MAX_SCORE_DIFFERENCE 0.0001f
#define MAX_VAL_2B_FEAT 16384
using namespace InferenceEngine;
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
typedef std::chrono::duration<float> fsec;
typedef struct {
uint32_t numScores;
uint32_t numErrors;
float threshold;
float maxError;
float rmsError;
float sumError;
float sumRmsError;
float sumSquaredError;
float maxRelError;
float sumRelError;
float sumSquaredRelError;
} score_error_t;
struct InferRequestStruct {
InferRequest inferRequest;
int frameIndex;
uint32_t numFramesThisBatch;
};
struct RhDecoderInstanceParams {
RhDecoderInstanceHandle handle;
uint8_t* hmm_data;
uint8_t* cl_data;
uint8_t* g_data;
uint8_t* label_data;
};
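// Scans a Kaldi binary .ark file: counts the matrices it contains and, for the
// matrix at index numArrayToFindSize, reports the size of its float data in bytes.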
void GetKaldiArkInfo(const char *fileName,
uint32_t numArrayToFindSize,
uint32_t *ptrNumArrays,
uint32_t *ptrNumMemoryBytes) {
uint32_t numArrays = 0;
uint32_t numMemoryBytes = 0;
std::ifstream in_file(fileName, std::ios::binary);
if (in_file.good()) {
while (!in_file.eof()) {
std::string line;
uint32_t numRows = 0u, numCols = 0u, num_bytes = 0u;
std::getline(in_file, line, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
break;
}
in_file.read(reinterpret_cast<char *>(&numRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char *>(&numCols), sizeof(uint32_t)); // read number of columns
num_bytes = numRows * numCols * sizeof(float);
in_file.seekg(num_bytes, in_file.cur); // read data
if (numArrays == numArrayToFindSize) {
numMemoryBytes += num_bytes;
}
numArrays++;
}
in_file.close();
} else {
fprintf(stderr, "Failed to open %s for reading in GetKaldiArkInfo()!\n", fileName);
exit(-1);
}
if (ptrNumArrays != NULL) *ptrNumArrays = numArrays;
if (ptrNumMemoryBytes != NULL) *ptrNumMemoryBytes = numMemoryBytes;
}
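// Loads the float matrix at arrayIndex from a Kaldi binary .ark file into
// `memory` (which the caller must pre-size) and returns its name and dimensions.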
void LoadKaldiArkArray(const char *fileName, uint32_t arrayIndex, std::string &ptrName, std::vector<uint8_t> &memory,
uint32_t *ptrNumRows, uint32_t *ptrNumColumns, uint32_t *ptrNumBytesPerElement) {
std::ifstream in_file(fileName, std::ios::binary);
if (in_file.good()) {
uint32_t i = 0;
while (i < arrayIndex) {
std::string line;
uint32_t numRows = 0u, numCols = 0u;
std::getline(in_file, line, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
break;
}
in_file.read(reinterpret_cast<char *>(&numRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char *>(&numCols), sizeof(uint32_t)); // read number of columns
in_file.seekg(numRows * numCols * sizeof(float), in_file.cur); // read data
i++;
}
if (!in_file.eof()) {
std::string line;
std::getline(in_file, ptrName, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
fprintf(stderr, "Cannot find array specifier in file %s in LoadKaldiArkArray()!\n", fileName);
exit(-1);
}
in_file.read(reinterpret_cast<char *>(ptrNumRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char *>(ptrNumColumns), sizeof(uint32_t)); // read number of columns
in_file.read(reinterpret_cast<char *>(&memory.front()),
*ptrNumRows * *ptrNumColumns * sizeof(float)); // read array data
}
in_file.close();
} else {
fprintf(stderr, "Failed to open %s for reading in GetKaldiArkInfo()!\n", fileName);
exit(-1);
}
*ptrNumBytesPerElement = sizeof(float);
}
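// Writes (or appends) one float matrix to a Kaldi binary .ark file using the
// same "<name>\0BFM \4<rows>\4<cols><data>" layout the loaders above expect.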
void SaveKaldiArkArray(const char *fileName,
bool shouldAppend,
std::string name,
void *ptrMemory,
uint32_t numRows,
uint32_t numColumns) {
std::ios_base::openmode mode = std::ios::binary;
if (shouldAppend) {
mode |= std::ios::app;
}
std::ofstream out_file(fileName, mode);
if (out_file.good()) {
out_file.write(name.c_str(), name.length()); // write name
out_file.write("\0", 1);
out_file.write("BFM ", 4);
out_file.write("\4", 1);
out_file.write(reinterpret_cast<char *>(&numRows), sizeof(uint32_t));
out_file.write("\4", 1);
out_file.write(reinterpret_cast<char *>(&numColumns), sizeof(uint32_t));
out_file.write(reinterpret_cast<char *>(ptrMemory), numRows * numColumns * sizeof(float));
out_file.close();
} else {
throw std::runtime_error(std::string("Failed to open %s for writing in SaveKaldiArkArray()!\n") + fileName);
}
}
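// Returns targetMax / max(|x|) over the feature buffer, i.e. the multiplier
// that maps the largest absolute input value onto the quantization range.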
float ScaleFactorForQuantization(void *ptrFloatMemory, float targetMax, uint32_t numElements) {
float *ptrFloatFeat = reinterpret_cast<float *>(ptrFloatMemory);
float max = 0.0;
float scaleFactor;
for (uint32_t i = 0; i < numElements; i++) {
if (fabs(ptrFloatFeat[i]) > max) {
max = fabs(ptrFloatFeat[i]);
}
}
if (max == 0) {
scaleFactor = 1.0;
} else {
scaleFactor = targetMax / max;
}
return (scaleFactor);
}
void ClearScoreError(score_error_t *error) {
error->numScores = 0;
error->numErrors = 0;
error->maxError = 0.0;
error->rmsError = 0.0;
error->sumError = 0.0;
error->sumRmsError = 0.0;
error->sumSquaredError = 0.0;
error->maxRelError = 0.0;
error->sumRelError = 0.0;
error->sumSquaredRelError = 0.0;
}
void UpdateScoreError(score_error_t *error, score_error_t *totalError) {
totalError->numErrors += error->numErrors;
totalError->numScores += error->numScores;
totalError->sumRmsError += error->rmsError;
totalError->sumError += error->sumError;
totalError->sumSquaredError += error->sumSquaredError;
if (error->maxError > totalError->maxError) {
totalError->maxError = error->maxError;
}
totalError->sumRelError += error->sumRelError;
totalError->sumSquaredRelError += error->sumSquaredRelError;
if (error->maxRelError > totalError->maxRelError) {
totalError->maxRelError = error->maxRelError;
}
}
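// Compares inferred scores against a reference matrix element by element,
// accumulating absolute and relative error statistics and counting values
// whose absolute error exceeds scoreError->threshold.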
uint32_t CompareScores(float *ptrScoreArray,
void *ptrRefScoreArray,
score_error_t *scoreError,
uint32_t numRows,
uint32_t numColumns) {
uint32_t numErrors = 0;
ClearScoreError(scoreError);
float *A = ptrScoreArray;
float *B = reinterpret_cast<float *>(ptrRefScoreArray);
for (uint32_t i = 0; i < numRows; i++) {
for (uint32_t j = 0; j < numColumns; j++) {
float score = A[i * numColumns + j];
float refscore = B[i * numColumns + j];
float error = fabs(refscore - score);
float rel_error = error / (static_cast<float>(fabs(refscore)) + 1e-20f);
float squared_error = error * error;
float squared_rel_error = rel_error * rel_error;
scoreError->numScores++;
scoreError->sumError += error;
scoreError->sumSquaredError += squared_error;
if (error > scoreError->maxError) {
scoreError->maxError = error;
}
scoreError->sumRelError += rel_error;
scoreError->sumSquaredRelError += squared_rel_error;
if (rel_error > scoreError->maxRelError) {
scoreError->maxRelError = rel_error;
}
if (error > scoreError->threshold) {
numErrors++;
}
}
}
scoreError->rmsError = sqrt(scoreError->sumSquaredError / (numRows * numColumns));
scoreError->sumRmsError += scoreError->rmsError;
scoreError->numErrors = numErrors;
return (numErrors);
}
float StdDevError(score_error_t error) {
return (sqrt(error.sumSquaredError / error.numScores
- (error.sumError / error.numScores) * (error.sumError / error.numScores)));
}
float StdDevRelError(score_error_t error) {
return (sqrt(error.sumSquaredRelError / error.numScores
- (error.sumRelError / error.numScores) * (error.sumRelError / error.numScores)));
}
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
#if defined(_WIN32) || defined(WIN32)
#include <intrin.h>
#include <windows.h>
#else
#include <cpuid.h>
#endif
inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx) {
size_t level = *eax;
#if defined(_WIN32) || defined(WIN32)
int regs[4] = {static_cast<int>(*eax), static_cast<int>(*ebx), static_cast<int>(*ecx), static_cast<int>(*edx)};
__cpuid(regs, level);
*eax = static_cast<uint32_t>(regs[0]);
*ebx = static_cast<uint32_t>(regs[1]);
*ecx = static_cast<uint32_t>(regs[2]);
*edx = static_cast<uint32_t>(regs[3]);
#else
__get_cpuid(level, eax, ebx, ecx, edx);
#endif
}
// return GNA module frequency in MHz
float getGnaFrequencyMHz() {
uint32_t eax = 1;
uint32_t ebx = 0;
uint32_t ecx = 0;
uint32_t edx = 0;
uint32_t family = 0;
uint32_t model = 0;
const uint8_t sixth_family = 6;
const uint8_t cannon_lake_model = 102;
const uint8_t gemini_lake_model = 122;
native_cpuid(&eax, &ebx, &ecx, &edx);
family = (eax >> 8) & 0xF;
// model is the concatenation of two fields
// | extended model | model |
// copy extended model data
model = (eax >> 16) & 0xF;
// shift
model <<= 4;
// copy model data
model += (eax >> 4) & 0xF;
if (family == sixth_family && model == cannon_lake_model) {
return 400;
} else if (family == sixth_family &&
model == gemini_lake_model) {
return 200;
} else {
// counters are not supported, so return just the default value
return 1;
}
}
#endif // if not ARM
void printReferenceCompareResults(score_error_t const &totalError,
size_t framesNum,
std::ostream &stream) {
stream << " max error: " <<
totalError.maxError << std::endl;
stream << " avg error: " <<
totalError.sumError / totalError.numScores << std::endl;
stream << " avg rms error: " <<
totalError.sumRmsError / framesNum << std::endl;
stream << " stdev error: " <<
StdDevError(totalError) << std::endl << std::endl;
stream << std::endl;
}
void printPerformanceCounters(std::map<std::string,
InferenceEngine::InferenceEngineProfileInfo> const &utterancePerfMap,
size_t callsNum,
std::ostream &stream, std::string fullDeviceName) {
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
std::ios_base::fmtflags fmt_flags(stream.flags() );
stream << std::endl << "Performance counts:" << std::endl;
stream << std::setw(10) << std::right << "" << "Counter descriptions";
stream << std::setw(22) << "Utt scoring time";
stream << std::setw(18) << "Avg infer time";
stream << std::endl;
stream << std::setw(46) << "(ms)";
stream << std::setw(24) << "(us per call)";
stream << std::endl;
for (const auto &it : utterancePerfMap) {
std::string const &counter_name = it.first;
float current_units = static_cast<float>(it.second.realTime_uSec);
float call_units = current_units / callsNum;
// if GNA HW counters
// get frequency of GNA module
float freq = getGnaFrequencyMHz();
current_units /= freq * 1000;
call_units /= freq;
stream << std::setw(30) << std::left << counter_name.substr(4, counter_name.size() - 1);
stream << std::setw(16) << std::right << current_units;
stream << std::setw(21) << std::right << call_units;
stream << std::endl;
}
stream << std::endl;
std::cout << std::endl;
std::cout << "Full device name: " << fullDeviceName << std::endl;
std::cout << std::endl;
stream.flags(fmt_flags);
#endif
}
void getPerformanceCounters(InferenceEngine::InferRequest &request,
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfCounters) {
auto retPerfCounters = request.GetPerformanceCounts();
for (const auto &pair : retPerfCounters) {
perfCounters[pair.first] = pair.second;
}
}
void sumPerformanceCounters(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> const &perfCounters,
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &totalPerfCounters) {
for (const auto &pair : perfCounters) {
totalPerfCounters[pair.first].realTime_uSec += pair.second.realTime_uSec;
}
}
bool ParseAndCheckCommandLine(int argc, char *argv[]) {
// ---------------------------Parsing and validation of input args--------------------------------------
slog::info << "Parsing input parameters" << slog::endl;
gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
if (FLAGS_h) {
showUsage();
showAvailableDevices();
return false;
}
bool isDumpMode = !FLAGS_wg.empty() || !FLAGS_we.empty();
// input not required only in dump mode and if external scale factor provided
if (FLAGS_i.empty() && (!isDumpMode || FLAGS_q.compare("user") != 0)) {
if (isDumpMode) {
throw std::logic_error("In model dump mode either static quantization is used (-i) or user scale"
" factor need to be provided. See -q user option");
}
throw std::logic_error("Input file not set. Please use -i.");
}
if (FLAGS_m.empty() && FLAGS_rg.empty()) {
throw std::logic_error("Either IR file (-m) or GNAModel file (-rg) need to be set.");
}
if ((!FLAGS_m.empty() && !FLAGS_rg.empty())) {
throw std::logic_error("Only one of -m and -rg is allowed.");
}
std::vector<std::string> supportedDevices = {
"CPU",
"GPU",
"GNA_AUTO",
"GNA_HW",
"GNA_SW_EXACT",
"GNA_SW",
"GNA_SW_FP32",
"HETERO:GNA,CPU",
"HETERO:GNA_HW,CPU",
"HETERO:GNA_SW_EXACT,CPU",
"HETERO:GNA_SW,CPU",
"HETERO:GNA_SW_FP32,CPU",
"MYRIAD"
};
if (std::find(supportedDevices.begin(), supportedDevices.end(), FLAGS_d) == supportedDevices.end()) {
throw std::logic_error("Specified device is not supported.");
}
float scaleFactorInput = static_cast<float>(FLAGS_sf);
if (scaleFactorInput <= 0.0f) {
throw std::logic_error("Scale factor out of range (must be non-negative).");
}
uint32_t batchSize = (uint32_t) FLAGS_bs;
if ((batchSize < 1) || (batchSize > 8)) {
throw std::logic_error("Batch size out of range (1..8).");
}
/** default is static quantization **/
if ((FLAGS_q.compare("static") != 0) && (FLAGS_q.compare("dynamic") != 0) && (FLAGS_q.compare("user") != 0)) {
throw std::logic_error("Quantization mode not supported (static, dynamic, user).");
}
if (FLAGS_q.compare("dynamic") == 0) {
throw std::logic_error("Dynamic quantization not yet supported.");
}
if (FLAGS_qb != 16 && FLAGS_qb != 8) {
throw std::logic_error("Only 8 or 16 bits supported.");
}
if (FLAGS_nthreads <= 0) {
throw std::logic_error("Invalid value for 'nthreads' argument. It must be greater that or equal to 0");
}
if (FLAGS_cw_r < 0) {
throw std::logic_error("Invalid value for 'cw_r' argument. It must be greater than or equal to 0");
}
if (FLAGS_cw_l < 0) {
throw std::logic_error("Invalid value for 'cw_l' argument. It must be greater than or equal to 0");
}
// RH decoder parameters
if (FLAGS_hmm.empty()) {
throw std::logic_error("RH HMM model file not set. Please use -hmm.");
}
if (FLAGS_labels.empty()) {
throw std::logic_error("RH labels file not set. Please use -labels.");
}
if (FLAGS_g.empty()) {
throw std::logic_error("RH LM: G.fst model file not set. Please use -g.");
}
if (FLAGS_cl.empty()) {
throw std::logic_error("RH LM: CL.fst model file not set. Please use -cl.");
}
return true;
}
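// Reads a whole binary file into a newly allocated buffer and stores its size
// in *size; the caller owns the returned buffer and must delete[] it.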
uint8_t* ReadBinaryFile(const char* filename, uint32_t* size) {
if (nullptr == size) {
throw std::logic_error("Size parameter is null");
}
FILE * f = fopen(filename, "rb");
if (!f) {
throw std::runtime_error("Failed to open binary file " + std::string(filename));
}
int32_t res = fseek(f, 0, SEEK_END);
if (res != 0) {
fclose(f);
throw std::runtime_error("Error occured while loading (fseek) file " + std::string(filename));
}
auto fileSize = ftell(f);
if (fileSize < 0) {
fclose(f);
throw std::runtime_error("Error occured while loading (ftell) file " + std::string(filename));
return nullptr;
}
res = fseek(f, 0, SEEK_SET);
uint8_t* data = new (std::nothrow) uint8_t[fileSize];
if (!data) {
fclose(f);
throw std::runtime_error("Not enough memory to load file " + std::string(filename));
}
*size = fread(data, 1, fileSize, f);
fclose(f);
if (*size != fileSize) {
delete[] data;
throw std::runtime_error("Could not read all the data from file " + std::string(filename));
}
return data;
}
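// Creates an RH decoder instance, applies the command-line decoder parameters,
// and loads the HMM, pronunciation (CL), language (G), and label resources.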
void InitializeRhDecoder(RhDecoderInstanceParams& instanceParams, int32_t scoreVectorSize) {
uint32_t hmm_size = 0;
uint32_t cl_size = 0;
uint32_t g_size = 0;
uint32_t label_size = 0;
instanceParams.hmm_data = ReadBinaryFile(FLAGS_hmm.c_str(), &hmm_size);
instanceParams.cl_data = ReadBinaryFile(FLAGS_cl.c_str(), &cl_size);
instanceParams.g_data = ReadBinaryFile(FLAGS_g.c_str(), &g_size);
instanceParams.label_data = ReadBinaryFile(FLAGS_labels.c_str(), &label_size);
if (instanceParams.hmm_data && instanceParams.cl_data &&
instanceParams.g_data && instanceParams.label_data) {
RhDecoderStatus status = RhDecoderCreateInstance(&instanceParams.handle);
do {
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to create decoder");
}
status = RhDecoderSetDefaultParameterValues(instanceParams.handle,
RhAcousticModelType::RH_ACOUSTIC_MODEL_TYPE_GENERIC_CHAIN);
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set default decoder values");
}
// now overwrite some of the parameters
float acoustic_scale_factor = static_cast<float>(FLAGS_amsf);
status = RhDecoderSetParameterValue(instanceParams.handle, RH_DECODER_ACOUSTIC_SCALE_FACTOR,
&acoustic_scale_factor, sizeof(float));
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set parameter acoustic_scale_factor value");
}
status = RhDecoderSetParameterValue(instanceParams.handle, RH_DECODER_ACOUSTIC_SCORE_VECTOR_SIZE,
&scoreVectorSize, sizeof(int));
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set parameter score_vector_size value");
}
float beam_width = static_cast<float>(FLAGS_beam_width);
status = RhDecoderSetParameterValue(instanceParams.handle, RH_DECODER_BEAM_WIDTH,
&beam_width, sizeof(float));
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set parameter beam_width value");
}
status = RhDecoderSetParameterValue(instanceParams.handle, RH_DECODER_NBEST,
&FLAGS_nbest, sizeof(int));
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set parameter nbest value");
}
status = RhDecoderSetParameterValue(instanceParams.handle, RH_DECODER_G_CACHE_LOG_SIZE,
&FLAGS_gcls, sizeof(int));
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set parameter g_cache_log_size value");
}
status = RhDecoderSetParameterValue(instanceParams.handle, RH_DECODER_TRACE_BACK_LOG_SIZE,
&FLAGS_tbls, sizeof(int));
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set parameter trace_back_log_size value");
}
status = RhDecoderSetParameterValue(instanceParams.handle, RH_DECODER_MIN_STABLE_FRAMES,
&FLAGS_msf, sizeof(int));
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set parameter min_stable_frames value");
}
status = RhDecoderSetParameterValue(instanceParams.handle, RH_DECODER_TOKEN_BUFFER_SIZE,
&FLAGS_tbs, sizeof(int));
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to set parameter token_buffer_size value");
}
status = RhDecoderSetupResource(instanceParams.handle,
RhResourceType::HMM, instanceParams.hmm_data, hmm_size);
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to load HMM model");
}
status = RhDecoderSetupResource(instanceParams.handle,
RhResourceType::PRONUNCIATION_MODEL, instanceParams.cl_data, cl_size);
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to load pronunciation model");
}
status = RhDecoderSetupResource(instanceParams.handle,
RhResourceType::LANGUAGE_MODEL, instanceParams.g_data, g_size);
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to load language model");
}
status = RhDecoderSetupResource(instanceParams.handle,
RhResourceType::LABELS, instanceParams.label_data, label_size);
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to load labels");
}
status = RhDecoderInitInstance(instanceParams.handle);
if (RhDecoderStatus::RH_DECODER_SUCCESS != status) {
throw std::logic_error("Failed to initialize decoder");
}
} while (0);
} else {
throw std::logic_error("Failed to read one of the resources");
}
}
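// Releases the decoder instance and the resource buffers allocated by
// InitializeRhDecoder().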
void FreeRhDecoder(RhDecoderInstanceParams& instanceParams) {
if (instanceParams.handle) {
RhDecoderStatus status = RhDecoderFreeInstance(instanceParams.handle);
if (status != RH_DECODER_SUCCESS) {
slog::err << "Failed to free decoder. Status: " << status << slog::endl;
throw std::logic_error("Failed to free decoder. Status: " + std::to_string(status));
}
}
if (instanceParams.hmm_data) {
delete[] instanceParams.hmm_data;
instanceParams.hmm_data = nullptr;
}
if (instanceParams.cl_data) {
delete[] instanceParams.cl_data;
instanceParams.cl_data = nullptr;
}
if (instanceParams.g_data) {
delete[] instanceParams.g_data;
instanceParams.g_data = nullptr;
}
if (instanceParams.label_data) {
delete[] instanceParams.label_data;
instanceParams.label_data = nullptr;
}
}
/**
* @brief The entry point for inference engine automatic speech recognition sample
* @file speech_sample/main.cpp
* @example speech_sample/main.cpp
*/
int main(int argc, char *argv[]) {
try {
slog::info << "InferenceEngine: " << GetInferenceEngineVersion() << slog::endl;
// ------------------------------ Parsing and validation of input args ---------------------------------
if (!ParseAndCheckCommandLine(argc, argv)) {
return 0;
}
if (FLAGS_l.empty()) {
slog::info << "No extensions provided" << slog::endl;
}
auto isFeature = [&](const std::string xFeature) { return FLAGS_d.find(xFeature) != std::string::npos; };
bool useGna = isFeature("GNA");
bool useHetero = isFeature("HETERO");
std::string deviceStr =
useHetero && useGna ? "HETERO:GNA,CPU" : FLAGS_d.substr(0, (FLAGS_d.find("_")));
float scaleFactorInput = static_cast<float>(FLAGS_sf);
uint32_t batchSize = (FLAGS_cw_r > 0 || FLAGS_cw_l > 0) ? 1 : (uint32_t) FLAGS_bs;
std::vector<std::string> inputArkFiles;
std::vector<uint32_t> numBytesThisUtterance;
uint32_t numUtterances(0);
if (!FLAGS_i.empty()) {
std::string outStr;
std::istringstream stream(FLAGS_i);
uint32_t currentNumUtterances(0), currentNumBytesThisUtterance(0);
while (getline(stream, outStr, ',')) {
std::string filename(fileNameNoExt(outStr) + ".ark");
inputArkFiles.push_back(filename);
GetKaldiArkInfo(filename.c_str(), 0, &currentNumUtterances, &currentNumBytesThisUtterance);
if (numUtterances == 0) {
numUtterances = currentNumUtterances;
} else if (currentNumUtterances != numUtterances) {
throw std::logic_error("Incorrect input files. Number of utterance must be the same for all ark files");
}
numBytesThisUtterance.push_back(currentNumBytesThisUtterance);
}
}
size_t numInputArkFiles(inputArkFiles.size());
// -----------------------------------------------------------------------------------------------------
// --------------------------- 1. Load inference engine -------------------------------------
slog::info << "Loading Inference Engine" << slog::endl;
Core ie;
/** Printing device version **/
slog::info << "Device info: " << slog::endl;
std::cout << ie.GetVersions(deviceStr) << std::endl;
// -----------------------------------------------------------------------------------------------------
// --------------------------- 2. Read IR Generated by ModelOptimizer (.xml and .bin files) ------------
slog::info << "Loading network files" << slog::endl;
CNNNetReader netBuilder;
if (!FLAGS_m.empty()) {
/** Read network model **/
netBuilder.ReadNetwork(FLAGS_m);
/** Extract model name and load weights **/
std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
netBuilder.ReadWeights(binFileName);
// -------------------------------------------------------------------------------------------------
// --------------------------- 3. Set batch size ---------------------------------------------------
/** Set batch size. Unlike in imaging, batching in time (rather than space) is done for speech recognition. **/
netBuilder.getNetwork().setBatchSize(batchSize);
slog::info << "Batch size is " << std::to_string(netBuilder.getNetwork().getBatchSize())
<< slog::endl;
}
/** Setting parameter for per layer metrics **/
std::map<std::string, std::string> gnaPluginConfig;
std::map<std::string, std::string> genericPluginConfig;
if (useGna) {
std::string gnaDevice =
useHetero ? FLAGS_d.substr(FLAGS_d.find("GNA"), FLAGS_d.find(",") - FLAGS_d.find("GNA")) : FLAGS_d;
gnaPluginConfig[GNAConfigParams::KEY_GNA_DEVICE_MODE] =
gnaDevice.find("_") == std::string::npos ? "GNA_AUTO" : gnaDevice;
}
if (FLAGS_pc) {
genericPluginConfig[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES;
}
if (FLAGS_q.compare("user") == 0) {
if (numInputArkFiles > 1) {
std::string errMessage("Incorrect use case for multiple input ark files. Please don't use -q 'user' for this case.");
throw std::logic_error(errMessage);
}
slog::info << "Using scale factor of " << FLAGS_sf << slog::endl;
gnaPluginConfig[GNA_CONFIG_KEY(SCALE_FACTOR)] = std::to_string(FLAGS_sf);
} else {
// "static" quantization with calculated scale factor
for (size_t i = 0; i < numInputArkFiles; i++) {
auto inputArkName = inputArkFiles[i].c_str();
std::string name;
std::vector<uint8_t> ptrFeatures;
uint32_t numArrays(0), numBytes(0), numFrames(0), numFrameElements(0), numBytesPerElement(0);
GetKaldiArkInfo(inputArkName, 0, &numArrays, &numBytes);
ptrFeatures.resize(numBytes);
LoadKaldiArkArray(inputArkName,
0,
name,
ptrFeatures,
&numFrames,
&numFrameElements,
&numBytesPerElement);
scaleFactorInput =
ScaleFactorForQuantization(ptrFeatures.data(), MAX_VAL_2B_FEAT, numFrames * numFrameElements);
slog::info << "Using scale factor of " << scaleFactorInput << " calculated from first utterance."
<< slog::endl;
std::string scaleFactorConfigKey = GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_") + std::to_string(i);
gnaPluginConfig[scaleFactorConfigKey] = std::to_string(scaleFactorInput);
}
}
if (FLAGS_qb == 8) {
gnaPluginConfig[GNAConfigParams::KEY_GNA_PRECISION] = "I8";
} else {
gnaPluginConfig[GNAConfigParams::KEY_GNA_PRECISION] = "I16";
}
gnaPluginConfig[GNAConfigParams::KEY_GNA_LIB_N_THREADS] = std::to_string((FLAGS_cw_r > 0 || FLAGS_cw_l > 0) ? 1 : FLAGS_nthreads);
gnaPluginConfig[GNA_CONFIG_KEY(COMPACT_MODE)] = CONFIG_VALUE(NO);
// -----------------------------------------------------------------------------------------------------
// --------------------------- 4. Write model to file --------------------------------------------------
// Embedded GNA model dumping (for Intel(R) Speech Enabling Developer Kit)
if (!FLAGS_we.empty()) {
gnaPluginConfig[GNAConfigParams::KEY_GNA_FIRMWARE_MODEL_IMAGE] = FLAGS_we;
}
// -----------------------------------------------------------------------------------------------------
// --------------------------- 5. Loading model to the device ------------------------------------------
if (useGna) {
genericPluginConfig.insert(std::begin(gnaPluginConfig), std::end(gnaPluginConfig));
}
auto t0 = Time::now();
ExecutableNetwork executableNet;
if (!FLAGS_m.empty()) {
slog::info << "Loading model to the device" << slog::endl;
executableNet = ie.LoadNetwork(netBuilder.getNetwork(), deviceStr, genericPluginConfig);
} else {
slog::info << "Importing model to the device" << slog::endl;
executableNet = ie.ImportNetwork(FLAGS_rg.c_str(), deviceStr, genericPluginConfig);
}
ms loadTime = std::chrono::duration_cast<ms>(Time::now() - t0);
slog::info << "Model loading time " << loadTime.count() << " ms" << slog::endl;
// --------------------------- 6. Exporting gna model using InferenceEngine AOT API---------------------
if (!FLAGS_wg.empty()) {
slog::info << "Writing GNA Model to file " << FLAGS_wg << slog::endl;
t0 = Time::now();
executableNet.Export(FLAGS_wg);
ms exportTime = std::chrono::duration_cast<ms>(Time::now() - t0);
slog::info << "Exporting time " << exportTime.count() << " ms" << slog::endl;
return 0;
}
if (!FLAGS_we.empty()) {
slog::info << "Exported GNA embedded model to file " << FLAGS_we << slog::endl;
return 0;
}
std::vector<InferRequestStruct> inferRequests((FLAGS_cw_r > 0 || FLAGS_cw_l > 0) ? 1 : FLAGS_nthreads);
for (auto& inferRequest : inferRequests) {
inferRequest = {executableNet.CreateInferRequest(), -1, batchSize};
}
// -----------------------------------------------------------------------------------------------------
// --------------------------- 7. Prepare input blobs --------------------------------------------------
/** Taking information about all topology inputs **/
ConstInputsDataMap cInputInfo = executableNet.GetInputsInfo();
/** Stores all input blobs data **/
if (cInputInfo.size() != numInputArkFiles) {
throw std::logic_error("Number of network inputs("
+ std::to_string(cInputInfo.size()) + ") is not equal to number of ark files("
+ std::to_string(numInputArkFiles) + ")");
}
std::vector<Blob::Ptr> ptrInputBlobs;
for (auto& input : cInputInfo) {
ptrInputBlobs.push_back(inferRequests.begin()->inferRequest.GetBlob(input.first));
}
InputsDataMap inputInfo;
if (!FLAGS_m.empty()) {
inputInfo = netBuilder.getNetwork().getInputsInfo();
}
/** configure input precision if model loaded from IR **/
for (auto &item : inputInfo) {
Precision inputPrecision = Precision::FP32; // specify Precision::I16 to provide quantized inputs
item.second->setPrecision(inputPrecision);
item.second->getInputData()->setLayout(Layout::NC); // row major layout
}
// -----------------------------------------------------------------------------------------------------
// --------------------------- 8. Prepare output blobs -------------------------------------------------
ConstOutputsDataMap cOutputInfo(executableNet.GetOutputsInfo());
OutputsDataMap outputInfo;
if (!FLAGS_m.empty()) {
outputInfo = netBuilder.getNetwork().getOutputsInfo();
}
Blob::Ptr ptrOutputBlob = inferRequests.begin()->inferRequest.GetBlob(cOutputInfo.rbegin()->first);
for (auto &item : outputInfo) {
DataPtr outData = item.second;
if (!outData) {
throw std::logic_error("output data pointer is not valid");
}
Precision outputPrecision = Precision::FP32; // specify Precision::I32 to retrieve quantized outputs
outData->setPrecision(outputPrecision);
outData->setLayout(Layout::NC); // row major layout
}
// -----------------------------------------------------------------------------------------------------
// --------------------------- 9. Initialize RH decoder ------------------------------------------------
RhDecoderInstanceParams rhDecoderInstanceParams{ nullptr };
auto lastLayerOutputCount = outputInfo.begin()->second->getDims()[1];
InitializeRhDecoder(rhDecoderInstanceParams, lastLayerOutputCount);
// allocate 1MB for result
std::vector<char> rh_utterance_transcription(1024 * 1024);
// -----------------------------------------------------------------------------------------------------
// --------------------------- 10. Do inference --------------------------------------------------------
std::vector<std::vector<uint8_t>> ptrUtterances;
std::vector<uint8_t> ptrScores;
std::vector<uint8_t> ptrReferenceScores;
score_error_t frameError, totalError;
ptrUtterances.resize(inputArkFiles.size());
for (uint32_t utteranceIndex = 0; utteranceIndex < numUtterances; ++utteranceIndex) {
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> utterancePerfMap;
std::string uttName;
uint32_t numFrames(0), n(0);
std::vector<uint32_t> numFrameElementsInput;
uint32_t numFramesReference(0), numFrameElementsReference(0), numBytesPerElementReference(0),
numBytesReferenceScoreThisUtterance(0);
const uint32_t numScoresPerFrame = ptrOutputBlob->size() / batchSize;
numFrameElementsInput.resize(numInputArkFiles);
for (size_t i = 0; i < inputArkFiles.size(); i++) {
std::vector<uint8_t> ptrUtterance;
auto inputArkFilename = inputArkFiles[i].c_str();
uint32_t currentNumFrames(0), currentNumFrameElementsInput(0), currentNumBytesPerElementInput(0);
GetKaldiArkInfo(inputArkFilename, utteranceIndex, &n, &numBytesThisUtterance[i]);
ptrUtterance.resize(numBytesThisUtterance[i]);
LoadKaldiArkArray(inputArkFilename,
utteranceIndex,
uttName,
ptrUtterance,
&currentNumFrames,
&currentNumFrameElementsInput,
&currentNumBytesPerElementInput);
if (numFrames == 0) {
numFrames = currentNumFrames;
} else if (numFrames != currentNumFrames) {
std::string errMessage("Number of frames in ark files is different: " + std::to_string(numFrames) +
" and " + std::to_string(currentNumFrames));
throw std::logic_error(errMessage);
}
ptrUtterances[i] = ptrUtterance;
numFrameElementsInput[i] = currentNumFrameElementsInput;
}
int i = 0;
for (auto& ptrInputBlob : ptrInputBlobs) {
if (ptrInputBlob->size() != numFrameElementsInput[i++] * batchSize) {
throw std::logic_error("network input size(" + std::to_string(ptrInputBlob->size()) +
") mismatch to ark file size (" +
std::to_string(numFrameElementsInput[i-1] * batchSize) + ")");
}
}
ptrScores.resize(numFrames * numScoresPerFrame * sizeof(float));
if (!FLAGS_r.empty()) {
std::string refUtteranceName;
GetKaldiArkInfo(FLAGS_r.c_str(), utteranceIndex, &n, &numBytesReferenceScoreThisUtterance);
ptrReferenceScores.resize(numBytesReferenceScoreThisUtterance);
LoadKaldiArkArray(FLAGS_r.c_str(),
utteranceIndex,
refUtteranceName,
ptrReferenceScores,
&numFramesReference,
&numFrameElementsReference,
&numBytesPerElementReference);
}
double totalTime = 0.0;
std::cout << "Utterance " << utteranceIndex << ": " << std::endl;
ClearScoreError(&totalError);
totalError.threshold = frameError.threshold = MAX_SCORE_DIFFERENCE;
auto outputFrame = &ptrScores.front();
std::vector<uint8_t*> inputFrame;
for (auto& ut : ptrUtterances) {
inputFrame.push_back(&ut.front());
}
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> callPerfMap;
size_t frameIndex = 0;
uint32_t numFramesArkFile = numFrames;
numFrames += FLAGS_cw_l + FLAGS_cw_r;
uint32_t numFramesThisBatch{batchSize};
auto t0 = Time::now();
auto t1 = t0;
while (frameIndex <= numFrames) {
if (frameIndex == numFrames) {
if (std::find_if(inferRequests.begin(),
inferRequests.end(),
[&](InferRequestStruct x) { return (x.frameIndex != -1); } ) == inferRequests.end()) {
break;
}
}
bool inferRequestFetched = false;
for (auto &inferRequest : inferRequests) {
if (frameIndex == numFrames) {
numFramesThisBatch = 1;
} else {
numFramesThisBatch = (numFrames - frameIndex < batchSize) ? (numFrames - frameIndex)
: batchSize;
}
if (inferRequest.frameIndex != -1) {
StatusCode code = inferRequest.inferRequest.Wait(
InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
if (code != StatusCode::OK) {
if (!useHetero) continue;
if (code != StatusCode::INFER_NOT_STARTED) continue;
}
if (inferRequest.frameIndex >= 0) {
Blob::Ptr outputBlob = inferRequest.inferRequest.GetBlob(cOutputInfo.rbegin()->first);
RhDecoderInfo info;
const float* acoustic_score_vector_index = outputBlob->buffer();
for (uint32_t f = 0; f < inferRequest.numFramesThisBatch; ++f) {
RhDecoderStatus rh_status = RhDecoderProcessFrame(rhDecoderInstanceParams.handle,
acoustic_score_vector_index, numScoresPerFrame, &info);
if (RhDecoderStatus::RH_DECODER_SUCCESS != rh_status) {
throw std::logic_error(
"Decoder failed to process frame: " + std::to_string(inferRequest.frameIndex));
}
if (info.is_result_stable || inferRequest.frameIndex + f == numFrames - 1) {
rh_status = RhDecoderGetResult(rhDecoderInstanceParams.handle,
RhDecoderResultType::RH_DECODER_FINAL_RESULT,
rh_utterance_transcription.data(),
rh_utterance_transcription.size());
if (RhDecoderStatus::RH_DECODER_SUCCESS != rh_status) {
throw std::logic_error("Failed to retrieve speech recognition result");
}
std::cout << uttName << "\t" << rh_utterance_transcription.data() << std::endl;
}
acoustic_score_vector_index += lastLayerOutputCount;
}
if (!FLAGS_o.empty()) {
outputFrame =
&ptrScores.front() + numScoresPerFrame * sizeof(float) * (inferRequest.frameIndex);
Blob::Ptr outputBlob = inferRequest.inferRequest.GetBlob(cOutputInfo.rbegin()->first);
auto byteSize = inferRequest.numFramesThisBatch * numScoresPerFrame * sizeof(float);
std::memcpy(outputFrame,
outputBlob->buffer(),
byteSize);
}
if (!FLAGS_r.empty()) {
Blob::Ptr outputBlob = inferRequest.inferRequest.GetBlob(cOutputInfo.begin()->first);
CompareScores(outputBlob->buffer().as<float*>(),
&ptrReferenceScores[inferRequest.frameIndex *
numFrameElementsReference *
numBytesPerElementReference],
&frameError,
inferRequest.numFramesThisBatch,
numFrameElementsReference);
UpdateScoreError(&frameError, &totalError);
}
if (FLAGS_pc) {
// retrieve new counters
getPerformanceCounters(inferRequest.inferRequest, callPerfMap);
// summarize retrieved counters with all previous ones
sumPerformanceCounters(callPerfMap, utterancePerfMap);
}
}
}
if (frameIndex == numFrames) {
inferRequest.frameIndex = -1;
continue;
}
ptrInputBlobs.clear();
for (auto& input : cInputInfo) {
ptrInputBlobs.push_back(inferRequest.inferRequest.GetBlob(input.first));
}
for (size_t i = 0; i < numInputArkFiles; ++i) {
std::memcpy(ptrInputBlobs[i]->buffer(),
inputFrame[i],
ptrInputBlobs[i]->byteSize());
}
int index = static_cast<int>(frameIndex) - (FLAGS_cw_l + FLAGS_cw_r);
inferRequest.inferRequest.StartAsync();
inferRequest.frameIndex = index < 0 ? -2 : index;
inferRequest.numFramesThisBatch = numFramesThisBatch;
frameIndex += numFramesThisBatch;
for (size_t j = 0; j < inputArkFiles.size(); j++) {
if (FLAGS_cw_l > 0 || FLAGS_cw_r > 0) {
int i = frameIndex - FLAGS_cw_l;
if (i > 0 && i < static_cast<int>(numFramesArkFile)) {
inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
} else if (i >= static_cast<int>(numFramesArkFile)) {
inputFrame[j] = &ptrUtterances[0].front() +
(numFramesArkFile - 1) * sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
} else if (i < 0) {
inputFrame[j] = &ptrUtterances[0].front();
}
} else {
inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
}
}
inferRequestFetched |= true;
}
if (!inferRequestFetched) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
continue;
}
}
t1 = Time::now();
fsec fs = t1 - t0;
ms d = std::chrono::duration_cast<ms>(fs);
totalTime += d.count();
// resetting state between utterances
for (auto &&state : executableNet.QueryState()) {
state.Reset();
}
if (!FLAGS_o.empty()) {
bool shouldAppend = (utteranceIndex != 0);
SaveKaldiArkArray(FLAGS_o.c_str(), shouldAppend, uttName, &ptrScores.front(),
numFrames, numScoresPerFrame);
}
/** Show performance results **/
std::cout << "Total time in Infer (HW and SW):\t" << totalTime << " ms"
<< std::endl;
std::cout << "Frames in utterance:\t\t\t" << numFrames << " frames"
<< std::endl;
std::cout << "Average Infer time per frame:\t\t" << totalTime / static_cast<double>(numFrames) << " ms"
<< std::endl;
if (FLAGS_pc) {
// print
printPerformanceCounters(utterancePerfMap, frameIndex, std::cout, getFullDeviceName(ie, FLAGS_d));
}
if (!FLAGS_r.empty()) {
printReferenceCompareResults(totalError, numFrames, std::cout);
}
std::cout << "End of Utterance " << utteranceIndex << std::endl << std::endl;
}
FreeRhDecoder(rhDecoderInstanceParams);
// -----------------------------------------------------------------------------------------------------
}
catch (const std::exception &error) {
slog::err << error.what() << slog::endl;
return 1;
}
catch (...) {
slog::err << "Unknown/internal exception happened" << slog::endl;
return 1;
}
slog::info << "Execution successful" << slog::endl;
return 0;
}
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <string>
#include <vector>
#include <gflags/gflags.h>
#include <iostream>
/// @brief message for help argument
static const char help_message[] = "Print a usage message.";
/// @brief message for input files argument
static const char input_message[] = "Required. Paths to .ark files. Usage example: <file1.ark,file2.ark> or <file.ark>.";
/// @brief message for model argument
static const char model_message[] = "Required. Path to an .xml file with a trained model (required if -rg is missing).";
/// @brief message for plugin argument
static const char plugin_message[] = "Plugin name. For example, MKLDNNPlugin. If this parameter is set, " \
"the sample will look for this plugin only";
/// @brief message for assigning cnn calculation to device
static const char target_device_message[] = "Specify a target device to infer on. CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_FP32, "
"GNA_SW_EXACT, and HETERO with a combination of GNA as the primary device and CPU"
" as a secondary one (e.g. HETERO:GNA,CPU) are supported. The sample will look "
"for a suitable plugin for the device specified.";
/// @brief message for performance counters
static const char performance_counter_message[] = "Enables per-layer performance report";
/// @brief message for user library argument
static const char custom_cpu_library_message[] = "Required for MKLDNN (CPU)-targeted custom layers. " \
"Absolute path to a shared library with the kernels implementation.";
/// @brief message for score output argument
static const char output_message[] = "Output file name (default name is scores.ark).";
/// @brief message for reference score file argument
static const char reference_score_message[] = "Read reference score .ark file and compare scores.";
/// @brief message for read GNA model argument
static const char read_gna_model_message[] = "Read GNA model from file using path/filename provided (required if -m is missing).";
/// @brief message for write GNA model argument
static const char write_gna_model_message[] = "Write GNA model to file using path/filename provided.";
/// @brief message for write GNA embedded model argument
static const char write_embedded_model_message[] = "Write GNA embedded model to file using path/filename provided.";
/// @brief message for quantization argument
static const char quantization_message[] = "Input quantization mode: static (default), dynamic, or user (use with -sf).";
/// @brief message for quantization bits argument
static const char quantization_bits_message[] = "Weight bits for quantization: 8 or 16 (default)";
/// @brief message for scale factor argument
static const char scale_factor_message[] = "Optional user-specified input scale factor for quantization (use with -q user).";
/// @brief message for batch size argument
static const char batch_size_message[] = "Batch size 1-8 (default 1)";
/// @brief message for #threads for CPU inference
static const char infer_num_threads_message[] = "Optional. Number of threads to use for concurrent async" \
" inference requests on the GNA.";
/// @brief message for context window argument
static const char context_window_message_l[] = "Optional. Number of frames for left context windows (default is 0). " \
"Works only with context window networks."
" If you use the cw_l or cw_r flag, then batch size and nthreads arguments are ignored.";
/// @brief message for right context window argument
static const char context_window_message_r[] = "Optional. Number of frames for right context windows (default is 0). " \
"Works only with context window networks."
" If you use the cw_r or cw_l flag, then batch size and nthreads arguments are ignored.";
/// @brief message for RH HMM model argument
static const char rh_hmm_model_message[] = "Required. Path to RH .hmm file.";
/// @brief message for RH model argument
static const char rh_labels_message[] = "Required. Path to RH labels file.";
/// @brief message for RH LM: G model argument
static const char rh_g_model_message[] = "Required. Path to RH LM: G .fst model file.";
/// @brief message for RH LM: CL model argument
static const char rh_cl_model_message[] = "Required. Path to RH LM: CL .fst model file.";
/// @brief message for RH acoustic model scale factor argument
static const char rh_am_scale_factor_message[] = "Optional. RH acoustic model scale factor.";
/// @brief message for RH beam width argument
static const char rh_beam_width_message[] = "Optional. RH beam width.";
/// @brief message for RH N-best result argument
static const char rh_nbest_message[] = "Optional. RH N-best results.";
/// @brief message for RH G-cache log size argument
static const char rh_g_cache_log_size_message[] = "Optional. RH G-cache log size.";
/// @brief message for RH trace back log size argument
static const char rh_trace_back_log_size_message[] = "Optional. RH trace back log size.";
/// @brief message for RH minimum number of stable frames to attribute result as final
static const char rh_min_stable_frames_message[] = "Optional. Minimum number of stable frames to attribute result as final.";
/// @brief message for RH token buffer size argument
static const char rh_token_buffer_size_message[] = "Optional. RH token buffer size.";
/// @brief Define flag for showing help message <br>
DEFINE_bool(h, false, help_message);
/// @brief Define parameter to set paths to input .ark files <br>
/// It is a required parameter
DEFINE_string(i, "", input_message);
/// @brief Define parameter to set the model file <br>
/// It is a required parameter
DEFINE_string(m, "", model_message);
/// @brief Define parameter to set the plugin name <br>
/// It is a required parameter
DEFINE_string(p, "", plugin_message);
/// @brief Define the target device to infer on <br>
DEFINE_string(d, "GNA_AUTO", target_device_message);
/// @brief Enable per-layer performance report
DEFINE_bool(pc, false, performance_counter_message);
/// @brief Absolute path to CPU library with user layers <br>
/// It is an optional parameter
DEFINE_string(l, "", custom_cpu_library_message);
/// @brief Write output scores to file
DEFINE_string(o, "", output_message);
/// @brief Read reference score file
DEFINE_string(r, "", reference_score_message);
/// @brief Read GNA model from file (model.bin)
DEFINE_string(rg, "", read_gna_model_message);
/// @brief Write GNA model to file (model.bin)
DEFINE_string(wg, "", write_gna_model_message);
/// @brief Write GNA embedded model to file (model.bin)
DEFINE_string(we, "", write_embedded_model_message);
/// @brief Input quantization mode (default static)
DEFINE_string(q, "static", quantization_message);
/// @brief Input quantization bits (default 16)
DEFINE_int32(qb, 16, quantization_bits_message);
/// @brief Scale factor for quantization (default 1.0)
DEFINE_double(sf, 1.0, scale_factor_message);
/// @brief Batch size (default 1)
DEFINE_int32(bs, 1, batch_size_message);
/// @brief Number of threads to use for inference on the CPU (also affects Hetero cases)
DEFINE_int32(nthreads, 1, infer_num_threads_message);
/// @brief Right context window size (default 0)
DEFINE_int32(cw_r, 0, context_window_message_r);
/// @brief Left context window size (default 0)
DEFINE_int32(cw_l, 0, context_window_message_l);
/// @brief Define parameter to set the RH HMM model file
/// It is a required parameter
DEFINE_string(hmm, "rh.hmm", rh_hmm_model_message);
/// @brief Define parameter to set the RH labels file
/// It is a required parameter
DEFINE_string(labels, "labels.bin", rh_labels_message);
/// @brief Define parameter to set the RH LM: G model file
/// It is a required parameter
DEFINE_string(g, "g.fst", rh_g_model_message);
/// @brief Define parameter to set the RH LM: CL model file
/// It is a required parameter
DEFINE_string(cl, "cl.fst", rh_cl_model_message);
/// @brief RH Acoustic model scale factor (default 1.0)
DEFINE_double(amsf, 1.0, rh_am_scale_factor_message);
/// @brief RH beam width (default 14.0)
DEFINE_double(beam_width, 14.0, rh_beam_width_message);
/// @brief RH N-best (default 1)
DEFINE_int32(nbest, 1, rh_nbest_message);
/// @brief RH G cache log size (default 19)
DEFINE_int32(gcls, 19, rh_g_cache_log_size_message);
/// @brief RH trace back log size (default 19)
DEFINE_int32(tbls, 19, rh_trace_back_log_size_message);
/// @brief RH minimum stable frames (default -1)
DEFINE_int32(msf, -1, rh_min_stable_frames_message);
/// @brief RH token buffer size (default 150000)
DEFINE_int32(tbs, 150000, rh_token_buffer_size_message);
/**
* @brief This function shows a help message
*/
static void showUsage() {
std::cout << std::endl;
std::cout << "speech_recognition_offline_demo [OPTION]" << std::endl;
std::cout << "Options:" << std::endl;
std::cout << std::endl;
std::cout << " -h " << help_message << std::endl;
std::cout << " -i \"<path>\" " << input_message << std::endl;
std::cout << " -m \"<path>\" " << model_message << std::endl;
std::cout << " -o \"<path>\" " << output_message << std::endl;
std::cout << " -l \"<absolute_path>\" " << custom_cpu_library_message << std::endl;
std::cout << " -d \"<device>\" " << target_device_message << std::endl;
std::cout << " -p " << plugin_message << std::endl;
std::cout << " -pc " << performance_counter_message << std::endl;
std::cout << " -q \"<mode>\" " << quantization_message << std::endl;
std::cout << " -qb \"<integer>\" " << quantization_bits_message << std::endl;
std::cout << " -sf \"<double>\" " << scale_factor_message << std::endl;
std::cout << " -bs \"<integer>\" " << batch_size_message << std::endl;
std::cout << " -r \"<path>\" " << reference_score_message << std::endl;
std::cout << " -rg \"<path>\" " << read_gna_model_message << std::endl;
std::cout << " -wg \"<path>\" " << write_gna_model_message << std::endl;
std::cout << " -we \"<path>\" " << write_embedded_model_message << std::endl;
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
std::cout << " -cw_l \"<integer>\" " << context_window_message_l << std::endl;
std::cout << " -cw_r \"<integer>\" " << context_window_message_r << std::endl;
std::cout << " -hmm \"<path>\" " << rh_hmm_model_message << std::endl;
std::cout << " -labels \"<path>\" " << rh_labels_message << std::endl;
std::cout << " -g \"<path>\" " << rh_g_model_message << std::endl;
std::cout << " -cl \"<path>\" " << rh_cl_model_message << std::endl;
std::cout << " -amsf \"<double>\" " << rh_am_scale_factor_message << std::endl;
std::cout << " -beam_width \"<double>\" " << rh_beam_width_message << std::endl;
std::cout << " -nbest \"<integer>\" " << rh_nbest_message << std::endl;
std::cout << " -gcls \"<integer>\" " << rh_g_cache_log_size_message << std::endl;
std::cout << " -tbls \"<integer>\" " << rh_trace_back_log_size_message << std::endl;
std::cout << " -msf \"<integer>\" " << rh_min_stable_frames_message << std::endl;
std::cout << " -tbs \"<integer>\" " << rh_token_buffer_size_message << std::endl;
}