Kinect for Windows SDK v2.0 Development Notes (8) Speech Recognition (2)

Keywords: SDK Windows REST encoding

Reprinted from: https://blog.csdn.net/dustpg/article/details/38202371

Use SDK: Kinect for Windows SDK v2.0 public preview

Following on from the previous section, initializing the Kinect this time is very simple:

//Initialize the Kinect
HRESULT ThisApp::init_kinect(){
    IAudioSource* pAudioSource = nullptr;
    IAudioBeamList* pAudioBeamList = nullptr;
    //Find the current default Kinect
    HRESULT hr = ::GetDefaultKinectSensor(&m_pKinect);
    //Open the Kinect
    if (SUCCEEDED(hr)){
        hr = m_pKinect->Open();
    }
    //Get the audio source
    if (SUCCEEDED(hr)){
        hr = m_pKinect->get_AudioSource(&pAudioSource);
    }
    //Get the audio beam list
    if (SUCCEEDED(hr)){
        hr = pAudioSource->get_AudioBeams(&pAudioBeamList);
    }
    //Get the audio beam
    if (SUCCEEDED(hr)){
        hr = pAudioBeamList->OpenAudioBeam(0, &m_pAudioBeam);
    }
    //Get the input audio stream
    if (SUCCEEDED(hr)){
        IStream* pStream = nullptr;
        hr = m_pAudioBeam->OpenInputStream(&pStream);
        //Wrap the raw stream in our wrapper object
        m_p16BitPCMAudioStream = new KinectAudioStreamWrapper(pStream);
        SafeRelease(pStream);
    }
    SafeRelease(pAudioBeamList);
    SafeRelease(pAudioSource);
    return hr;
}
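
KinectAudioStreamWrapper is the wrapper object used above (presumably covered in the previous note): the Kinect v2 audio beam delivers 32-bit IEEE float samples, while SAPI expects 16-bit PCM, so the wrapper converts inside Read(). For readers who skipped that part, a minimal sketch of the idea might look like this; it is my simplified version modeled on the official Speech Basics sample, not the exact class from the downloadable project:

//Minimal sketch: wrap the Kinect's 32-bit float audio stream as 16-bit PCM for SAPI
#include <Windows.h>
#include <objidl.h>

class KinectAudioStreamWrapper : public IStream
{
public:
    explicit KinectAudioStreamWrapper(IStream* p32BitAudio)
        : m_cRef(1), m_p32BitAudio(p32BitAudio), m_SpeechActive(false)
    {
        if (m_p32BitAudio) m_p32BitAudio->AddRef();
    }
    //Called from init_speech_recognizer to start/stop feeding audio to SAPI
    void SetSpeechState(BOOL active) { m_SpeechActive = (active != FALSE); }

    // IUnknown
    STDMETHODIMP QueryInterface(REFIID riid, void** ppv)
    {
        if (riid == IID_IUnknown || riid == IID_IStream) { *ppv = static_cast<IStream*>(this); AddRef(); return S_OK; }
        *ppv = nullptr;
        return E_NOINTERFACE;
    }
    STDMETHODIMP_(ULONG) AddRef()  { return ::InterlockedIncrement(&m_cRef); }
    STDMETHODIMP_(ULONG) Release()
    {
        ULONG count = ::InterlockedDecrement(&m_cRef);
        if (count == 0) { if (m_p32BitAudio) m_p32BitAudio->Release(); delete this; }
        return count;
    }

    // IStream - SAPI only ever calls Read on this stream
    STDMETHODIMP Read(void* pv, ULONG cb, ULONG* pcbRead)
    {
        if (!pv || !pcbRead) return E_INVALIDARG;
        INT16* pOutput = static_cast<INT16*>(pv);
        ULONG bytesNeeded = cb;                     //bytes of 16-bit PCM that SAPI wants
        while (bytesNeeded >= sizeof(INT16) && m_SpeechActive)
        {
            float sample = 0.f;
            ULONG bytesRead = 0;
            HRESULT hr = m_p32BitAudio->Read(&sample, sizeof(sample), &bytesRead);
            if (FAILED(hr)) return hr;
            if (bytesRead == sizeof(sample))
            {
                //Clamp [-1, 1] and scale the float sample to 16-bit PCM
                if (sample > 1.f)  sample = 1.f;
                if (sample < -1.f) sample = -1.f;
                *pOutput++ = static_cast<INT16>(sample * 32767.f);
                bytesNeeded -= sizeof(INT16);
            }
            else
            {
                ::Sleep(10);    //No audio available yet; wait rather than starve SAPI
            }
        }
        *pcbRead = cb - bytesNeeded;
        return S_OK;
    }
    //The remaining IStream methods are never used by SAPI
    STDMETHODIMP Write(const void*, ULONG, ULONG*)                                  { return E_NOTIMPL; }
    STDMETHODIMP Seek(LARGE_INTEGER, DWORD, ULARGE_INTEGER*)                        { return E_NOTIMPL; }
    STDMETHODIMP SetSize(ULARGE_INTEGER)                                            { return E_NOTIMPL; }
    STDMETHODIMP CopyTo(IStream*, ULARGE_INTEGER, ULARGE_INTEGER*, ULARGE_INTEGER*) { return E_NOTIMPL; }
    STDMETHODIMP Commit(DWORD)                                                      { return E_NOTIMPL; }
    STDMETHODIMP Revert()                                                           { return E_NOTIMPL; }
    STDMETHODIMP LockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD)                  { return E_NOTIMPL; }
    STDMETHODIMP UnlockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD)                { return E_NOTIMPL; }
    STDMETHODIMP Stat(STATSTG*, DWORD)                                              { return E_NOTIMPL; }
    STDMETHODIMP Clone(IStream**)                                                   { return E_NOTIMPL; }

private:
    LONG     m_cRef;            //COM reference count
    IStream* m_p32BitAudio;     //Raw 32-bit float stream from the audio beam
    bool     m_SpeechActive;    //Feed audio only while speech recognition is enabled
};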
The rest is initializing the speech recognition engine. Since the code is fairly boilerplate, feel free to copy it directly:

//Initialize speech recognition
HRESULT ThisApp::init_speech_recognizer(){
    HRESULT hr = S_OK;
    //Create the speech input stream
    if (SUCCEEDED(hr)){
        hr = CoCreateInstance(CLSID_SpStream, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), (void**)&m_pSpeechStream);
    }
    //Connect it to our Kinect audio input
    if (SUCCEEDED(hr)){
        WAVEFORMATEX wft = {
            WAVE_FORMAT_PCM, //PCM encoding
            1, //Mono
            16000,  //Sample rate: 16 kHz
            32000, //Average bytes per second = sample rate * block alignment
            2, //Block alignment: channels * bytes per sample = 2 bytes
            16, //16 bits per sample
            0 //No extra format data
        };
        //Set the base stream
        hr = m_pSpeechStream->SetBaseStream(m_p16BitPCMAudioStream, SPDFID_WaveFormatEx, &wft);
    }
    //Create the speech recognizer object
    if (SUCCEEDED(hr)){
        ISpObjectToken *pEngineToken = nullptr;
        //Create an in-process recognizer
        hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), (void**)&m_pSpeechRecognizer);
        if (SUCCEEDED(hr)) {
            //Connect the speech input stream we just created
            m_pSpeechRecognizer->SetInput(m_pSpeechStream, TRUE);
            //Choose the recognition language: Mainland Chinese (0x0804) here
            //There is currently no Kinect Chinese speech pack; if there were, you could use "Language=804;Kinect=True"
            hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"Language=804", nullptr, &pEngineToken);
            if (SUCCEEDED(hr)) {
                //Set the recognition engine
                m_pSpeechRecognizer->SetRecognizer(pEngineToken);
                //Create the recognition context
                hr = m_pSpeechRecognizer->CreateRecoContext(&m_pSpeechContext);
                //Turn acoustic-model adaptation off to keep accuracy from degrading over long sessions
                if (SUCCEEDED(hr))  {
                    hr = m_pSpeechRecognizer->SetPropertyNum(L"AdaptationOn", 0);
                }
            }
        }
        SafeRelease(pEngineToken);
    }
    //Create the grammar
    if (SUCCEEDED(hr)){
        hr = m_pSpeechContext->CreateGrammar(1, &m_pSpeechGrammar);
    }
    //Load the static SRGS grammar file
    if (SUCCEEDED(hr)){
        hr = m_pSpeechGrammar->LoadCmdFromFile(s_GrammarFileName, SPLO_STATIC);
    }
    //Activate the grammar rules
    if (SUCCEEDED(hr)){
        hr = m_pSpeechGrammar->SetRuleState(nullptr, nullptr, SPRS_ACTIVE);
    }
    //Keep the recognizer reading data at all times
    if (SUCCEEDED(hr)){
        hr = m_pSpeechRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
    }
    //Register interest in recognition events only
    if (SUCCEEDED(hr)){
        hr = m_pSpeechContext->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION));
    }
    //Make sure recognition is active
    if (SUCCEEDED(hr)){
        hr = m_pSpeechContext->Resume(0);
    }
    //Get the recognition event handle
    if (SUCCEEDED(hr)){
        m_p16BitPCMAudioStream->SetSpeechState(TRUE);
        m_hSpeechEvent = m_pSpeechContext->GetNotifyEventHandle();
    }
#ifdef _DEBUG
    else
        printf_s("init_speech_recognizer failed\n");
#endif
    return hr;
}
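
The two functions above rely on a handful of member variables. For reference, the class declaration would contain roughly the following; this is a sketch reconstructed from the names used in the code, not the exact header from the downloadable project:

#include <Kinect.h>     //IKinectSensor, IAudioBeam
#include <sapi.h>       //ISpStream, ISpRecognizer, ISpRecoContext, ISpRecoGrammar
#include <sphelper.h>   //SpFindBestToken and other SAPI helpers

class ThisApp {
    // ... window / rendering members omitted ...
private:
    IKinectSensor*            m_pKinect = nullptr;                //Kinect sensor
    IAudioBeam*               m_pAudioBeam = nullptr;             //Audio beam opened in init_kinect
    KinectAudioStreamWrapper* m_p16BitPCMAudioStream = nullptr;   //float -> 16-bit PCM wrapper
    ISpStream*                m_pSpeechStream = nullptr;          //SAPI stream bound to the wrapper
    ISpRecognizer*            m_pSpeechRecognizer = nullptr;      //In-process recognizer
    ISpRecoContext*           m_pSpeechContext = nullptr;         //Recognition context
    ISpRecoGrammar*           m_pSpeechGrammar = nullptr;         //SRGS grammar
    HANDLE                    m_hSpeechEvent = nullptr;           //Notify handle from the context
    static const WCHAR*       s_GrammarFileName;                  //Path of the SRGS grammar file
};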

Attention should be paid to the following points:

1. When calling SetBaseStream, the WAVEFORMATEX you fill in describes the 16-bit PCM data format.

2. The locale passed to SpFindBestToken is a hexadecimal LANGID; Mainland Chinese is 0x0804. If Kinect ever ships an additional language pack, you can add Kinect=True, separated from the language by a semicolon.

3. LoadCmdFromFile specifies the SRGS grammar file to load and whether the grammar is static or dynamic.

4. GetNotifyEventHandle returns the speech recognition event handle. The handle is not owned by the programmer, so do not call CloseHandle on it; the main loop simply waits on it, as in the sketch below.
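
A minimal sketch of that wait loop follows; the run() method and the m_bRunning flag are assumptions of mine, not part of the original project:

//Hypothetical main loop: wait for the SAPI notify handle, then drain the events
void ThisApp::run(){
    while (m_bRunning){     //m_bRunning: assumed application-running flag
        //Wake up when SAPI signals a recognition event, or after 50 ms at the latest
        DWORD wait = ::WaitForSingleObject(m_hSpeechEvent, 50);
        if (wait == WAIT_OBJECT_0){
            speech_process();   //shown below
        }
        // ... other per-frame work ...
    }
}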


When the SR event triggers, process it:

//Speech processing
void ThisApp::speech_process() {
    //Confidence threshold
    const float ConfidenceThreshold = 0.3f;

    SPEVENT curEvent = { SPEI_UNDEFINED, SPET_LPARAM_IS_UNDEFINED, 0, 0, 0, 0 };
    ULONG fetched = 0;
    HRESULT hr = S_OK;
    //Fetch an event
    m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    while (fetched > 0)
    {
        //Dispatch on the event type
        switch (curEvent.eEventId)
        {
        case SPEI_RECOGNITION:
            //Make sure lParam holds an object
            if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType) {
                ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
                SPPHRASE* pPhrase = nullptr;
                //Get the recognized phrase
                hr = result->GetPhrase(&pPhrase);
                if (SUCCEEDED(hr)) {
                    // XXXXXXXXXXXXXXXXXXX (phrase handling goes here; the full version is below)
                    ::CoTaskMemFree(pPhrase);
                }
            }
            break;
        }

        m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    }

    return;
}
Still boilerplate.


When GetPhrase succeeds, you can call:

WCHAR* pwszFirstWord;
result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pwszFirstWord, nullptr);
// XXX
::CoTaskMemFree(pwszFirstWord);
to get the full recognized string.
The SPPHRASE structure is quite complex. The important member is pProperties, an SPPHRASEPROPERTY* pointing to a tree; each node carries an SREngineConfidence value for its confidence.

The confidence of a parent node represents the overall confidence of its branch, while a child node gives the confidence of its own phrase segment. Generally the parent node's confidence is the one to use.

To decide whether a recognition result is accurate, you can set a confidence threshold: only results above the threshold are treated as correct. It can be an empirical value such as 0.3 or 0.4, or it can be dynamic, adjusted for the environment or simply chosen by the player.


For example, say we recognize "We destroyed the enemy's toilet", using the SRGS grammar provided in the previous section.

pPhrase->pProperties then looks roughly like this:

    "_value"                                   <- pPhrase->pProperties (root)
        |
    "War situation"                            <- pFirstChild, the node whose confidence is checked
        |
    Subject -- Predicate -- Object -- Object   <- its children, linked through pNextSibling


The pszValue member of each node gives the value as a string; for example, for the out = 0 we set in the grammar, pszValue is "0".

vValue holds the recognized value as a VARIANT; for example, vValue.lVal reads it as a LONG. Inspect it however you like; after all, it is just data.
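
As a quick illustration, dumping a single node of the property tree might look like this (a sketch; pPhrase is the SPPHRASE obtained from GetPhrase above):

//Sketch: inspect one SPPHRASEPROPERTY node of the semantic tree
const SPPHRASEPROPERTY* pProp = pPhrase->pProperties;   //root of the tree
if (pProp != nullptr) {
    //pszName is the rule/tag name, pszValue is its value as a string
    wprintf(L"name=%s  value=%s  confidence=%f\n",
            pProp->pszName  ? pProp->pszName  : L"(null)",
            pProp->pszValue ? pProp->pszValue : L"(null)",
            pProp->SREngineConfidence);
    //vValue is a VARIANT carrying the value from the SRGS tag, e.g. out = 0
    if (pProp->vValue.vt == VT_I4) {
        wprintf(L"tag value as LONG = %ld\n", pProp->vValue.lVal);
    }
}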


How you act on the result is entirely up to you. Here is a semi-finished example:

//Speech processing
void ThisApp::speech_process() {
    //Confidence threshold
    const float ConfidenceThreshold = 0.3f;

    SPEVENT curEvent = { SPEI_UNDEFINED, SPET_LPARAM_IS_UNDEFINED, 0, 0, 0, 0 };
    ULONG fetched = 0;
    HRESULT hr = S_OK;
    //Fetch an event
    m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    while (fetched > 0)
    {
        //Dispatch on the event type
        switch (curEvent.eEventId)
        {
        case SPEI_RECOGNITION:
            //Make sure lParam holds an object
            if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType) {
                ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
                SPPHRASE* pPhrase = nullptr;
                //Get the recognized phrase
                hr = result->GetPhrase(&pPhrase);
                if (SUCCEEDED(hr)) {
#ifdef _DEBUG
                    //In DEBUG builds, print the recognized string
                    WCHAR* pwszFirstWord;
                    result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pwszFirstWord, nullptr);
                    _cwprintf(L"%s", pwszFirstWord);
                    ::CoTaskMemFree(pwszFirstWord);
#endif
                    if ((pPhrase->pProperties != nullptr) && (pPhrase->pProperties->pFirstChild != nullptr)) {
                        const SPPHRASEPROPERTY* pSemanticTag = pPhrase->pProperties->pFirstChild;
#ifdef _DEBUG
                        _cwprintf(L"   Confidence: %d%%\n", (int)(pSemanticTag->SREngineConfidence * 100.f));
#endif
                        if (pSemanticTag->SREngineConfidence > ConfidenceThreshold) {
                            speech_behavior(pSemanticTag);
                        }
                    }
                    ::CoTaskMemFree(pPhrase);
                }
            }
            break;
        }

        m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    }

    return;
}


//Voice behavior
void ThisApp::speech_behavior(const SPPHRASEPROPERTY* tag){
    if (!tag) return;
    if (!wcscmp(tag->pszName, L"War situation")){
        enum class Subject{
            US = 0,
            Enemy
        };
        enum class Predicate{
            Destroy = 0,
            Defeat,
            Breakdown
        };
        //Parsed war situation
        union Situation{
            struct{
                //Subject
                Subject subject;
                //Predicate
                Predicate predicate;
                //Object
                int object2;
                //Object
                int object;
            };
            UINT32 data[4];
        };
        Situation situation;
        auto obj = tag->pFirstChild;
        auto pointer = situation.data;
        //Fill the fields from the sibling chain
        while (obj) {
            *pointer = obj->vValue.lVal;
            ++pointer;
            obj = obj->pNextSibling;
        }
        // XXX (act on the parsed situation here)
    }
    else if (!wcscmp(tag->pszName, L"Find things")){
        //Handle the "find things" rule
    }
}

Well, that's the end of speech recognition.



It is still a console program. Please do not click the X to close it; press any key to exit instead.

Download address: click here


On face recognition and visual gestures:

If you look through the SDK, you will see that it also includes:

Kinect.Face.h

Kinect.VisualGestureBuilder.h

Of these, Kinect.VisualGestureBuilder.h only ships with x64 versions of its lib and dll. I don't know whether Microsoft was just being lazy or whether the feature genuinely requires x64.

Hopefully it is just laziness. After all, I have hardly written any 64-bit programs; 8-byte pointers feel wasteful.


For these two headers I have not found any official documentation or C++ samples, so presumably they are aimed at C# for now. When I tried them, some functions/methods simply returned an error:

path not found

For example, gesture recognition:

IVisualGestureBuilderFrameSource::AddGestures

and face recognition:

CreateHighDefinitionFaceFrameSource


All we can do is wait for SDK updates or answers from experts, so the "Kinect for Windows SDK v2.0 Development Notes" series ends here for now.

Thank you for your support. See you again when the SDK is updated.
