Template: include/reco_vosk.h Source File - Speech - SCOL Language

Template

include

/*
-----------------------------------------------------------------------------
This source file is part of OpenSpace3D
For the latest info, see http://www.openspace3d.com
 
Copyright (c) 2012 I-maginer
 
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
 
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
 
You should have received a copy of the GNU Lesser General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA, or go to
http://www.gnu.org/copyleft/lesser.txt
 
-----------------------------------------------------------------------------
*/
 
 
#ifndef __RECO_H__
#define __RECO_H__
 
#include "pluginSpeechPrerequisites.h"
#include <vosk_api.h>
 
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
 
/**
 * Fixed-capacity byte FIFO holding raw audio data.
 *
 * Bytes are appended at the end with fill() and removed from the front by the
 * consuming readers. The audio helpers interpret the stored bytes as 16-bit
 * signed PCM samples in native byte order.
 *
 * Locking: fill(), consume_safe(), getBufferCopy(), getInt16() and
 * ProcessBuffer() take the internal mutex. consume(), getData(), getCount(),
 * calculateAudioLevel() and RemoveNoise() do NOT lock (the last two are called
 * by ProcessBuffer() while the mutex is already held).
 */
class Buffer {
public:
  /**
   * Allocates the backing storage.
   * @param bufferSize capacity of the buffer, in bytes.
   */
  Buffer(size_t bufferSize) : size(bufferSize), data(new char[bufferSize]), count(0) {}

  ~Buffer()
  {
    delete[] data;
  }

  // The object owns raw storage and a mutex, so it must never be copied.
  Buffer(const Buffer&) = delete;
  Buffer& operator=(const Buffer&) = delete;

  /**
   * Appends a chunk of bytes at the end of the buffer.
   *
   * If the chunk does not fit in the remaining free space it is dropped
   * entirely; nothing is written and the stored data is left untouched.
   *
   * @param newData     bytes to append; a null pointer is ignored.
   * @param newDataSize number of bytes to append.
   */
  void fill(const char* newData, size_t newDataSize)
  {
    const std::lock_guard<std::mutex> lock(mMutex);
    if (newData == nullptr || newDataSize == 0)
      return;

    if (newDataSize > size - count)
    {
      // Buffer overflow: the incoming chunk is discarded as a whole.
      // (An alternative would be to consume older data to make room.)
      return;
    }

    std::memcpy(data + count, newData, newDataSize);
    count += newDataSize;
  }

  /**
   * Removes bytes from the front of the buffer WITHOUT locking.
   * Requests larger than the stored amount simply empty the buffer.
   * @param consumeSize number of bytes to drop.
   */
  void consume(size_t consumeSize)
  {
    if (consumeSize >= count)
    {
      count = 0;
    }
    else
    {
      // Shift the remaining bytes to the start of the storage.
      std::memmove(data, data + consumeSize, count - consumeSize);
      count -= consumeSize;
    }
  }

  /**
   * Same as consume(), performed while holding the internal mutex.
   * @param consumeSize number of bytes to drop.
   */
  void consume_safe(size_t consumeSize)
  {
    const std::lock_guard<std::mutex> lock(mMutex);
    consume(consumeSize);
  }

  /**
   * Copies up to length bytes from the front of the buffer and removes them.
   * @param buffer destination, must be able to hold length bytes.
   * @param length maximum number of bytes to copy.
   * @return the number of bytes actually copied.
   */
  size_t getBufferCopy(char* buffer, size_t length)
  {
    const std::lock_guard<std::mutex> lock(mMutex);
    const size_t cpl = std::min(length, count);
    if (cpl > 0)
      std::memcpy(buffer, data, cpl);
    consume(cpl);
    return cpl;
  }

  /**
   * Extracts 16-bit samples from the front of the buffer and removes them.
   *
   * At most count / 2 samples are available; if fewer than length samples are
   * stored, the remaining output entries are set to 0. The bytes removed from
   * the buffer match the samples copied (2 bytes per sample).
   *
   * @param int16Array destination, must be able to hold length samples;
   *                   a null pointer is ignored.
   * @param length     number of samples requested.
   */
  void getInt16(int16_t* int16Array, size_t length)
  {
    if (int16Array == nullptr)
      return;

    const std::lock_guard<std::mutex> lock(mMutex);
    const size_t available = count / sizeof(int16_t);
    const size_t copied = std::min(length, available);

    if (copied > 0)
      std::memcpy(int16Array, data, copied * sizeof(int16_t));

    // Never hand stale or uninitialised bytes back to the caller.
    std::fill(int16Array + copied, int16Array + length, static_cast<int16_t>(0));

    consume(copied * sizeof(int16_t));
  }

  /**
   * @return pointer to the start of the internal storage (not synchronised).
   */
  const char* getData() const
  {
    return data;
  }

  /**
   * @return number of bytes currently stored (not synchronised).
   */
  size_t getCount() const
  {
    return count;
  }

  /**
   * Computes the RMS level of the first bytes of the buffer, treated as
   * 16-bit signed PCM samples normalised by INT16_MAX. Does not lock.
   *
   * @param length number of bytes to analyse; clamped to the stored amount.
   * @return the RMS level, or 0 when there is no complete sample to analyse.
   */
  float calculateAudioLevel(size_t length)
  {
    const size_t numSamples = std::min(length, count) / sizeof(int16_t);
    if (numSamples == 0)
      return 0.0f; // avoids a 0/0 division (NaN)

    // Sum of squared, normalised samples.
    double sum = 0.0;
    for (size_t i = 0; i < numSamples; ++i)
    {
      int16_t raw;
      std::memcpy(&raw, data + i * sizeof(int16_t), sizeof(raw));
      const double sample = static_cast<double>(raw) / static_cast<double>(INT16_MAX);
      sum += sample * sample;
    }

    return static_cast<float>(std::sqrt(sum / static_cast<double>(numSamples)));
  }

  /**
   * Attenuates, in place, every 16-bit sample whose absolute value is below
   * noiseLevel * INT16_MAX by multiplying it by reductionFactor. Does not lock.
   *
   * Only complete samples inside the requested range are touched; a trailing
   * odd byte is left as is.
   *
   * @param length          number of bytes to process; clamped to the stored amount.
   * @param noiseLevel      threshold as a fraction of the 16-bit full scale.
   * @param reductionFactor multiplier applied to samples under the threshold.
   */
  void RemoveNoise(size_t length, float noiseLevel, float reductionFactor)
  {
    const size_t sampleSize = sizeof(int16_t);
    const float minValue = static_cast<float>(std::numeric_limits<int16_t>::min());
    const float maxValue = static_cast<float>(std::numeric_limits<int16_t>::max());

    // Threshold expressed in sample units.
    const float threshold = noiseLevel * maxValue;

    const size_t usable = std::min(length, count);
    for (size_t i = 0; i + sampleSize <= usable; i += sampleSize)
    {
      int16_t sample;
      std::memcpy(&sample, data + i, sampleSize);

      if (std::abs(static_cast<int>(sample)) < threshold)
      {
        // Keep the scaled value inside the representable range before converting back.
        const float scaled = static_cast<float>(sample) * reductionFactor;
        sample = static_cast<int16_t>(std::max(minValue, std::min(maxValue, scaled)));
        std::memcpy(data + i, &sample, sampleSize);
      }
    }
  }

  /**
   * Applies noise reduction to the front of the buffer, then either hands the
   * data to the caller or discards it when its RMS level is below threshold.
   * In both cases the processed bytes are removed from the buffer.
   *
   * @param buffer          destination, must be able to hold length bytes.
   * @param length          maximum number of bytes to process.
   * @param threshold       minimum RMS level required to return the data.
   * @param noiseLevel      forwarded to RemoveNoise() (default 0.4).
   * @param reductionFactor forwarded to RemoveNoise() (default 0.8).
   * @return number of bytes copied to buffer, 0 when the chunk was skipped.
   */
  size_t ProcessBuffer(char* buffer, size_t length, float threshold, float noiseLevel = 0.4f, float reductionFactor = 0.8f)
  {
    const std::lock_guard<std::mutex> lock(mMutex);

    const size_t cpl = std::min(length, count);
    RemoveNoise(cpl, noiseLevel, reductionFactor);
    const float audioLevel = calculateAudioLevel(cpl);

    if (audioLevel < threshold)
    {
      consume(cpl); // Too quiet: skip this chunk
      return 0;
    }

    if (cpl > 0)
      std::memcpy(buffer, data, cpl);
    consume(cpl);
    return cpl;
  }

private:
  std::mutex mMutex; // guards data and count in the locking methods
  size_t size;       // capacity in bytes
  char* data;        // owned storage of `size` bytes
  size_t count;      // number of valid bytes at the start of `data`
};
 
 
/**
 * Speech recognition object declared on top of the Vosk API types
 * (VoskModel / VoskRecognizer) and the Buffer audio FIFO.
 *
 * Only declarations are visible in this header; the notes below that describe
 * behaviour are assumptions to confirm against the implementation file.
 */
class Recognition
{
public:
  Recognition();

  /**
   * @param pathtobin presumably the directory containing the model files - TODO confirm.
   * @param lang      presumably the language identifier of the model - TODO confirm.
   */
  Recognition(std::string pathtobin, std::string lang);

  ~Recognition();

  // NOTE(review): looks like the function executed by mThread - confirm.
  void cbThread();

  /**
   * @param data   audio bytes handed to the recognizer.
   * @param lenght size of data; presumably in bytes - TODO confirm.
   */
  void fillAudioBuffer(const char* data, size_t lenght);

  bool initializeObjects();

  int getVolume();

  void setVolume(int volume);

  void AddWord(std::string s_Rule, std::string s_Word);

private:
  std::thread mThread;
  std::mutex mMutexConfig;
  bool mValid;
  VoskModel* mModel;
  VoskRecognizer* mRecognizer;
  Buffer* mBuffer;
  bool mInSpeech;
  int mTimeOutSamples;
  bool mKeySearch;
  std::vector<std::string> mKeyWords;
};
 
#endif

Project

General

Profile

Scol » Speech