pocketsphinx实现连续大词汇量语音识别

之前有个项目需求,要在客户内网实现连续大词汇量语音识别功能。由于客户的内网是独立的,不能访问互联网,所以我只能到开源社区去碰碰运气。后来在网上找到了cmusphinx(地址:http://cmusphinx.sourceforge.net/)。我们的项目是C#语言的,而pocketsphinx这个语音识别引擎是用C语言写的,Sphinx4则是Java版本。为了更好地集成到项目中,我选择了pocketsphinx,因为它在移动端和PC端都能运行。本人C++水平很差,东拼西凑总算把调用识别引擎的方法写好了,编译成DLL以后,在C#里面用DllImport来调用。从GitHub上下载项目源码后,编写下面这个C++源文件,代码如下:

#include <pocketsphinx.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <Windows.h>
#define EXPORT_DLL extern "C" __declspec(dllexport)//向c#开放此文件的cpp方法 
using  namespace std; 

//#define MODELDIR "model"


// Returns the current working directory of the process as a std::string.
// An empty string is returned when the Win32 call fails or the path does not
// fit into the local buffer.
// NOTE(review): this function has C linkage but returns a C++ std::string by
// value, so the result is an object, not a char pointer — confirm that every
// caller outside this translation unit treats it accordingly.
EXPORT_DLL string GetProgramDir()
{
    char buf[1024];
    // Call the ANSI entry point explicitly: buf is a char array, so the
    // TCHAR-based GetCurrentDirectory macro would not compile in a UNICODE build.
    DWORD len = GetCurrentDirectoryA(sizeof(buf), buf);
    // 0 means failure; a value >= the buffer size means the buffer was too
    // small and nothing usable was written.
    if (len == 0 || len >= sizeof(buf))
    {
        return string();
    }
    return string(buf, len);
}

//保存识别结果
static void write_results(char *rec_result)
{
    ofstream fResult("D:/run_path.txt",ios::app);
    fResult<<rec_result;
    fResult.close();
}

// Builds the pocketsphinx configuration for the requested language.
//   modelDir      - directory that contains the "model" folder; when NULL,
//                   the directory returned by GetProgramDir() is used.
//   language_type - 1 selects the English (en-us) model files; any other
//                   value selects the Chinese (zh) model files.
// Returns the result of cmd_ln_init(); the caller checks it against NULL and
// releases it with cmd_ln_free_r().
static cmd_ln_t* get_config(char *modelDir,int language_type)
{
    // Pick the base directory BEFORE creating a std::string from modelDir:
    // constructing a std::string from a NULL pointer is undefined behaviour.
    // (The previous code also assigned the fallback to a shadowed local, so
    // the fallback never took effect.)
    const string baseDir = (modelDir != NULL) ? string(modelDir) : GetProgramDir();

    string hmm, lm, dict;
    if (language_type == 1)
    {
        hmm  = baseDir + "/model/en-us/en-us";
        lm   = baseDir + "/model/en-us/en-us.lm.dmp";
        dict = baseDir + "/model/en-us/cmudict-en-us.dict";
    }
    else
    {
        hmm  = baseDir + "/model/zh/zh";
        lm   = baseDir + "/model/zh/zh_broadcastnews_64000_utf8.dmp";
        dict = baseDir + "/model/zh/zh_broadcastnews_utf8.dic";
    }

    // Both languages use the same three options; only the paths differ.
    return cmd_ln_init(NULL, ps_args(), TRUE,
        "-hmm", hmm.c_str(),
        "-lm", lm.c_str(),
        "-dict", dict.c_str(),
        NULL);
}

// Character-encoding conversion helpers.
//
// Converts a NUL-terminated wide string to the system ANSI code page (CP_ACP).
// Returns a malloc'ed, NUL-terminated buffer that the caller must release
// with free(), or NULL when the allocation fails. When the conversion itself
// fails (for example str is NULL) the returned string is empty.
char * UnicodeToANSI(const wchar_t* str)
{
    // First call with a NULL output buffer: returns the number of bytes
    // needed, terminator included, or 0 on failure.
    int textlen = WideCharToMultiByte(CP_ACP,0,str,-1,NULL,0,NULL,NULL);

    size_t bytes = ((size_t)textlen + 1) * sizeof(char);
    char* result = (char *)malloc(bytes);
    if (result == NULL)
    {
        // Out of memory: report it instead of writing through a NULL pointer.
        return NULL;
    }
    memset(result, 0, bytes);

    // Second call performs the conversion; skipped when there is nothing to
    // convert, which leaves the zero-filled (empty) string in place.
    if (textlen > 0)
    {
        WideCharToMultiByte(CP_ACP,0,str,-1,result,textlen,NULL,NULL);
    }
    return result;
}

// Converts a NUL-terminated UTF-8 string to a wide string.
// Returns a malloc'ed, NUL-terminated buffer that the caller must release
// with free(), or NULL when the allocation fails. When the conversion itself
// fails (for example str is NULL) the returned string is empty.
wchar_t * UTF8ToNunicode(const char* str)
{
    // First call with a NULL output buffer: returns the number of wide
    // characters needed, terminator included, or 0 on failure.
    int textlen = MultiByteToWideChar(CP_UTF8,0,str,-1,NULL,0);

    size_t bytes = ((size_t)textlen + 1) * sizeof(wchar_t);
    wchar_t* result = (wchar_t *)malloc(bytes);
    if (result == NULL)
    {
        // Out of memory: report it instead of writing through a NULL pointer.
        return NULL;
    }
    memset(result, 0, bytes);

    // Second call performs the conversion; skipped when there is nothing to
    // convert, which leaves the zero-filled (empty) string in place.
    if (textlen > 0)
    {
        MultiByteToWideChar(CP_UTF8,0,str,-1,result,textlen);
    }
    return result;
}

// Converts a NUL-terminated UTF-8 string to the system ANSI code page by
// going through an intermediate wide string.
// Returns the buffer produced by UnicodeToANSI(); the caller must release it
// with free().
char* UTF8ToANSI(const char* str)
{
    wchar_t* wide = UTF8ToNunicode(str);
    char* ansi = UnicodeToANSI(wide);
    // The wide copy is only an intermediate step; release it here so it does
    // not leak on every call. free(NULL) is a no-op.
    free(wide);
    return ansi;
}


char strResult[1024];// Holds the text returned by the most recent recognition call

// Copies text into strResult, truncating it if necessary so the fixed-size
// buffer is never overrun, and returns strResult.
static char* set_result(const char *text)
{
    strncpy(strResult, text, sizeof(strResult) - 1);
    strResult[sizeof(strResult) - 1] = '\0';
    return strResult;
}

/* Speech recognition, method one: decode a whole audio file in one utterance.
 *   languageType - 1: English recognition, 2: Chinese recognition
 *   modelDir     - directory that contains the "model" folder (forwarded to get_config)
 *   file_name    - path of the wav audio file
 * Returns a pointer to the global strResult buffer, which holds either the
 * recognised text converted to the ANSI code page, an empty string when the
 * decoder produced no hypothesis, or an error message. The buffer is shared
 * and overwritten by the next call, so the caller must not free it and
 * concurrent calls would overwrite each other's result.
 * NOTE(review): the file is fed to the decoder starting at byte 0, so a
 * RIFF/WAV header, if present, is decoded as audio samples — confirm whether
 * the header should be skipped.
 */
EXPORT_DLL char* wavfile_speech_rec(int languageType,char *modelDir,char *file_name)
{
    ps_decoder_t *ps = NULL;
    FILE *fh = NULL;
    const char *error = NULL;   // set to the message of the first step that fails
    int16 buf[512];
    int32 score;

    cmd_ln_t *config = get_config(modelDir, languageType);
    if (config == NULL)
    {
        error = "语音模型配置文件读取失败!";
    }
    else if ((ps = ps_init(config)) == NULL)
    {
        error = "解码器初始化失败!";
    }
    else if (file_name == NULL || (fh = fopen(file_name, "rb")) == NULL)
    {
        error = "不是有效的wav文件!";
    }
    else if (ps_start_utt(ps) < 0)
    {
        error = "解码失败!";
    }
    else
    {
        // Loop on the number of samples actually read rather than on feof(),
        // so a short or failed read ends the loop instead of feeding an empty
        // chunk to the decoder.
        size_t nsamp;
        while ((nsamp = fread(buf, sizeof(buf[0]), sizeof(buf) / sizeof(buf[0]), fh)) > 0)
        {
            ps_process_raw(ps, buf, nsamp, FALSE, FALSE);
        }
        if (ps_end_utt(ps) < 0)
        {
            error = "解码失败!";
        }
    }

    if (error != NULL)
    {
        set_result(error);
    }
    else
    {
        // The hypothesis is copied out before ps_free() below releases the decoder.
        char const *hyp = ps_get_hyp(ps, &score);
        if (hyp == NULL)
        {
            set_result("");
        }
        else
        {
            // Convert into a temporary heap buffer, copy it into the shared
            // result buffer and free the temporary, so every path returns the
            // same caller-does-not-free pointer and nothing is leaked.
            char *ansi = UTF8ToANSI(hyp);
            set_result(ansi != NULL ? ansi : "");
            free(ansi);
        }
    }

    // Single clean-up point: release whatever was acquired, on success and on
    // every failure path alike.
    if (fh != NULL)
    {
        fclose(fh);
    }
    if (ps != NULL)
    {
        ps_free(ps);
    }
    if (config != NULL)
    {
        cmd_ln_free_r(config);
    }
    return strResult;
}


c#中调用的代码如下:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;

namespace SpeechRec
{
    /// <summary>
    /// Managed wrapper around the native <c>pocketsphinx_speech_rec.dll</c> speech-recognition library.
    /// </summary>
    public class SpeechRecTool
    {
        /// <summary>
        /// Model directory passed to the native library when the caller does not supply one.
        /// </summary>
        private const string DefaultModelPath = @"E:\SoftWare\语音识别\狮身人面像\pocketsphinx\bin\Release\Win32";

        /// <summary>
        /// File that receives a copy of each recognition result.
        /// </summary>
        private const string ResultDumpPath = "D:\\aa.txt";

        public SpeechRecTool()
        {

        }

        /// <summary>
        /// Native export <c>GetProgramDir</c>.
        /// NOTE(review): the native function is declared to return a C++ <c>std::string</c> by value,
        /// not a character pointer, so the returned <see cref="IntPtr"/> must not be marshalled as a
        /// string — confirm the native signature before using this import.
        /// </summary>
        [DllImport("pocketsphinx_speech_rec.dll", EntryPoint = "GetProgramDir", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
        public static extern IntPtr GetProgramDir();

        /// <summary>
        /// Native export <c>wavfile_speech_rec</c>: recognises the speech in a wav file.
        /// </summary>
        /// <param name="lang">Integer value of a <see cref="LanguageType"/>.</param>
        /// <param name="modelPath">Directory that contains the <c>model</c> folder.</param>
        /// <param name="fileName">Path of the wav file.</param>
        /// <returns>Pointer to an ANSI string owned by the native library; it is not freed by this class.</returns>
        [DllImport("pocketsphinx_speech_rec.dll", EntryPoint = "wavfile_speech_rec", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
        private static extern IntPtr wavfile_speech_rec(int lang, string modelPath, string fileName);

        /// <summary>
        /// Recognises the speech in a wav file using the default model directory.
        /// </summary>
        /// <param name="language">Recognition language.</param>
        /// <param name="fileName">Path of the wav file.</param>
        /// <returns>The text returned by the native library, or an empty string when the file does not exist.</returns>
        public string WavFileSpeechRec(LanguageType language, string fileName)
        {
            return WavFileSpeechRec(language, fileName, DefaultModelPath);
        }

        /// <summary>
        /// Recognises the speech in a wav file using the given model directory.
        /// </summary>
        /// <param name="language">Recognition language.</param>
        /// <param name="fileName">Path of the wav file.</param>
        /// <param name="modelPath">Directory that contains the <c>model</c> folder; the default directory is used when null.</param>
        /// <returns>The text returned by the native library, or an empty string when the file does not exist.</returns>
        public string WavFileSpeechRec(LanguageType language, string fileName, string modelPath)
        {
            if (!System.IO.File.Exists(fileName))
            {
                Console.WriteLine("不是有效的音频文件");
                return "";
            }

            // One helper instance is enough for both the format check and the conversion.
            WavTools wTool = new WavTools();
            if (!wTool.IsStandardWavFile(fileName))
            {
                fileName = wTool.FormateWavFile(fileName);
            }

            // Never hand a null directory to the native side; fall back to the default instead.
            IntPtr intPtrResult = wavfile_speech_rec((int)language, modelPath ?? DefaultModelPath, fileName);

            // PtrToStringAnsi returns null for a zero pointer; normalise that to an empty string.
            string strResult = Marshal.PtrToStringAnsi(intPtrResult) ?? "";

            try
            {
                System.IO.File.WriteAllText(ResultDumpPath, strResult, System.Text.Encoding.GetEncoding("GB2312"));
            }
            catch (System.IO.IOException ex)
            {
                // The dump is only a convenience copy: a missing drive or a locked file
                // must not discard a recognition result that was already obtained.
                Console.WriteLine("Could not write the recognition result to " + ResultDumpPath + ": " + ex.Message);
            }
            catch (UnauthorizedAccessException ex)
            {
                Console.WriteLine("Could not write the recognition result to " + ResultDumpPath + ": " + ex.Message);
            }

            Console.WriteLine("执行结果:" + strResult);
            return strResult;
        }
    }
    /// <summary>
    /// Recognition language; its integer value is what gets passed to the native recognizer.
    /// </summary>
    public enum LanguageType
    {
        /// <summary>
        /// English (the native side loads the en-us model for value 1)
        /// </summary>
        En = 1,
        /// <summary>
        /// Chinese (the native side loads the zh model for any value other than 1)
        /// </summary>
        Zh = 2
    }
}


注意事项:音频必须是wav格式,而且格式要和它demo里面的音频一致(pocketsphinx自带模型通常要求16kHz采样率、16位、单声道PCM)。我用了NAudio开源组件来实现wav音频格式的转换;如果音频的比特率和采样率与demo里面的不一样,会导致识别率降低。

原文地址:https://www.cnblogs.com/yyq745201/p/4633628.html