语音识别CMUSphinx(3)特征信息的提取

为了能在安卓Demo中提取出有用的信息，先来参考.c源码（因为Android Demo是用JNI封装了一下.c实现的）

.c源码中所需要的声学分数提取

INFO

这里我想要的就是start、end、ascr等参数。查看continuous.c源代码，主要的逻辑如下：

// Keep looping for as long as audio can still be read from rawfd
while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) {
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
      // in_speech == TRUE means speech is being detected; FALSE means silence
        in_speech = ps_get_in_speech(ps);
      // The utterance starts
        if (in_speech && !utt_started) {
            utt_started = TRUE;
        } 
      // Speech had started and has now stopped (i.e. the silence pause at the end of a sentence was reached)
        if (!in_speech && utt_started) {
          // The debug information is printed from inside this function
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL);
            if (hyp != NULL)
            printf("%s\n", hyp);
            if (print_times)
            print_word_times();
            fflush(stdout);

            ps_start_utt(ps);
            utt_started = FALSE;
        }
    }// end of while
    // All audio has been read; output the result one more time
    ps_end_utt(ps);
    if (utt_started) {
        hyp = ps_get_hyp(ps, NULL);
        if (hyp != NULL) {
            printf("%s\n", hyp);
            if (print_times) {
            print_word_times();
        }
    }
    }

来到ps_end_utt对应行

            E_INFO("%s (%d)\n", hyp, score);
            E_INFO_NOFN("%-20s %-5s %-5s %-5s %-10s %-10s %-3s\n",
                    "word", "start", "end", "pprob", "ascr", "lscr", "lback");
            for (seg = ps_seg_iter(ps); seg;
             seg = ps_seg_next(seg)) {
                char const *word;
            int sf, ef;
            int32 post, lscr, ascr, lback;

            word = ps_seg_word(seg);
            ps_seg_frames(seg, &sf, &ef);
            post = ps_seg_prob(seg, &ascr, &lscr, &lback);
            E_INFO_NOFN("%-20s %-5d %-5d %-1.3f %-10d %-10d %-3d\n",
                            word, sf, ef, logmath_exp(ps_get_logmath(ps), post),
                        ascr, lscr, lback);
            }

对应开头的

INFO

所需要的函数就是ps_seg_frames和ps_seg_prob
https://sourceforge.net/p/cmusphinx/discussion/help/thread/dd80eb2a/
也印证了这一点

安卓环境下的 Hypothesis就是识别结果了。

不过相比于Kaldi而言，没办法识别音素对应的重音比较遗憾。
https://sourceforge.net/p/cmusphinx/discussion/sphinx4/thread/736c772a/?limit=25#d425

Android中的工作模式

onPartialResult和onResult等回调函数的工作形式则需要查看SpeechRecognizer.class里面的逻辑。该类是对Decoder.class的一层封装，添加语法等等可以在这一层进行（也可以在Decoder层进行，不过偏底层罢了，参考下一小节）。另外这个类还涉及到音频的处理：使用安卓的类AudioRecord来获取录音（AudioRecord的使用可以参考 https://blog.csdn.net/qq_36982160/article/details/79383046 ），存入buffer，再根据是否结束等等进行逻辑判断，大体和.c中的一致。

        /**
         * Recording and decoding loop of the recognizer thread.
         *
         * <p>Starts the recorder; if it reports state 1 right afterwards, the recorder is
         * stopped again and an {@code OnErrorEvent} wrapping an {@code IOException} is posted
         * to the main handler. Otherwise an utterance is started and audio buffers are read
         * and fed to the decoder until the thread is interrupted or, when a timeout is
         * configured ({@code timeoutSamples != -1}), {@code remainingSamples} runs out.
         * After the loop the recorder is stopped, the utterance is ended, pending handler
         * messages are removed, and a {@code TimeoutEvent} is posted if the loop ended
         * because of the timeout.
         *
         * @throws RuntimeException if a read from the recorder returns -1
         */
        public void run() {
            SpeechRecognizer.this.recorder.startRecording();
            // NOTE(review): 1 is presumably AudioRecord.RECORDSTATE_STOPPED (decompiled constant) -- confirm against the Android SDK
            if (SpeechRecognizer.this.recorder.getRecordingState() == 1) {
                SpeechRecognizer.this.recorder.stop();
                IOException ioe = new IOException("Failed to start recording. Microphone might be already in use.");
                SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new OnErrorEvent(ioe));
            } else {
                Log.d(SpeechRecognizer.TAG, "Starting decoding");
                SpeechRecognizer.this.decoder.startUtt();
                short[] buffer = new short[SpeechRecognizer.this.bufferSize];
              // As in the .c source, query whether speech is currently being detected
                boolean inSpeech = SpeechRecognizer.this.decoder.getInSpeech();
                // This first buffer is never passed to the decoder: the loop below overwrites it before processRaw is called
                SpeechRecognizer.this.recorder.read(buffer, 0, buffer.length);

                while(!interrupted() && (this.timeoutSamples == -1 || this.remainingSamples > 0)) {
                  // nread is the number of samples that were read
                    int nread = SpeechRecognizer.this.recorder.read(buffer, 0, buffer.length);
                    if (-1 == nread) {
                        throw new RuntimeException("error reading audio buffer");
                    }

                    if (nread > 0) {
                        SpeechRecognizer.this.decoder.processRaw(buffer, (long)nread, false, false);
                        // Notify listeners only when the in-speech state actually changes
                        if (SpeechRecognizer.this.decoder.getInSpeech() != inSpeech) {
                            inSpeech = SpeechRecognizer.this.decoder.getInSpeech();
                            SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new InSpeechChangeEvent(inSpeech));
                        }

                        // While speech is detected the timeout budget is refilled
                        if (inSpeech) {
                            this.remainingSamples = this.timeoutSamples;
                        }

                        Hypothesis hypothesis = SpeechRecognizer.this.decoder.hyp();
                      // Tracing the `false` argument shows it means recognition has not fully finished yet, i.e. the callback invoked is onPartialResult
                        SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new ResultEvent(hypothesis, false));
                    }

                    // With a timeout configured, every read consumes part of the remaining sample budget
                    if (this.timeoutSamples != -1) {
                        this.remainingSamples -= nread;
                    }
                }
              // Article author's note: after the stop here, onResult gets called (the final result is not posted in this method -- verify in the caller)
                SpeechRecognizer.this.recorder.stop();
                SpeechRecognizer.this.decoder.endUtt();
                SpeechRecognizer.this.mainHandler.removeCallbacksAndMessages((Object)null);
                if (this.timeoutSamples != -1 && this.remainingSamples <= 0) {
                    SpeechRecognizer.this.mainHandler.post(SpeechRecognizer.this.new TimeoutEvent());
                }

            }
        }

Decoder.class

从上面的逻辑可以看出，和底层音频数据以及JNI打交道的是Decoder.class，想要处理每个buffer里面储存的PCM格式音频的话可以使用getRawdata，其他的通过命令行设置的参数应该也可以通过该接口实现，参考
https://stackoverflow.com/questions/29008111/give-a-file-as-input-to-pocketsphinx-on-android

    // Create a decoder from the default configuration; the model, language
    // model and dictionary locations are passed as string options.
    Config c = Decoder.defaultConfig();
    c.setString("-hmm", "../../model/en-us/en-us");
    c.setString("-lm", "../../model/en-us/en-us.lm.dmp");
    c.setString("-dict", "../../model/en-us/cmudict-en-us.dict");
    Decoder d = new Decoder(c);

    URL testwav = new URL("file:../../test/data/goforward.wav");
    // Fixed: the quoted line had an unbalanced ')' and handed the URL itself to
    // File, which has no URL constructor; open the file via the URL's path instead.
    FileInputStream stream = new FileInputStream(new File(testwav.getPath()));

    d.startUtt();
    byte[] b = new byte[4096];
    try {
        int nbytes;
        // Feed the file to the decoder chunk by chunk until end of stream (read() < 0).
        while ((nbytes = stream.read(b)) >= 0) {
            ByteBuffer bb = ByteBuffer.wrap(b, 0, nbytes);

            // Not needed on desktop but required on android
            bb.order(ByteOrder.LITTLE_ENDIAN);

            // Reinterpret the bytes as 16-bit samples: two bytes per short.
            short[] s = new short[nbytes/2];
            bb.asShortBuffer().get(s);
            d.processRaw(s, nbytes/2, false, false);
        }
    } catch (IOException e) {
        fail("Error when reading goforward.wav" + e.getMessage());
    }
    d.endUtt();
    // Print the full hypothesis string, then each recognized segment's word.
    System.out.println(d.hyp().getHypstr());
    for (Segment seg : d.seg()) {
        System.out.println(seg.getWord());
    }
}

对齐信息调整

有时候每个单词出现的对齐信息不是从0开始的，而是和上一段录音结束时间有关，这点有点麻烦，应该怎么把这个信息重置呢？

/**
 * Get the offset of the utterance start of the current stream, helpful for stream-wide timing.
 *
 * @param acmod Acoustic model object whose utt_start_frame field is read.
 * @return The current value of acmod->utt_start_frame.
 */
int32
acmod_stream_offset(acmod_t *acmod)
{
    return acmod->utt_start_frame;
}

/**
 * Reset the current stream
 *
 * Calls fe_start_stream() on the front end held in acmod->fe and sets
 * acmod->utt_start_frame back to 0.
 *
 * @param acmod Acoustic model object to reset.
 */
void acmod_start_stream(acmod_t *acmod)
{
    fe_start_stream(acmod->fe);
    acmod->utt_start_frame = 0;
}

跟踪到

/**
 * Start a new stream on the decoder by delegating to acmod_start_stream()
 * with the decoder's acoustic model (ps->acmod).
 *
 * @param ps Decoder whose acoustic model is reset.
 * @return Always 0.
 */
int
ps_start_stream(ps_decoder_t *ps)
{
    acmod_start_stream(ps->acmod);
    return 0;
}

也就是Decoder.class中的

    /**
     * Forwards to the native layer by calling {@code PocketSphinxJNI.Decoder_startStream}
     * with this object's {@code swigCPtr} and the object itself.
     * NOTE(review): presumably the JNI binding of the C function ps_start_stream -- confirm in the generated wrapper.
     */
    public void startStream() {
        PocketSphinxJNI.Decoder_startStream(this.swigCPtr, this);
    }

在每次重新识别的时候，调用这个函数就可以重置了。然而在安卓实测的时候发现，可能是重置需要一定时间，如果在重置未完成的时候就开始识别，重置有时候会失败。于是换了最暴力的重置方式：在生成解码器的时候保存Config，之后每次重新开始新识别的时候，直接用这个最初的Config来重置解码器。

文件储存

Java class直接向JNI中指定了rawlogdir，再从C源码中查看参数保存的位置

    if (ps->rawlogdir) {
        char *logfn = string_join(ps->rawlogdir, "/",
                                  uttid, ".raw", NULL);
        FILE *rawfh;
        E_INFO("Writing raw audio file: %s\n", logfn);
        if ((rawfh = fopen(logfn, "wb")) == NULL) {
            E_ERROR_SYSTEM("Failed to open raw audio file %s", logfn);
            ckd_free(logfn);
            return -1;
        }
        ckd_free(logfn);
        acmod_set_rawfh(ps->acmod, rawfh);
    }

直接查找rawlogdir相关的引用即可。

Rawdata

        if (acmod->rawfh) {
            fwrite(prev_audio_inptr, sizeof(int16),
                   processed_samples, acmod->rawfh);
        }

此处每个单位是int16,sizeof(int16)=2

文件名格式

查找一下uttid的引用，格式如下

最终在手机上的储存是类似000000000.raw的形式。MFCC特征的文件名也是同样的格式。
文件名中的编号在每次start_utt的时候增加。在Android中，每次startListening的时候都会调用start_utt。

MFCC

一开始set的时候如下，写入了一个int的0

/*
 * Install logfh as the MFCC log file of acmod.
 *
 * Any file already stored in acmod->mfcfh is closed first. A 4-byte zero
 * (the value of rv) is then written to the new file.
 * NOTE(review): this looks like a header placeholder that is overwritten with
 * the feature count when the file is closed -- confirm against the closing code.
 *
 * Returns rv, which is always 0 here; the result of fwrite is not checked.
 */
acmod_set_mfcfh(acmod_t *acmod, FILE *logfh)
{
    int rv = 0;

    if (acmod->mfcfh)
        fclose(acmod->mfcfh);
    acmod->mfcfh = logfh;
    fwrite(&rv, 4, 1, acmod->mfcfh);
    return rv;
}

中途处理的时候如下，这里cep是个二维数组,第一维n_frames=?（每次是变化的）,第二维feat_cepsize(acmod->fcb)=13即每帧有多少个特征，总大小n

/*
 * Append n_frames frames of cepstral features to the MFCC log file.
 *
 * The number of values written is n_frames * feat_cepsize(acmod->fcb), taken
 * as one run of mfcc_t values starting at cep[0].
 * NOTE(review): this assumes the rows of cep are stored contiguously -- confirm
 * against the allocator used by the caller.
 *
 * A short write is reported through E_ERROR_SYSTEM but does not change the
 * return value, which is always 0.
 */
static int
acmod_log_mfc(acmod_t *acmod,
              mfcc_t **cep, int n_frames)
{
    /* Total number of feature values across all frames. */
    int n = n_frames * feat_cepsize(acmod->fcb);
    /* Write features. */
    if (fwrite(cep[0], sizeof(mfcc_t), n, acmod->mfcfh) != n) {
        E_ERROR_SYSTEM("Failed to write %d values to file", n);
    }
    return 0;
}

而在结束的时候，先用fseek回到文件开头，再写入特征总长度（文件当前位置减去开头rv=0这个int的大小(4)，再除以每个值的大小4，即特征值的总个数），覆盖掉一开始写入的0

    if (acmod->mfcfh) {
        long outlen;
        int32 rv;
        outlen = (ftell(acmod->mfcfh) - 4) / 4;
        /* Try to seek and write */
        if ((rv = fseek(acmod->mfcfh, 0, SEEK_SET)) == 0) {
            fwrite(&outlen, 4, 1, acmod->mfcfh);
        }
        fclose(acmod->mfcfh);
        acmod->mfcfh = NULL;
    }

因此整个MFCC文件的格式就是：4字节的文件头（开始时先写入占位的0，结束时回到文件开头改写为MFCC特征值的总个数）+ (n帧*13维) MFCC特征，每个单位都是4个字节

在以上过程之后我们就可以对特征信息、MFCC特征、原音频文件进行后续处理了。

附录

.c中用到的几个函数：
fwrite函数
 fseek函数
 ftell函数
安卓音频处理库：
https://www.cnblogs.com/lsjwzh/p/4361457.html
https://blog.csdn.net/tongsiw/article/details/51469686

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 199,830评论 5赞 468
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 83,992评论 2赞 376
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 146,875评论 0赞 331
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 53,837评论 1赞 271
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 62,734评论 5赞 360
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 48,091评论 1赞 277
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 37,550评论 3赞 390
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,217评论 0赞 254
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 40,368评论 1赞 294
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,298评论 2赞 317
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 37,350评论 1赞 329
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,027评论 3赞 315
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 38,623评论 3赞 303
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 29,706评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 30,940评论 1赞 255
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 42,349评论 2赞 346
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 41,936评论 2赞 341