在 kaldi 系统中通常会出现很多参数相关的解析,比如:
# 对于 声纹识别 中获取 ivector 特征向量使用如下脚本
steps/online/nnet2/extract_ivectors_online.sh
# 其主要的步骤如下:
# 1. 特征处理:cmvn+splice+lda
# 2. 根据特征和m(final.dubm)获得每个speaker对应的s
# 3. 根据s、m(final.dubm)、T(final.ie)得到w
# 查看 ivector 特征
copy-feats --binary=false --compress=false ark:ivector_online.1.ark ark,t:ivector_online.1.ark.txt
如上所示,copy-feats 的参数中出现了 ark:ivector_online.1.ark、ark,t:ivector_online.1.ark.txt 这样的写法(即 rspecifier / wspecifier)。为了深入理解 kaldi 的运行机制,我们先从这些参数的解析过程入手。
指令源码分析
在 src/featbin/copy-feats.cc 文件中可以看到 copy-feats 的完整实现:
// Entry point of the copy-feats tool.
//
// Expects exactly two positional arguments. If the first one classifies as an
// rspecifier, feature matrices are copied table-to-table (optionally in
// compressed form, optionally writing per-utterance frame counts); otherwise
// the two arguments are treated as plain rxfilename/wxfilename and a single
// matrix is copied.
//
// Return value: in table mode 0 if at least one matrix was copied, else 1;
// -1 if a std::exception is caught. The single-file branch has no explicit
// return statement (falls off the end of main).
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
const char *usage =
"Copy features [and possibly change format]\n"
"Usage: copy-feats [options] <feature-rspecifier> <feature-wspecifier>\n"
"or: copy-feats [options] <feats-rxfilename> <feats-wxfilename>\n"
"e.g.: copy-feats ark:- ark,scp:foo.ark,foo.scp\n"
" or: copy-feats ark:foo.ark ark,t:txt.ark\n"
"See also: copy-matrix, copy-feats-to-htk, copy-feats-to-sphinx, select-feats,\n"
"extract-feature-segments, subset-feats, subsample-feats, splice-feats, paste-feats,\n"
"concat-feats\n";
ParseOptions po(usage);
// Defaults for the command-line options registered below.
bool binary = true;
bool htk_in = false;
bool sphinx_in = false;
bool compress = false;
int32 compression_method_in = 1;
std::string num_frames_wspecifier;
// Register the command-line options; the parsed values are stored into the
// variables declared above.
po.Register("htk-in", &htk_in, "Read input as HTK features");
po.Register("sphinx-in", &sphinx_in, "Read input as Sphinx features");
po.Register("binary", &binary, "Binary-mode output (not relevant if writing "
"to archive)");
// NOTE(review): the adjacent string literals below are concatenated without
// separating spaces ("...compressed form(only ..." and "...script,output)").
po.Register("compress", &compress, "If true, write output in compressed form"
"(only currently supported for wxfilename, i.e. archive/script,"
"output)");
po.Register("compression-method", &compression_method_in,
"Only relevant if --compress=true; the method (1 through 7) to "
"compress the matrix. Search for CompressionMethod in "
"src/matrix/compressed-matrix.h.");
po.Register("write-num-frames", &num_frames_wspecifier,
"Wspecifier to write length in frames of each utterance. "
"e.g. 'ark,t:utt2num_frames'. Only applicable if writing tables, "
"not when this program is writing individual files. See also "
"feat-to-len.");
// Parse the command-line argument list.
po.Read(argc, argv);
if (po.NumArgs() != 2) {
po.PrintUsage();
exit(1);
}
int32 num_done = 0;
// NOTE(review): the integer option is cast to the enum without a range check.
CompressionMethod compression_method = static_cast<CompressionMethod>(
compression_method_in);
// Table mode is chosen when argument 1 is recognised as an rspecifier.
if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) {
// Copying tables of features.
// Input specification (rspecifier).
std::string rspecifier = po.GetArg(1);
// Output specification (wspecifier).
std::string wspecifier = po.GetArg(2);
// Constructed from the --write-num-frames value, which may be empty; it is
// only written to when that value is non-empty.
Int32Writer num_frames_writer(num_frames_wspecifier);
if (!compress) {
// Writer for uncompressed matrices.
BaseFloatMatrixWriter kaldi_writer(wspecifier);
if (htk_in) {
// Read the input table sequentially; for HTK input only the `.first`
// member of each value is written out.
SequentialTableReader<HtkMatrixHolder> htk_reader(rspecifier);
for (; !htk_reader.Done(); htk_reader.Next(), num_done++) {
kaldi_writer.Write(htk_reader.Key(), htk_reader.Value().first);
if (!num_frames_wspecifier.empty())
num_frames_writer.Write(htk_reader.Key(),
htk_reader.Value().first.NumRows());
}
} else if (sphinx_in) {
SequentialTableReader<SphinxMatrixHolder<> > sphinx_reader(rspecifier);
for (; !sphinx_reader.Done(); sphinx_reader.Next(), num_done++) {
kaldi_writer.Write(sphinx_reader.Key(), sphinx_reader.Value());
if (!num_frames_wspecifier.empty())
num_frames_writer.Write(sphinx_reader.Key(),
sphinx_reader.Value().NumRows());
}
} else {
SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
for (; !kaldi_reader.Done(); kaldi_reader.Next(), num_done++) {
kaldi_writer.Write(kaldi_reader.Key(), kaldi_reader.Value());
if (!num_frames_wspecifier.empty())
num_frames_writer.Write(kaldi_reader.Key(),
kaldi_reader.Value().NumRows());
}
}
} else {
// Same three input variants as above, but each matrix is wrapped in a
// CompressedMatrix (using compression_method) before being written.
CompressedMatrixWriter kaldi_writer(wspecifier);
if (htk_in) {
SequentialTableReader<HtkMatrixHolder> htk_reader(rspecifier);
for (; !htk_reader.Done(); htk_reader.Next(), num_done++) {
kaldi_writer.Write(htk_reader.Key(),
CompressedMatrix(htk_reader.Value().first,
compression_method));
if (!num_frames_wspecifier.empty())
num_frames_writer.Write(htk_reader.Key(),
htk_reader.Value().first.NumRows());
}
} else if (sphinx_in) {
SequentialTableReader<SphinxMatrixHolder<> > sphinx_reader(rspecifier);
for (; !sphinx_reader.Done(); sphinx_reader.Next(), num_done++) {
kaldi_writer.Write(sphinx_reader.Key(),
CompressedMatrix(sphinx_reader.Value(),
compression_method));
if (!num_frames_wspecifier.empty())
num_frames_writer.Write(sphinx_reader.Key(),
sphinx_reader.Value().NumRows());
}
} else {
SequentialBaseFloatMatrixReader kaldi_reader(rspecifier);
for (; !kaldi_reader.Done(); kaldi_reader.Next(), num_done++) {
kaldi_writer.Write(kaldi_reader.Key(),
CompressedMatrix(kaldi_reader.Value(),
compression_method));
if (!num_frames_wspecifier.empty())
num_frames_writer.Write(kaldi_reader.Key(),
kaldi_reader.Value().NumRows());
}
}
}
KALDI_LOG << "Copied " << num_done << " feature matrices.";
// Exit status 1 signals that nothing was copied.
return (num_done != 0 ? 0 : 1);
} else {
// Single-file mode: the arguments are an rxfilename and a wxfilename.
// NOTE(review): a user-supplied flag (--compress) is validated with an
// assertion here; whether it fires depends on how KALDI_ASSERT is
// configured in the build — confirm.
KALDI_ASSERT(!compress && "Compression not yet supported for single files");
if (!num_frames_wspecifier.empty())
KALDI_ERR << "--write-num-frames option not supported when writing/reading "
<< "single files.";
std::string feat_rxfilename = po.GetArg(1), feat_wxfilename = po.GetArg(2);
Matrix<BaseFloat> feat_matrix;
if (htk_in) {
Input ki(feat_rxfilename); // Doesn't look for read binary header \0B, because
// no bool* pointer supplied.
HtkHeader header; // we discard this info.
ReadHtk(ki.Stream(), &feat_matrix, &header);
} else if (sphinx_in) {
KALDI_ERR << "For single files, sphinx input is not yet supported.";
} else {
ReadKaldiObject(feat_rxfilename, &feat_matrix);
}
// --binary selects the output mode of the single written object.
WriteKaldiObject(feat_matrix, feat_wxfilename, binary);
KALDI_LOG << "Copied features from " << PrintableRxfilename(feat_rxfilename)
<< " to " << PrintableWxfilename(feat_wxfilename);
}
} catch(const std::exception &e) {
// Print the exception message and report failure.
std::cerr << e.what();
return -1;
}
}
从上述看到, 其中大多数是封装好了的 Reader/Writer 并将参数直接传入到 Reader/Writer 中进行解析。
这些 Reader/Writer 都在 src/util/kaldi-table.h 文件中声明,大多数是模板类,其具体实现则位于 src/util/kaldi-table-inl.h 文件中。以 SequentialTableReader 为例:
/// Constructs a reader and, unless `rspecifier` is empty, opens it right away.
/// A failed Open() is reported through KALDI_ERR.
template<class Holder>
SequentialTableReader<Holder>::SequentialTableReader(
    const std::string &rspecifier)
    : impl_(NULL) {
  // An empty rspecifier leaves the reader unopened; Open() can be called later.
  if (rspecifier.empty())
    return;
  const bool opened = Open(rspecifier);
  if (!opened) {
    KALDI_ERR << "Error constructing TableReader: rspecifier is " << rspecifier;
  }
}
/// Opens the reader on `rspecifier`.
///
/// The rspecifier is classified with ClassifyRspecifier() and the work is
/// delegated to the matching implementation object (archive or script). When
/// the "background" option is set, that implementation is additionally wrapped
/// in a SequentialTableReaderBackgroundImpl.
///
/// @return true on success; false if the rspecifier is invalid or the
///         implementation object could not be opened.
template<class Holder>
bool SequentialTableReader<Holder>::Open(const std::string &rspecifier) {
  // Re-opening an already open reader: it must be closed first.
  if (IsOpen() && !Close()) {
    KALDI_ERR << "Could not close previously open object.";
  }
  // Per the original note, impl_ is expected to be NULL from here on.

  RspecifierOptions parsed_opts;
  const RspecifierType kind =
      ClassifyRspecifier(rspecifier, NULL, &parsed_opts);

  // Pick the implementation that matches the rspecifier type.
  if (kind == kArchiveRspecifier) {
    impl_ = new SequentialTableReaderArchiveImpl<Holder>();
  } else if (kind == kScriptRspecifier) {
    impl_ = new SequentialTableReaderScriptImpl<Holder>();
  } else {
    KALDI_WARN << "Invalid rspecifier " << rspecifier;
    return false;
  }

  const bool base_opened = impl_->Open(rspecifier);
  if (!base_opened) {
    // The sub-object will have printed warnings.
    delete impl_;
    impl_ = NULL;
    return false;
  }

  if (!parsed_opts.background)
    return true;

  // Wrap the opened implementation in the background reader. The rxfilename
  // is ignored in this Open() call; it should only return false on code error.
  impl_ = new SequentialTableReaderBackgroundImpl<Holder>(impl_);
  return impl_->Open("");
}
这里以 SequentialTableReaderArchiveImpl 为例:
template<class Holder> class SequentialTableReaderArchiveImpl:
public SequentialTableReaderImplBase<Holder> {
public:
typedef typename Holder::T T;
// Default constructor: the reader starts in the kUninitialized state; the
// constructor body itself does nothing else.
SequentialTableReaderArchiveImpl(): state_(kUninitialized) { }
// Opens the archive named by `rspecifier` and advances to its first entry.
//
// The rspecifier is parsed with ClassifyRspecifier(): the filename part is
// stored in archive_rxfilename_ and the options in opts_. The stream is then
// opened in binary or text mode depending on Holder::IsReadInBinary(), and
// Next() is called once.
//
// Returns true on success (state_ is then kHaveObject or kEof); returns false,
// with state_ reset to kUninitialized, if the stream cannot be opened or the
// first Next() ends in kError.
virtual bool Open(const std::string &rspecifier) {
  // Something is still open from a previous call: close it first.
  // (Call Close() yourself beforehand to suppress the exception below.)
  if (state_ != kUninitialized && !Close()) {
    if (opts_.permissive) {
      KALDI_WARN << "Error closing previous input "
                    "(only warning, since permissive mode).";
    } else {
      KALDI_ERR << "Error closing previous input.";
    }
  }

  rspecifier_ = rspecifier;
  // Split the rspecifier into its filename (archive_rxfilename_) and its
  // options (opts_).
  RspecifierType rs =
      ClassifyRspecifier(rspecifier, &archive_rxfilename_, &opts_);
  KALDI_ASSERT(rs == kArchiveRspecifier);

  // Passing NULL means: don't expect a binary-mode header.
  const bool stream_opened = Holder::IsReadInBinary()
      ? input_.Open(archive_rxfilename_, NULL)
      : input_.OpenTextMode(archive_rxfilename_);
  if (!stream_opened) {
    KALDI_WARN << "Failed to open stream "
               << PrintableRxfilename(archive_rxfilename_);
    state_ = kUninitialized;  // Failure on Open.
    return false;             // User should print the error message.
  }

  // Load the first entry so that the reader is positioned on an object.
  state_ = kFileStart;
  Next();
  if (state_ != kError) {
    KALDI_ASSERT(state_ == kHaveObject || state_ == kEof);
    return true;
  }

  KALDI_WARN << "Error beginning to read archive file (wrong filename?): "
             << PrintableRxfilename(archive_rxfilename_);
  input_.Close();
  state_ = kUninitialized;
  return false;
}
....
整体看下来,基本是由 ClassifyRspecifier 来进行解析相应参数的,因此可以专注查看 ClassifyRspecifier 实现,就能明白这些参数是干啥的了。
在 src/util/kaldi-table.cc 文件中,我们看到 ClassifyRspecifier 相关实现如下:
/// Classifies an rspecifier and optionally extracts its filename and options.
///
/// An rspecifier has the form "<options>:<rxfilename>". <options> is a
/// comma-separated list (no spaces) that must contain exactly one of "ark" or
/// "scp", plus any number of the following modifiers:
///
///   o  / no    sets opts->once          to true / false
///   p  / np    sets opts->permissive    to true / false
///   s  / ns    sets opts->sorted        to true / false
///   cs / ncs   sets opts->called_sorted to true / false
///   bg         sets opts->background    to true
///   b, t       accepted but ignored, so that the same option string can be
///              used for rspecifiers and wspecifiers
///
/// Examples:
///
///   ark:rxfilename           ->  kArchiveRspecifier
///   scp:rxfilename           ->  kScriptRspecifier
///   b,ark:rxfilename         ->  kArchiveRspecifier
///   t,ark:rxfilename         ->  kArchiveRspecifier
///   o,b,np,ark:rxfilename    ->  kArchiveRspecifier
///   b,scp:rxfilename         ->  kScriptRspecifier
///   t,no,s,scp:rxfilename    ->  kScriptRspecifier
///   t,ns,scp:rxfilename      ->  kScriptRspecifier
///
/// Improperly formed rspecifiers are classified as kNoRspecifier: no ':',
/// trailing whitespace, an unknown or empty option, "ark"/"scp" repeated or
/// combined, or neither of them present.
///
/// @param rspecifier  The string to classify.
/// @param wxfilename  May be NULL. Otherwise it is cleared first and, on
///                    success, receives everything after the first ':'.
///                    (Despite its name this is the filename to read from.)
/// @param opts        May be NULL. Otherwise it is reset to a default
///                    RspecifierOptions and then updated by each recognised
///                    modifier; it may have been partially updated even when
///                    kNoRspecifier is returned.
/// @return kArchiveRspecifier, kScriptRspecifier or kNoRspecifier.
RspecifierType ClassifyRspecifier(const std::string &rspecifier,
                                  std::string *wxfilename,
                                  RspecifierOptions *opts) {
  if (wxfilename) wxfilename->clear();

  if (opts != NULL)
    *opts = RspecifierOptions();  // Make sure all the defaults are as in the
                                  // default constructor of the options class.

  size_t pos = rspecifier.find(':');
  if (pos == std::string::npos) return kNoRspecifier;

  // Trailing space disallowed. The string is non-empty here (it contains
  // ':'). The cast is required because calling isspace() with a negative
  // char value is undefined behaviour.
  if (isspace(static_cast<unsigned char>(*(rspecifier.rbegin()))))
    return kNoRspecifier;

  std::string before_colon(rspecifier, 0, pos),
      after_colon(rspecifier, pos+1);

  // Split the part before ':' on ','. Only the comma is a delimiter: a space
  // must not separate options. false == don't omit empty strings between
  // commas, so that e.g. "ark,,t" is rejected below.
  std::vector<std::string> split_first_part;
  SplitStringToVector(before_colon, ",", false, &split_first_part);

  RspecifierType rs = kNoRspecifier;

  for (size_t i = 0; i < split_first_part.size(); i++) {
    const std::string &str = split_first_part[i];  // e.g. "b", "t", "ark",
                                                   // "scp".
    if (str == "b" || str == "t") {
      // Ignored: these exist so that the same specifiers can be used for
      // rspecifiers and wspecifiers.
    } else if (str == "o") {
      if (opts) opts->once = true;
    } else if (str == "no") {
      if (opts) opts->once = false;
    } else if (str == "p") {
      if (opts) opts->permissive = true;
    } else if (str == "np") {
      if (opts) opts->permissive = false;
    } else if (str == "s") {
      if (opts) opts->sorted = true;
    } else if (str == "ns") {
      if (opts) opts->sorted = false;
    } else if (str == "cs") {
      if (opts) opts->called_sorted = true;
    } else if (str == "ncs") {
      if (opts) opts->called_sorted = false;
    } else if (str == "bg") {
      if (opts) opts->background = true;
    } else if (str == "ark") {
      if (rs != kNoRspecifier)
        return kNoRspecifier;  // Repeated or combined ark and scp options
                               // invalid.
      rs = kArchiveRspecifier;
    } else if (str == "scp") {
      if (rs != kNoRspecifier)
        return kNoRspecifier;  // Repeated or combined ark and scp options
                               // invalid.
      rs = kScriptRspecifier;
    } else {
      return kNoRspecifier;  // Could not interpret this option.
    }
  }

  if ((rs == kArchiveRspecifier || rs == kScriptRspecifier)
      && wxfilename != NULL)
    *wxfilename = after_colon;
  return rs;
}
注意:
1. 在 kaldi 中 Reader/Writer 都是以模板的形式实现的,模板参数是一个 Holder。这个 Holder 可以简单看作负责 对象 与 流(二进制或文本形式)之间的相互转换,Reader/Writer 再借助它完成相应的读取和写入操作
2. Holder 的实现多在 src/util/kaldi-holder.h 中定义,然后在 src/util/kaldi-holder-inl.h 文件中进行实现
3. 对外使用的具体类型则是在 src/util/table-types.h 中通过 typedef 定义的,外部代码直接使用这些类型名即可(例如上文的 BaseFloatMatrixWriter、SequentialBaseFloatMatrixReader)。
4. 对于实际进行读写操作的 Input/Output 类,则是在 src/util/kaldi-io.cc 文件中进行实现的,其中包含了比如 pipe/file/std 等方式的读写操作,输出类型如下:
// Kinds of output destination that a wxfilename can be classified as
// (the result type of ClassifyWxfilename).
enum OutputType {
// Invalid filename (e.g. leading or trailing space, or something that looks
// like a wspecifier/rspecifier).
kNoOutput,
// A normal filename.
kFileOutput,
// The empty string or "-", interpreted as standard output.
kStandardOutput,
// A pipe to write to.
kPipeOutput
};
而 读类型如下:
// Kinds of input source that an rxfilename can be classified as.
enum InputType {
// Presumably returned for invalid filenames (mirrors kNoOutput) — confirm.
kNoInput,
// A normal filename.
kFileInput,
// "-" or the empty string, interpreted as standard input.
kStandardInput,
// A file plus a position to seek to, e.g. "/some/filename:12345".
kOffsetFileInput,
// A pipe to read from, e.g. "some command |".
kPipeInput
};
/// ClassifyWxfilename interprets filenames as follows:
/// - kNoOutput: invalid filenames (leading or trailing space, things that look
/// like wspecifiers and rspecifiers, or like pipes to read from, i.e. with a
/// trailing |).
/// - kFileOutput: Normal filenames
/// - kStandardOutput: The empty string or "-", interpreted as standard output
/// - kPipeOutput: pipes, e.g. "| gzip -c > some_file.gz"
最后, 总结下 Rspecifier 参数相关的说明:
// Documentation for "rspecifier"
// "rspecifier" describes how we read a set of objects indexed by keys.
// The possibilities are:
//
// ark:rxfilename
// scp:rxfilename
//
// We also allow various modifiers:
// o means the program will only ask for each key once, which enables
// the reader to discard already-asked-for values.
// s means the keys are sorted on input (means we don't have to read till
// eof if someone asked for a key that wasn't there).
// cs means that it is called in sorted order (we are generally asserting
// this based on knowledge of how the program works).
// p means "permissive", and causes it to skip over keys whose corresponding
// scp-file entries cannot be read. [and to ignore errors in archives and
// script files, and just consider the "good" entries].
// We allow the negation of the options above, as in no, ns, np,
// but these aren't currently very useful (just equivalent to omitting the
// corresponding option).
// [any of the above options can be prefixed by n to negate them, e.g. no,
// ns, ncs, np; but these aren't currently useful as you could just omit
// the option].
// bg means "background". It currently has no effect for random-access readers,
// but for sequential readers it will cause it to "read ahead" to the next
// value, in a background thread. Recommended when reading larger objects
// such as neural-net training examples, especially when you want to
// maximize GPU usage.
//
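// b is ignored [for scripting convenience]; it only has an effect in
// wspecifiers, where it sets opts->binary = true
// t is ignored [for scripting convenience]; it only has an effect in
// wspecifiers, where it sets opts->binary = false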
//
//
// So for instance the following would be a valid rspecifier:
//
// "o,s,p,ark:gunzip -c foo.gz|"
最后再总结一下 rxfilename 与 wxfilename 的写法:
- rxfilename的类型如下:
1. “-”或“” 表示标准输入
2. “some command |” 表示一个输入管道命令,i.e.我们去掉管道符“|”,把剩下的字符串通过popen()传入shell
3. “/some/filename:12345” 表示带偏移量的文件,i.e.我们打开文件并定位(seek)至偏移量 12345 处
4. “/some/filename”... 与以上不匹配的模式都会被当做普通的文件名(当然,一些明显的错误会被检测出来,在它们被打开之前)
- wxfilename的类型如下:
1. “-”或“” 表示标准输出
2. “| some command” 表示一个输出管道命令,i.e.我们去掉管道符“|”,把剩下的字符串通过popen()传入shell
3. “/some/filename”... 与以上不匹配的模式都会被当做普通的文件名(当然,会检测并过滤掉明显的错误)