目的:结合多方资料以及个人理解,记录对 fastText 源码的剖析。
https://heleifz.github.io/14732610572844.html
http://www.cnblogs.com/peghoty/p/3857839.html
一:代码总体模块关联图:
核心模块是 fasttext.cc 和 model.cc,但辅助模块同样重要:它们是代码里的"螺丝钉",决定了数据采用什么样的数据结构来组织。这部分内容值得学习借鉴,而且可以看到其中存储训练数据所用的结构都是比较常用的手段;后期可以对比多个项目的源码,比较它们存储训练数据的结构。
部分:螺丝钉代码的剖析
二:dictionary模块
1 /** 2 * Copyright (c) 2016-present, Facebook, Inc. 3 * All rights reserved. 4 * 5 * This source code is licensed under the BSD-style license found in the 6 * LICENSE file in the root directory of this source tree. An additional grant 7 * of patent rights can be found in the PATENTS file in the same directory. 8 */ 9 10 #include "dictionary.h" 11 12 #include <assert.h> 13 14 #include <iostream> 15 #include <algorithm> 16 #include <iterator> 17 #include <unordered_map> 18 19 namespace fasttext { 20 21 const std::string Dictionary::EOS = "</s>"; 22 const std::string Dictionary::BOW = "<"; 23 const std::string Dictionary::EOW = ">"; 24 25 Dictionary::Dictionary(std::shared_ptr<Args> args) { 26 args_ = args; 27 size_ = 0; 28 nwords_ = 0; 29 nlabels_ = 0; 30 ntokens_ = 0; 31 word2int_.resize(MAX_VOCAB_SIZE);//建立全词的索引,hash值在0~MAX_VOCAB_SIZE-1之间 32 for (int32_t i = 0; i < MAX_VOCAB_SIZE; i++) { 33 word2int_[i] = -1; 34 } 35 } 36 //根据字符串,进行hash,hash后若是冲突则线性探索,找到其对应的hash位置 37 int32_t Dictionary::find(const std::string& w) const { 38 int32_t h = hash(w) % MAX_VOCAB_SIZE; 39 while (word2int_[h] != -1 && words_[word2int_[h]].word != w) { 40 h = (h + 1) % MAX_VOCAB_SIZE; 41 } 42 return h; 43 } 44 //向words_添加词,词可能是标签词 45 void Dictionary::add(const std::string& w) { 46 int32_t h = find(w); 47 ntokens_++;//已处理的词 48 if (word2int_[h] == -1) { 49 entry e; 50 e.word = w; 51 e.count = 1; 52 e.type = (w.find(args_->label) == 0) ? 
entry_type::label : entry_type::word;//与给出标签相同,则表示标签词 53 words_.push_back(e); 54 word2int_[h] = size_++; 55 } else { 56 words_[word2int_[h]].count++; 57 } 58 } 59 //返回纯词个数--去重 60 int32_t Dictionary::nwords() const { 61 return nwords_; 62 } 63 //标签词个数---去重 64 int32_t Dictionary::nlabels() const { 65 return nlabels_; 66 } 67 //返回已经处理的词数---可以重复 68 int64_t Dictionary::ntokens() const { 69 return ntokens_; 70 } 71 //获取纯词的ngram 72 const std::vector<int32_t>& Dictionary::getNgrams(int32_t i) const { 73 assert(i >= 0); 74 assert(i < nwords_); 75 return words_[i].subwords; 76 } 77 //获取纯词的ngram,根据词串 78 const std::vector<int32_t> Dictionary::getNgrams(const std::string& word) const { 79 int32_t i = getId(word); 80 if (i >= 0) { 81 return getNgrams(i); 82 } 83 //若是该词没有被入库词典中,未知词,则计算ngram 84 //这就可以通过其他词的近似ngram来获取该词的ngram 85 std::vector<int32_t> ngrams; 86 computeNgrams(BOW + word + EOW, ngrams); 87 return ngrams; 88 } 89 //是否丢弃的判断标准---这是由于无用词会出现过多的词频,需要被丢弃, 90 bool Dictionary::discard(int32_t id, real rand) const { 91 assert(id >= 0); 92 assert(id < nwords_); 93 if (args_->model == model_name::sup) return false;//非词向量不需要丢弃 94 return rand > pdiscard_[id]; 95 } 96 //获取词的id号 97 int32_t Dictionary::getId(const std::string& w) const { 98 int32_t h = find(w); 99 return word2int_[h]; 100 } 101 //词的类型 102 entry_type Dictionary::getType(int32_t id) const { 103 assert(id >= 0); 104 assert(id < size_); 105 return words_[id].type; 106 } 107 //根据词id获取词串 108 std::string Dictionary::getWord(int32_t id) const { 109 assert(id >= 0); 110 assert(id < size_); 111 return words_[id].word; 112 } 113 //hash规则 114 uint32_t Dictionary::hash(const std::string& str) const { 115 uint32_t h = 2166136261; 116 for (size_t i = 0; i < str.size(); i++) { 117 h = h ^ uint32_t(str[i]); 118 h = h * 16777619; 119 } 120 return h; 121 } 122 //根据词计算其ngram情况 123 void Dictionary::computeNgrams(const std::string& word, 124 std::vector<int32_t>& ngrams) const { 125 for (size_t i = 0; i < word.size(); i++) { 126 
std::string ngram; 127 if ((word[i] & 0xC0) == 0x80) continue; 128 for (size_t j = i, n = 1; j < word.size() && n <= args_->maxn; n++) {//n-1个词背景 129 ngram.push_back(word[j++]); 130 while (j < word.size() && (word[j] & 0xC0) == 0x80) { 131 ngram.push_back(word[j++]); 132 } 133 if (n >= args_->minn && !(n == 1 && (i == 0 || j == word.size()))) { 134 int32_t h = hash(ngram) % args_->bucket;//hash余数值 135 ngrams.push_back(nwords_ + h); 136 } 137 } 138 } 139 } 140 //初始化ngram值 141 void Dictionary::initNgrams() { 142 for (size_t i = 0; i < size_; i++) { 143 std::string word = BOW + words_[i].word + EOW; 144 words_[i].subwords.push_back(i); 145 computeNgrams(word, words_[i].subwords); 146 } 147 } 148 //读取词 149 bool Dictionary::readWord(std::istream& in, std::string& word) const 150 { 151 char c; 152 std::streambuf& sb = *in.rdbuf(); 153 word.clear(); 154 while ((c = sb.sbumpc()) != EOF) { 155 if (c == ' ' || c == ' ' || c == ' ' || c == ' ' || c == 'v' || c == 'f' || c == '