

第一部分 理论基础



        分词结果1: w1:有/ 意见/ 分歧/

        分词结果2: w2:有意/ 见/ 分歧/

        最大概率分词就是要求得 Max(P(w1|s),P(w2|s)) 。


                P(w|s)=P(s|w)P(w)/P(s)                                                                      (公式1)



                P(w)=P(w1,w2,…,wn)=P(w1)P(w2)…P(wn)                                   (公式2)














                P`(wi)=P`(wi-1)P(wi)                                             (公式3)


第二部分 算法实现



























                P`(意见) > P`(见)

        (4) “分歧”是尾词,“意见”是“分歧”的最佳前趋词,分词过程结束。

第三部分 结果展示


第四部分 源码



#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <map>
#include <cstdlib>

using namespace std;

/*
 * 词典的定义,用于最大概率分词
 */
/*
 * Word-frequency dictionary used by the segmenters.
 *
 * The constructor reads "dict_3.txt" from the working directory, takes the
 * first whitespace-delimited token of every line as a word and counts how many
 * lines each word appears on. findWord() returns that count.
 *
 * NOTE(review): this copy of the file had lost the access specifiers, the
 * closing braces, the constructor header, the open-failure check and the end
 * of the read loop. They are reconstructed here from the surviving statements;
 * compare against the original source before relying on details.
 */
class Dictionary{
	private:
		std::string strline;			// current line of the dictionary file
		std::string word;			// first token of the current line
		std::map<std::string, int> word_map;	// word -> number of lines it appeared on

	public:
		long size;			// number of distinct words loaded
		long freq_all;			// total number of word entries read
		long arr_1[20];			// zero-initialised; see the note in the constructor
		double arr_2[20];		// arr_1[i] / freq_all
		Dictionary();			// loads the dictionary file
		int findWord(std::string word);	// count for word, or -1 when it is unknown
};

/*
 * Loads the dictionary. Prints "open file error !" to stderr and terminates
 * the process when "dict_3.txt" cannot be opened.
 */
Dictionary::Dictionary(){
	freq_all = 0;
	for(int i = 0; i < 20; i++){
		arr_1[i] = 0;
		arr_2[i] = 0.0;
	}

	// Read-only stream: the default fstream mode (in|out) fails on files the
	// process may read but not write.
	std::ifstream fin("dict_3.txt");
	if(!fin){
		std::cerr << "open file error !" << std::endl;
		std::exit(-1);	// NOTE(review): exit status reconstructed — confirm
	}

	while(std::getline(fin, strline)){
		std::istringstream istr(strline);
		word.clear();
		if(!(istr >> word)){
			continue;	// blank line: nothing to count
		}
		++word_map[word];
		// Callers divide by freq_all, so it has to count the entries read.
		// NOTE(review): the original increment was lost — confirm.
		++freq_all;
	}

	size = (long)word_map.size();

	// NOTE(review): no surviving statement fills arr_1, so arr_2 stays zero.
	// Restore the missing update from the original if these tables are used.
	for(int i = 0; i < 20; i++){
		arr_2[i] = freq_all > 0 ? (double)arr_1[i]/freq_all : 0.0;
	}
}

/*
 * Looks up word in the dictionary.
 * Returns the number of times it was read from the file, or -1 if absent.
 */
int Dictionary::findWord(std::string word){
	std::map<std::string, int>::const_iterator p_cur = word_map.find(word);
	if(p_cur != word_map.end()){
		return p_cur->second;
	}
	return -1;
}


#include <cmath>
#include <string>
#include <iostream>
#include "dictionary_2.h"

const short MaxWordLength = 20;	// Longest candidate word tried, in bytes (lengths are stepped two bytes at a time)
const char Separator = '/';     // Word-boundary marker written after each segmented word

Dictionary word_dict;           // Global dictionary queried by the segmentation functions in this file
/*
 * 类定义:候选词的结构
 */
/*
 * One candidate word found in the input string, plus the bookkeeping needed to
 * pick the cheapest chain of candidates covering the whole string.
 *
 * NOTE(review): `public:` and the closing `};` were missing from this copy and
 * are restored. The default constructor is new: it keeps bestPrev from being
 * copied while still uninitialised.
 */
class Candidate{
	public:
		short pos;	// byte offset of the candidate in the input string
		short length;	// length of the candidate in bytes
		short bestPrev;	// index of the best predecessor candidate, -1 when there is none
		float fee;	// cost of this candidate on its own
		float sumFee;	// accumulated cost of the cheapest path ending here
		std::string word;	// the candidate text
		int freq;	// dictionary frequency (int rather than short to avoid overflow)

		Candidate() : pos(0), length(0), bestPrev(-1), fee(0.0f), sumFee(0.0f), freq(0) {}
};

/*
 * 函数功能:取出字符串中的所有候选词
 * 函数输入:字符串的引用
 * 函数输出:该字符串中含有的所有存在于词典中的词(或者单字,单字可以不在词典中)
 */
vector<Candidate> getTmpWords(const string &s){
	int freq = 0;			//词典中词的频率
	short n = s.length();		//字符串的长度
	string word = "";		//存放候选词
	Candidate cand;			//存放候选词属性
	vector<Candidate> vec_cd;	//候选词队列

	for(short i = 0; i < n; i += 2){
		//词的长度为 1~MaxWordLength/2 个汉字
		for(short len = 2; len <= MaxWordLength; len += 2){
			word = s.substr(i, len);
			freq = word_dict.findWord(word);//去词典中查找出现频率
			if(len > 2 && freq == -1){
			if(freq == -1){
				freq = 0;
			cand.pos = i;			//该候选词在汉字串中的起点
			cand.length = len;		//该候选词的长度
			cand.word = word;
			cand.fee = -log((double)(freq*1 + 1)/word_dict.freq_all);//该候选词的费用
			cand.sumFee = 0.0f;		//该候选词的累计费用置初值
			cand.freq = freq;

	return vec_cd;

/*
 * 函数功能:获取最佳前趋词序号
 * 函数输入:候选词列表的引用
 * 函数输出:无
 */
void getPrew(vector<Candidate> &vec_cd){
	short min_id = -1;				//最佳前趋词编号
	short j = -1;
	short size = (short)vec_cd.size();		//计算队列长度
	for(short i = 0; i < size; i++){
		if(vec_cd[i].pos == 0){
			vec_cd[i].bestPrev = -1;	//无前趋词
			vec_cd[i].sumFee = vec_cd[i].fee;	//累计费用为该词本身费用
			min_id = -1;			//初始化最佳前趋词编号
			j = i - 1;			//从当前对象向左找
			while(j >= 0){
				if(vec_cd[j].pos + vec_cd[j].length == vec_cd[i].pos){
					if(min_id == -1 || vec_cd[j].sumFee < vec_cd[min_id].sumFee){
						min_id = j;

			vec_cd[i].bestPrev = min_id;	//登记最佳前趋编号
			vec_cd[i].sumFee = vec_cd[i].fee + vec_cd[min_id].sumFee;//登记最小累计费用

/*
 * 函数功能:最大概率法分词
 * 函数输入:待切分的字符串
 * 函数输出:切分好的字符串
 */
string segmentSentence_MP(string s1){
	short len = s1.length();
	short min_id = -1;		//最小费用路径的终点词的序号
	vector<Candidate> vec_cd = getTmpWords(s1);


	short n = (short)vec_cd.size();
	for(short i = 0; i < n; i++){
		if(vec_cd[i].pos + vec_cd[i].length == len){
			if(min_id == -1 || vec_cd[i].sumFee < vec_cd[min_id].sumFee){
				min_id = i;

	string s2 = "";		//输出串初始化
	for(short i = min_id; i >= 0; i = vec_cd[i].bestPrev){
		s2 = s1.substr(vec_cd[i].pos, vec_cd[i].length) + Separator + s2;
	return s2;

/*
 * 函数功能:对字符串用最大匹配算法(正向)处理
 * 函数输入:汉字字符串
 * 函数输出:分好词的字符串
 */
string segmentSentence_1(string s1){
	string s2 = "";		//用s2存放分词结果
		int len = s1.length();	//取输入串长度
		if(len > MaxWordLength){
			len = MaxWordLength;	//仅仅在最大词长范围内进行处理
		string w = s1.substr(0, len);
		int n = word_dict.findWord(w);	//在词典中查找对应的词
		while(len > 2 && n == -1){
			len -= 2;	//从候选词右边减掉一个汉字。将剩下的部分作为候选词
			w = s1.substr(0, len);
			n = word_dict.findWord(w);

		s2 = s2 + w + Separator;
		s1 = s1.substr(w.length(), s1.length() - w.length());
	return s2;

/*
 * 函数功能:对字符串用最大匹配算法(逆向)处理
 * 函数输入:汉字字符串
 * 函数输出:分好词的字符串
 */
string segmentSentence_2(string s1){
	string s2 = "";		//用s2存放分词结果
		int len = s1.length();	//取输入串长度
		if(len > MaxWordLength){
			len = MaxWordLength;	//仅仅在最大词长范围内进行处理
		string w = s1.substr(s1.length() - len, len);
		int n = word_dict.findWord(w);	//在词典中查找对应的词
		while(len > 2 && n == -1){
			len -= 2;	//从候选词左边减掉一个汉字,将剩下的部分作为候选词
			w = s1.substr(s1.length() - len, len);
			n = word_dict.findWord(w);

		w = w + Separator;
		s2 = w + s2;
		s1 = s1.substr(0, s1.length() - len);
	return s2;


#include <cstdlib>
#include <vector>
#include <iomanip>
#include <map>
#include <algorithm>
#include <sys/time.h>
#include <sys/stat.h>
#include "segmentwords.cpp"

const long MaxCount = 50000;	// Upper bound on the number of sentences main() processes; a shorter input simply ends the read loop at end of file

/*
 * Returns the current wall-clock time in milliseconds, taken from
 * gettimeofday(). Intended for measuring elapsed time by subtraction.
 *
 * NOTE(review): the closing brace was missing from this copy. On platforms
 * where long is 32 bits the millisecond count does not fit; the return type is
 * kept for existing callers.
 */
long getCurrentTime(){
	struct timeval tv;
	gettimeofday(&tv, NULL);
	const long seconds_as_ms = (long)tv.tv_sec*1000;
	const long micros_as_ms = (long)tv.tv_usec/1000;
	return seconds_as_ms + micros_as_ms;
}

/*
 * Returns the size in bytes of the file at file_path.
 *
 * When stat() fails (for example, the file does not exist) the result is
 * (unsigned long)-1, i.e. the largest unsigned long value.
 *
 * NOTE(review): the `else` and the closing braces were missing from this copy
 * and are restored.
 */
unsigned long getFileSize(std::string file_path){
	struct stat statbuff;
	if(stat(file_path.c_str(), &statbuff) < 0){
		return (unsigned long)-1;
	}
	return (unsigned long)statbuff.st_size;
}

/*
 * 函数功能:对句子进行最大匹配法处理,包括对特殊字符的处理
 * 函数输入:1.含有汉字、英文符号的字符串
 *         2.flag=1调用正向最大匹配算法,flag=2调用逆向最大匹配算法,flag=3调用最大概率分词算法
 * 函数输出:分好词的字符串
 */
string SegmentSentenceMM(string s1, int flag){
	string s2 = "";	//用s2存放分词结果
	int i;
	int dd;
		unsigned char ch = (unsigned char)s1[0];
		if(ch < 128){
			i = 1;
			dd = s1.length();

			while(i < dd && ((unsigned char)s1[i] < 128) && (s1[i] != 10) && (s1[i] != 13)){

			if(i == 1 && (ch == 10 || ch == 13)){
				s2 += s1.substr(0, i);
				s2 += s1.substr(0, i) + Separator;
			s1 = s1.substr(i, dd);
			if(ch < 176){
				i = 0;
				dd = s1.length();
				while(i < dd && ((unsigned char)s1[i] < 176) && ((unsigned char)s1[i] >= 161)
					&& (!((unsigned char)s1[i] == 161 && ((unsigned char)s1[i+1] >= 162 && (unsigned char)s1[i+1] <= 168)))
					&& (!((unsigned char)s1[i] == 161 && ((unsigned char)s1[i+1] >= 171 && (unsigned char)s1[i+1] <= 191)))
					&& (!((unsigned char)s1[i] == 163 && ((unsigned char)s1[i+1] == 161 || (unsigned char)s1[i+1] == 168
					||   (unsigned char)s1[i+1] == 169 || (unsigned char)s1[i+1] == 172 || (unsigned char)s1[i+1] == 186 
					||   (unsigned char)s1[i+1] == 187 || (unsigned char)s1[i+1] == 191)))){
					i = i + 2;
				if(i == 0){
					i = i + 2;

				s2 += s1.substr(0, i) + Separator;

				s1 = s1.substr(i, dd);
		i = 2;
		dd = s1.length();
		while(i < dd && (unsigned char)s1[i] >= 176){
			i += 2;

		if(flag == 1){
			s2 += segmentSentence_1(s1.substr(0, i));
		}else if(flag == 2){
			s2 += segmentSentence_2(s1.substr(0, i));
		}else if(flag == 3){
			s2 += segmentSentence_MP(s1.substr(0, i));

		s1 = s1.substr(i, dd); 

	return s2;

/*
 * 函数功能:删除分词标记(即去掉字符串中的/)
 * 函数输入:含有分词标记的字符串
 * 函数输出:不含分词标记的字符串
 */
/*
 * Returns str_in with every '/' removed.
 *
 * The previous version copied into a fixed 10000-byte stack buffer and
 * overflowed on longer lines. This copy had also lost the index increment and
 * the '\0' terminator. The string is now filtered in place, so any length is
 * safe.
 */
std::string removeSeparator(std::string str_in){
	str_in.erase(std::remove(str_in.begin(), str_in.end(), '/'), str_in.end());
	return str_in;
}

/*
 * 函数功能:计算切分标记的位置
 * 函数输入:1.strline_in 未进行切分的汉字字符串
 *           2.strline_right 带有分词标记的字符串
 * 函数输出:vector,其中存放了strline_in中哪些位置放置了分词标记
 *         注意:vector中不包括最后标记的位置,但是包括位置0。
 */
/*
 * Maps the word boundaries of a segmented string onto the unsegmented text.
 *
 * strline_right: segmented text, every word followed by '/'.
 * strline_in:    the same text without separators.
 * Returns, for each word of strline_right in order, the byte offset in
 * strline_in at which that word starts. Offset 0 is included when the first
 * word starts there; the position after the last word is not.
 *
 * Text after the last '/' is ignored. Empty tokens (two adjacent separators)
 * are skipped. A word that cannot be found in strline_in is skipped too; the
 * previous code recorded position -1 for it.
 *
 * NOTE(review): the push_back, the `else { break; }` and the closing braces
 * were missing from this copy and are reconstructed.
 */
std::vector<int> getPos(std::string strline_right, std::string strline_in){
	std::vector<int> vec;
	std::string::size_type token_start = 0;	// start of the current word in strline_right
	std::string::size_type search_from = 0;	// where to resume searching in strline_in

	for(;;){
		const std::string::size_type slash = strline_right.find('/', token_start);
		if(slash == std::string::npos){
			break;
		}
		const std::string word = strline_right.substr(token_start, slash - token_start);
		token_start = slash + 1;
		if(word.empty()){
			continue;
		}

		const std::string::size_type found = strline_in.find(word, search_from);
		if(found == std::string::npos){
			continue;
		}
		vec.push_back((int)found);
		search_from = found + word.size();
	}
	return vec;
}

/*
 * 获取标准切分和程序切分的结果
 */
/*
 * Re-inserts '/' separators into a slice of the sentence.
 *
 * word:      the slice of the sentence to render.
 * pos:       byte offset of word within the whole sentence.
 * vec_right: ascending boundary offsets for the whole sentence.
 * Returns word with a '/' inserted before every byte whose absolute offset is
 * listed in vec_right, except before the first byte of word.
 *
 * The boundary list is now bounds-checked: the previous code indexed past its
 * end once every boundary had been consumed. The result is built in a
 * std::string instead of a fixed 1000-byte buffer.
 *
 * NOTE(review): the index increments and the closing braces were missing from
 * this copy and are reconstructed.
 */
std::string getString(std::string word, int pos, std::vector<int> vec_right){
	const std::vector<int>::size_type mark_count = vec_right.size();
	std::vector<int>::size_type i = 0;

	// Skip the boundaries that lie before the slice.
	while(i < mark_count && vec_right[i] < pos){
		i++;
	}

	std::string word_str;
	word_str.reserve(word.size() * 2);
	for(std::string::size_type j = 0; j < word.size(); j++){
		if(i < mark_count && (int)j == vec_right[i] - pos){
			if(j != 0){
				word_str += '/';
			}
			// Consume this boundary and any duplicates of it.
			while(i < mark_count && vec_right[i] - pos <= (int)j){
				i++;
			}
		}
		word_str += word[j];
	}
	return word_str;
}

/*
 * 函数功能:获取单个句子切分的结果统计
 * 函数输入:1.vec_right 正确的分词标记位置集合
 *           2.vec_out   函数切分得到的分词标记位置集合
 * 函数输出:返回一个vector,含有4个元素,分别为:
 *          切分正确、组合型歧义、未登录词、交集型歧义的数量
 */
vector<int> getCount_2(string strline, vector<int> vec_right, vector<int> vec_out, vector<string> &vec_err){
	vector<int> vec(4, 0);	//存放计算结果
	map<int, int> map_result;
	for(int i = 0; i < vec_right.size(); i++){
		map_result[vec_right[i]] += 1;
	for(int i = 0; i < vec_out.size(); i++){
		map_result[vec_out[i]] += 2;

	map<int, int>::iterator p_pre, p_cur;
	int count_value_1 = 0;
	int count_value_2 = 0;
	int count_value_3 = 0;
	p_pre = map_result.begin();
	p_cur = map_result.begin();
	while(p_cur != map_result.end()){
		while(p_cur != map_result.end() && p_cur -> second == 3){
			p_pre = p_cur;
			++count_value_3;	//切分正确的数目
			++p_cur;		//迭代器后移
		while(p_cur != map_result.end() && p_cur -> second != 3){
			if(p_cur -> second == 1){
			}else if(p_cur -> second == 2){
		if(p_cur == map_result.end() && p_cur == (++p_pre)){
		int pos_1 = p_pre -> first;
		int pos_2 = p_cur -> first; 
		string word = strline.substr(pos_1, pos_2 - pos_1);	//切分错误的单词
		string word_right = getString(word, pos_1, vec_right);	//正确的切分方式
		string word_out = getString(word, pos_1, vec_out);	//得到的切分方式
		string str_err = "";
		if(count_value_1 > 0 && count_value_2 == 0){
			str_err = "  组合型歧义: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;
			cout << str_err << endl;
			vec[1] += count_value_1;		
		}else if(count_value_1 == 0 && count_value_2 > 0){
			str_err = "  未登录词语: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;
			cout << str_err << endl;
			vec[2] += count_value_2;
		}else if(count_value_1 > 0 && count_value_2 > 0){
			str_err = "  交集型歧义: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;
			cout << str_err << endl;
			vec[3] += count_value_2;	

		count_value_1 = 0;
		count_value_2 = 0;

	vec[0] += count_value_3;	

	return vec;

/*
 * 主函数:进行分词并统计分词结果
 */
int main(int argc, char *argv[]){
	long time_1 = getCurrentTime();
	string strline_right;	//输入语料:用作标准分词结果
	string strline_in;	//去掉分词标记的语料(用作分词的输入)
	string strline_out_1;	//正向最大匹配分词完成的语料
	string strline_out_2;	//逆向最大匹配分词完成的语料
	string strline_out_3;	//最大概率方法分词完成的语料
	ifstream fin("test.txt");	//打开输入文件
		cout << "Unable to open input file !" << argv[1] << endl;

	ofstream fout("result.txt");	//确定输出文件
		cout << "Unable to open output file !" << endl;

	long count = 0;			//句子编号
	long count_0 = 0;		//三种方法切分都正确的句子总数
	long count_1 = 0;		//正向最大匹配全然正确的句子总数
	long count_2 = 0;		//逆向最大匹配全然正确的句子总数
	long count_3 = 0;		//最大概率方法全然正确的句子总数

	long count_right_all = 0;	//准确的切分总数
	long count_out_1_all = 0;	//正向最大匹配切分总数
	long count_out_2_all = 0;	//逆向最大匹配切分总数
	long count_out_3_all = 0;	//最大概率方法切分总数
	long count_out_1_right_all = 0;	//正向最大匹配切分正确总数
	long count_out_2_right_all = 0;	//逆向最大匹配切分正确总数
	long count_out_3_right_all = 0;	//最大概率方法切分正确总数
	long count_out_1_fail_1_all = 0;//正向最大匹配(组合型歧义)
	long count_out_1_fail_2_all = 0;//正向最大匹配(未登录词语)
	long count_out_1_fail_3_all = 0;//正向最大匹配(交集型歧义)
	long count_out_2_fail_1_all = 0;//逆向最大匹配(组合型歧义)
	long count_out_2_fail_2_all = 0;//逆向最大匹配(未登录词语)
	long count_out_2_fail_3_all = 0;//逆向最大匹配(交集型歧义)
	long count_out_3_fail_1_all = 0;//最大概率方法(组合型歧义)
	long count_out_3_fail_2_all = 0;//最大概率方法(未登录词语)
	long count_out_3_fail_3_all = 0;//最大概率方法(交集型歧义)

	vector<string> vec_err_1;	//正向最大匹配切分错误的词
	vector<string> vec_err_2;	//逆向最大匹配切分错误的词
	vector<string> vec_err_3;	//最大概率方法切分错误的词

	while(getline(fin, strline_right, '
') && count < MaxCount){
		if(strline_right.length() > 1){
			strline_in = removeSeparator(strline_right);

			strline_out_1 = strline_right;
			strline_out_1 = SegmentSentenceMM(strline_in, 1);
			strline_out_2 = strline_right;
			strline_out_2 = SegmentSentenceMM(strline_in, 2);

			strline_out_3 = strline_right;
			strline_out_3 = SegmentSentenceMM(strline_in, 3);

			cout << "----------------------------------------------" << endl;
			cout << "句子编号:" << count << endl;
			cout << endl;
			cout << "待分词的句子长度: " << strline_in.length() << "  句子:" << endl;
			cout << strline_in << endl;
			cout << endl;
			cout << "标准比对结果长度: " << strline_right.length() << "  句子:" << endl;
			cout << strline_right << endl;
			cout << endl;
			cout << "正向匹配分词长度: " << strline_out_1.length() << "  句子:" << endl;
			cout << strline_out_1 << endl;
			cout << endl;
			cout << "逆向匹配分词长度: " << strline_out_2.length() << "  句子:" << endl;
			cout << strline_out_2 << endl;
			cout << endl;
			cout << "最大概率分词长度: " << strline_out_3.length() << "  句子:" << endl;
			cout << strline_out_3 << endl;
			cout << endl;

			vector<int> vec_right = getPos(strline_right, strline_in);
			vector<int> vec_out_1 = getPos(strline_out_1, strline_in);
			vector<int> vec_out_2 = getPos(strline_out_2, strline_in);
			vector<int> vec_out_3 = getPos(strline_out_3, strline_in);

			cout << "标准结果:" << endl;
			for(int i = 0; i < vec_right.size(); i++){
				cout << setw(4) << vec_right[i];
			cout << endl;
			cout << "正向匹配结果:" << endl;
			for(int i = 0; i < vec_out_1.size(); i++){
				cout << setw(4) << vec_out_1[i];
			cout << endl;
			cout << "逆向匹配结果:" << endl;
			for(int i = 0; i < vec_out_2.size(); i++){
				cout << setw(4) << vec_out_2[i];
			cout << endl;
			cout << "最大概率结果:" << endl;
			for(int i = 0; i < vec_out_3.size(); i++){
				cout << setw(4) << vec_out_3[i];
			cout << endl;

			if(vec_right == vec_out_1 && vec_right == vec_out_2 && vec_right == vec_out_3){

			cout << endl;
			if(vec_right == vec_out_1){
				cout << "正向最大匹配全然正确。" << endl;
				cout << "正向最大匹配错误列表:" << endl;
			vector<int> vec_count_1 = getCount_2(strline_in, vec_right, vec_out_1, vec_err_1);
			cout << endl;
			if(vec_right == vec_out_2){
				cout << "逆向最大匹配全然正确!

" << endl; count_2++; }else{ cout << "逆向最大匹配错误列表:" << endl; } vector<int> vec_count_2 = getCount_2(strline_in, vec_right, vec_out_2, vec_err_2); cout << endl; if(vec_right == vec_out_3){ cout << "最大概率方法全然正确!

" << endl; count_3++; }else{ cout << "最大概率方法错误列表:" << endl; } vector<int> vec_count_3 = getCount_2(strline_in, vec_right, vec_out_3, vec_err_3); cout << endl; //准确的切分数量 int count_right = vec_right.size(); //切分得到的数量 int count_out_1 = vec_out_1.size(); int count_out_2 = vec_out_2.size(); int count_out_3 = vec_out_3.size(); //切分正确的数量 int count_out_1_right = vec_count_1[0]; int count_out_2_right = vec_count_2[0]; int count_out_3_right = vec_count_3[0]; cout << "正向最大匹配:" << endl; cout << " 组合型歧义:" << vec_count_1[1] << endl; cout << " 未登录词语:" << vec_count_1[2] << endl; cout << " 交集型歧义:" << vec_count_1[3] << endl; cout << "逆向最大匹配:" << endl; cout << " 组合型歧义:" << vec_count_2[1] << endl; cout << " 未登录词语:" << vec_count_2[2] << endl; cout << " 交集型歧义:" << vec_count_2[3] << endl; cout << "最大概率方法:" << endl; cout << " 组合型歧义:" << vec_count_3[1] << endl; cout << " 未登录词语:" << vec_count_3[2] << endl; cout << " 交集型歧义:" << vec_count_3[3] << endl; count_right_all += count_right; count_out_1_all += count_out_1; count_out_2_all += count_out_2; count_out_3_all += count_out_3; count_out_1_right_all += count_out_1_right; count_out_2_right_all += count_out_2_right; count_out_3_right_all += count_out_3_right; count_out_1_fail_1_all += vec_count_1[1]; count_out_1_fail_2_all += vec_count_1[2]; count_out_1_fail_3_all += vec_count_1[3]; count_out_2_fail_1_all += vec_count_2[1]; count_out_2_fail_2_all += vec_count_2[2]; count_out_2_fail_3_all += vec_count_2[3]; count_out_3_fail_1_all += vec_count_3[1]; count_out_3_fail_2_all += vec_count_3[2]; count_out_3_fail_3_all += vec_count_3[3]; } } long time_2 = getCurrentTime(); unsigned long file_size = getFileSize("test.txt"); //打印错误的切分内容 cout << endl; cout << "---------------------------------" << endl; cout << "错误例子(已排序):" << endl; //选取样本(600个),去掉反复的 //vector<string> vec_small(vec_err.begin(), vec_err.begin() + 600); //sort(vec_small.begin(), vec_small.end()); //vector<string>::iterator end_unique = unique(vec_small.begin(), vec_small.end()); 
//对错误切分内容进行排序并掉反复的 sort(vec_err_1.begin(), vec_err_1.end()); sort(vec_err_2.begin(), vec_err_2.end()); sort(vec_err_3.begin(), vec_err_3.end()); vector<string>::iterator end_unique_1 = unique(vec_err_1.begin(), vec_err_1.end()); vector<string>::iterator end_unique_2 = unique(vec_err_2.begin(), vec_err_2.end()); vector<string>::iterator end_unique_3 = unique(vec_err_3.begin(), vec_err_3.end()); int num_1 = end_unique_1 - vec_err_1.begin(); int num_2 = end_unique_2 - vec_err_2.begin(); int num_3 = end_unique_3 - vec_err_3.begin(); cout << "----------------------------------" << endl; cout << "正向最大匹配切分错误数量:" << num_1 << endl; for(int i = 0; i < num_1; i++){ cout << vec_err_1[i] << endl; } cout << endl; cout << "----------------------------------" << endl; cout << "逆向最大匹配切分错误数量:" << num_2 << endl; for(int i = 0; i < num_2; i++){ cout << vec_err_2[i] << endl; } cout << endl; cout << "----------------------------------" << endl; cout << "最大概率方法切分错误数量:" << num_3 << endl; for(int i = 0; i < num_3; i++){ cout << vec_err_3[i] << endl; } cout << endl; //计算准确率和召回率 double kk_1 = (double)count_out_1_right_all / count_out_1_all; //正向最大匹配准确率 double kk_2 = (double)count_out_1_right_all / count_right_all; //正向最大匹配召回率 double kk_3 = (double)count_out_2_right_all / count_out_2_all; //逆向最大匹配准确率 double kk_4 = (double)count_out_2_right_all / count_right_all; //逆向最大匹配召回率 double kk_5 = (double)count_out_3_right_all / count_out_3_all; //最大概率方法准确率 double kk_6 = (double)count_out_3_right_all / count_right_all; //最大概率方法召回率 //集中输出结果 cout << endl; cout << "---------------------------------" << endl; cout << "分词消耗时间:" << time_2 - time_1 << "ms" << endl; cout << "測试文件大小:" << file_size/1024 << " KB" << endl; cout << "分词速度为: " << (double)file_size*1000/((time_2 - time_1)*1024) << " KB/s" << endl; cout << endl; cout << "词典规模:" << word_dict.size << endl; cout << endl; cout << "句子总数:" << count << endl; cout << "三种方法切分都正确的句子数目: " << count_0 << " ( " << (double)count_0*100/count << " % )" << endl; cout << 
"正向最大匹配全然正确的句子数目: " << count_1 << " ( " << (double)count_1*100/count << " % )" << endl; cout << "逆向最大匹配全然正确的句子数目: " << count_2 << " ( " << (double)count_2*100/count << " % )" << endl; cout << "最大概率方法全然正确的句子数目: " << count_3 << " ( " << (double)count_3*100/count << " % )" << endl; cout << endl; cout << "准确的切分总数:" << count_right_all << endl; //准确的切分总数 cout << "正向匹配切分总数:" << count_out_1_all << endl; //正向匹配切分总数 cout << "逆向匹配切分总数:" << count_out_2_all << endl; //逆向匹配切分总数 cout << "最大概率切分总数:" << count_out_3_all << endl; //最大概率切分总数 cout << "正向匹配切分正确总数:" << count_out_1_right_all << endl; //正向匹配切分正确总数 cout << "逆向匹配切分正确总数:" << count_out_2_right_all << endl; //逆向匹配切分正确总数 cout << "最大概率切分正确总数:" << count_out_3_right_all << endl; //逆向匹配切分正确总数 cout << endl; cout << "正向最大匹配:" << endl; long count_out_1_fail_all = count_out_1_fail_1_all + count_out_1_fail_2_all + count_out_1_fail_3_all; cout << " 组合型歧义:" << count_out_1_fail_1_all << " ( " << (double)count_out_1_fail_1_all*100/count_out_1_fail_all << " % )" << endl; cout << " 未登录词语:" << count_out_1_fail_2_all << " ( " << (double)count_out_1_fail_2_all*100/count_out_1_fail_all << " % )" << endl; cout << " 交集型歧义:" << count_out_1_fail_3_all << " ( " << (double)count_out_1_fail_3_all*100/count_out_1_fail_all << " % )" << endl; cout << "逆向最大匹配:" << endl; long count_out_2_fail_all = count_out_2_fail_1_all + count_out_2_fail_2_all + count_out_2_fail_3_all; cout << " 组合型歧义:" << count_out_2_fail_1_all << " ( " << (double)count_out_2_fail_1_all*100/count_out_2_fail_all << " % )" << endl; cout << " 未登录词语:" << count_out_2_fail_2_all << " ( " << (double)count_out_2_fail_2_all*100/count_out_2_fail_all << " % )" << endl; cout << " 交集型歧义:" << count_out_2_fail_3_all << " ( " << (double)count_out_2_fail_3_all*100/count_out_2_fail_all << " % )" << endl; cout << "最大概率方法:" << endl; long count_out_3_fail_all = count_out_3_fail_1_all + count_out_3_fail_2_all + count_out_3_fail_3_all; cout << " 组合型歧义:" << count_out_3_fail_1_all << " ( " << 
(double)count_out_3_fail_1_all*100/count_out_3_fail_all << " % )" << endl; cout << " 未登录词语:" << count_out_3_fail_2_all << " ( " << (double)count_out_3_fail_2_all*100/count_out_3_fail_all << " % )" << endl; cout << " 交集型歧义:" << count_out_3_fail_3_all << " ( " << (double)count_out_3_fail_3_all*100/count_out_3_fail_all << " % )" << endl; cout << endl; cout << "统计结果:" << endl; cout << "正向最大匹配 准确率:" << kk_1*100 << "% 召回率:" << kk_2*100 << "%" << endl; cout << "逆向最大匹配 准确率:" << kk_3*100 << "% 召回率:" << kk_4*100 << "%" << endl; cout << "最大概率方法 准确率:" << kk_5*100 << "% 召回率:" << kk_6*100 << "%" << endl; return 0; }
