全文检索模型算法实现


全文检索是一种将文件中所有文本与检索项匹配的文字资料检索方法。全文检索系统是按照全文检索理论建立起来的用于提供全文检索服务的软件系统。




#include <stdlib.h>
#include <string.h>
#include <stdio.h>

//Status是函数的类型,其值是函数结果状态代码
typedef int Status;
#define INT_MAX 32767
#define LINESIZE 80//设一行字符数为80
#define MAXKEYLEN 26//关键字的长度
#define MAXNUM 100//统计单词的最大数
typedef struct{	  
	char *ch;  //关键字
	int  num;  //关键字的长度
	char *info;//关键字有关信息
}KeysType;     //关键字类型

typedef enum{LEAF,BRANCH} NodeKind;//结点种类:{叶子,分支}

typedef struct DLTNode{
	char symbol;
	struct DLTNode *next;     //指向兄弟结点的指针
	NodeKind kind;            //结点标志
	union{
		struct DLTNode *first;//分支结点的孩子链指针
		struct {
			int idx;          //叶子结点的count数组下标指针
			KeysType infoptr;
		};
	};
}DLTNode,*DLTree;   //双链树类型
char line[LINESIZE];//用于缓存文章中每行的字符串
struct{
	int times;
	KeysType info;
}count[MAXNUM];
int NUM=0;//记录整个文章的单词总数
char ch1[17][100]={"a","an","the","them","there","here","they","are","that","this","those","what","which","why","then","and","these"};

void InitTree(DLTree &root){
	//初始化键树
	root=new DLTNode;
	root->next =NULL;
	root->first=NULL;
}//InitTree

Status Insert_DLTree(DLTree &root,KeysType K,int &n){
	//指针root所指双链树中已含n个关键字,若不存在和K相同的关键字,
	//则将关键字K插入到双链树中相应位置,树中关键字个数n增1且返回TRUE;..
	//否则不再插入返回FALSE
	DLTree p=root->first,f=root,pre,s;
	int j=0;
    while (p && j<K.num ){//在键树中进行查找
        pre=NULL;
		while (p && p->symbol <K.ch [j]){//查找和K.ch[j]相同的结点
			pre=p;
			p=p->next ;
		}
		if (p && p->symbol ==K.ch[j]){
			f=p;
			p=p->first ;
			j++;
		}//找到后进入键树的下一层,即查找和K.ch[j+1]相同的结点
		else{//没有找到和K.ch[j]相同的结点,插入K.ch[j]
			s=new DLTNode;
			s->kind =BRANCH;
			s->symbol =K.ch [j++];
			if (pre) pre->next =s;
			else f->first =s;
			s->next =p;
			s->first =NULL;
			p=s;
			break;
		}//else
	}//while
    if (p&&j==K.num && p->first&&p->first ->kind ==LEAF) return FALSE;
	else{//键树中已存在相同前缀的单词,插入由剩余字符构成的单支树
		while (j<=K.num ){
				s=new DLTNode;
				s->next =NULL;
				if (p){
				s->next =p->next ;
				p->first =s;
				p=s;
				}
				else {
					f->first =s;
					p=s;
				}
				if(j<K.num ){
					s->kind =BRANCH;
					s->symbol =K.ch [j++];
					s->first =NULL;
				}
				else{
					s->symbol ='$';
					s->kind =LEAF;
					n++;
					s->idx =n;
					s->infoptr.ch =count[s->idx ].info.ch =K.ch ;
					s->infoptr.info =count[s->idx ].info.info =K.info ;
					s->infoptr.num =count[s->idx ].info.num=K.num ;
					count[s->idx ].times=0;
					j++;
				}
			}//while
		return TRUE;
		}//else
}//Insert_DLTree


void CreatDLTree(DLTree &T,KeysType *key,int &n){
	//建立键树模型
	for (int i=0;i<=16;i++){
		key[i].ch=ch1[i];
        key[i].info=NULL;
		key[i].num =strlen(key[i].ch );
	}
	key[0].info ="indefinite article.";//a
	key[1].info ="indefinite articles.";//an
	key[2].info ="definite article.";//the
    key[3].info ="personal pronoun";//them
    key[4].info ="adv. of place and direction";//there
    key[5].info ="to this point or place";//here
    key[6].info ="pronoun";//they
    key[7].info ="v.i. joining subject & predicate";//are
    key[8].info ="adj. & pron.";//that
    key[9].info ="adj. & pron.";//this
    key[10].info ="adj. & pron.";//those
    key[11].info ="adj. & pron.,asking for a selection from an indefinite number";//what
    key[12].info ="adj. & pron.,asking for a selection from two or a group";//which
    key[13].info ="adj. & int.,for what reason,with what purpose";//why
    key[14].info ="adv.,at what time";//then
    key[15].info ="conj.,connecting words";//and 
    key[16].info ="adj. & pron.";//these
    for(i=0;i<=16;i++)
      Insert_DLTree(T,key[i],n);//建立键树模型
}//CreatDLTree

Status Search_DLTree(DLTree rt,int j,int &k){
	//若line中从第j个字符起长度为k的子串和指针rt所指向双链树中单词相同,
	//则数组count中相应分量增1,并返回TRUE,否则返回FALSE
	DLTree p;
	int found;
	k=0;
	found=FALSE;
	p=rt->first ;//p指向双链树中的第一棵子树的树根
    while (p && !found){
		while (p && p->symbol <line[j+k]) p=p->next ;
		if (!p||p->symbol >line[j+k]) break;//在键树的第k+1层上匹配失败
		else{//继续匹配
			p=p->first ;
			k++;
			if (p->kind ==LEAF){//找到一个单词
				if (!(line[j+k]>='a'&&line[j+k]<='z')||(line[j+k]>='A'&&line[j+k]<='Z' )){
				count[p->idx ].times++;//统计数组对应元素加1
				found=TRUE;
				}//若键树中含有"commit",文本行中为commit则为找到,若为committing则为没找到
			}//if
		}//else
	}//while
	return found;
}//Search_DLTree

void setmatch(DLTree root,char *line,FILE *f){
	//统计以root为根指针的键树中,各关键字在本文本串line中重复出现的次数,
	//并将其累加到统计数组count中去
	unsigned i=0;
	int k;//若查找成功,返回的k为所查找的关键字长度
	while (fgets(line,LINESIZE,f)!=NULL){
		cout<<line;
        i=0;
		while (i<=strlen(line)){//LINESIZE){
			if (!Search_DLTree(root,i,k)) {
				if (((line[i]>='a' && line[i]<='z')||(line[i]>='A'&&line[i]<='Z')||(line[i]>='0'&&line[i]<='9')) &&!((line[i+1]>='a'&&line[i+1]<='z')||(line[i+1]>='A'&&line[i+1]<='Z')||(line[i+1]>='0'&&line[i+1]<='9')))
					NUM++;//单词总数加1
    				i++;  //若查找不成功,则从下一个字符开始查找
				}//if
	    	else {
		 		i+=k;     //查找成功,继续在文本串中的第i+k-1个字符开始查找
				NUM++;
			}//else
		}//while
	}//while
}//setmatch

void Input(FILE *&f){
	char fname[30];
	cout<<"please input the file name:";
	cin>>fname;                     //输入文件名
	if ((f=fopen(fname,"r"))==NULL){//输入错误文件名, 无法打开
		cout<<"Can NOT open the file!"<<endl;
		exit(1);
	}
}//Input

void output(int n){
	cout<<endl;
	for(int i=1;i<=n;i++)
		if (count[i].times){
			cout<<count[i].info.ch<<" : ";
			if(count[i].info.info) cout<<count[i].info.info<<endl;
			cout<<float(count[i].times)/float(NUM)<<" in the sentence."<<endl;
		}//输出文章中含有键树中的关键字及有关解释,出现频率。即:该关键字出现次数/文章单词总数
 	cout<<endl;
}


原文地址:https://www.cnblogs.com/new0801/p/6177607.html