简单词法分析器实现

编写分析器有两种方法,一种是通过DFA对单词进行识别,二是通过直接编敲代码进行识别。

本程序採用DFA对单词进行识别。
DFA的实现方法。大概思想和书上一致,在程序中,则是用二维数组代表状态转换矩阵,用一维数组表示终态。


一个词法编辑要实现的功能主要包含下面几点:

可以识别标识符、keyword、数字和运算符,对凝视进行过滤。同一时候还能识别出程序错误。

使用说明:

本程序的输入由当前文件夹下的in.txt文件读取输入,输出为一系列二元式
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
//用指针而不是二维数组表示。这样就不用指定字符串长度,仅仅是不能改动指针所指向字符串的内容
char *key[]={"int","char","float","double","void","const","for","if","else","then","while","switch","break","main","return"};
char buffer[20];//存储当前识别出的单词
char *identifier[50];
char *num[50];
int ident_num;//标志符数
int number;//数字数
int judgement_num(char c) {
	if (c >= '0' && c <= '9') return 0;
	else if (c == '.') return 1;
	return -1;
}
int judge(char c){
	if(c=='_') return 2;
	else if(c>='0'&&c<='9') return 1;
	else if(c>='a'&&c<='z'||c>='A'&&c<='Z') return 0;
	return -1;
}
//next_i,next_j分别代表着状态转换表move的当前状态与接收到的字符
//width代表每一状态可能遇到的状态数,length代表终态集大小
int DFA(int begin,int move[],int width,int final[],int length,int(*judge)(char)){
    int len=0,next_i,next_j,tmp;
    char next_char;
    memset(buffer,0,sizeof(buffer));
    next_char=getchar();
    next_i=begin;
    while((next_j=judge(next_char))!=-1){
		tmp=next_i;
		next_i=move[next_i*width+next_j];
		if(next_i==-1){
            printf("move[%d][%d]has not next state
",tmp,next_j);
            return -1;
		}
		buffer[len++]=next_char;
        next_char=getchar();

    }
    ungetc(next_char,stdin);
    buffer[len]='';
    for(int i=0;i<length;i++){
		if(next_i==final[i])  return 0;
    }
    return -1;
}

void is_comment(){
    char next_char;
    char tmp[5];
    memset(tmp,0,sizeof(tmp));
    tmp[0]=getchar();
    next_char=getchar();
    if(next_char=='*'){
		while((next_char=getchar())!=EOF){
            if(next_char=='*'&&getchar()=='/') return;
		}
		printf("The comment is error
");
    }
    else if(next_char=='/'){
        while((next_char=getchar())!='
');
    }
    else if(next_char=='='){
        printf("(/=,_)
");
    }
    else{
		printf("(/,_)
");
		ungetc(next_char,stdin);
    }
}
void is_other(){
    char next_char,c=getchar();
    if(c=='+'||c=='-'||c=='*'||c=='%'){
        next_char=getchar();
        if(next_char=='=') printf("(%c=,_)
",c);
        else {
			printf("(%c,_)
",c);
			ungetc(next_char,stdin);
        }
    }
    else if(c=='>'||c=='<'){
		next_char=getchar();
        if(next_char=='=') printf("(rlop,%c=)
",c);
        else {
			printf("(rlop,%c)
",c);
			ungetc(next_char,stdin);
        }
    }
    else if(c=='!'){
		next_char=getchar();
        if(next_char=='=') printf("(!=,_)
");
        else {
			printf("(not,_)
");
			ungetc(next_char,stdin);
        }
    }
    else if(c=='|'){
		next_char=getchar();
        if (next_char == '|')
			printf("(or,||)
");
		else {
			ungetc(next_char, stdin);
		}
    }
    else if(c=='&'){
		next_char=getchar();
        if (next_char == '&')
			printf("(and,&&)
");
		else {
			ungetc(next_char, stdin);
		}
    }
    else if(c=='='||c=='('||c==')'||c=='['||c==']'||c==';'||c==','||c=='{'||c=='}'){
        printf("(%c,_)
",c);
    }
}
void is_digit(){
    int begin=0;
    int move[]={1,-1,1,2,2,-1};
	int final[]={1,2};
	int result=-1;
	result=DFA(begin,move,2,final,2,judgement_num);
	if(result==-1){
		printf("digit DFA error
");
		exit(-1);
	}
	else if(result==0){
        //printf("%s
",buffer);
        for(int i=0;i<number;i++){
			if(strcmp(buffer,num[i])==0){
                printf("(num,%s)
",buffer);
				return;
			}
        }
        num[number]=(char*)malloc(sizeof(buffer));
        strcpy(num[number++],buffer);
        printf("(num,%s)
",buffer);
        return;
	}
}
void is_letter(){
	int i;
	int begin=0;
	int move[] ={1,-1,1,1,1,1};
	int final[]={1,2};
	int result=-1;
	result=DFA(begin,move,3,final,1,judge);
    if(result==-1){
		printf("letter DFA error
");
		exit(-1);
    }
    else if(result==0){
		int len=sizeof(key)/sizeof(char *);
		//如为keyword
        for(i=0;i<len;i++){
			if(strcmp(buffer,key[i])==0){
                printf("(key,%s)
",key[i]);
                return;
			}
        }
        //如为标志符
        for(i=0;i<ident_num;i++){
			//在当前标志符表中已经存在
			if(strcmp(buffer,identifier[i])==0){
				printf("(%id,%s)
",identifier[i]);
				return;
			}
        }
        //如不存在,则写入
        identifier[ident_num]=(char*)malloc(sizeof(buffer));
        strcmp(identifier[ident_num++],buffer);
        printf("(id,%s)
",buffer);
        return;
    }
}

void init() {
	ident_num=0;
    number=0;
}
void work() {
	char c;
    while((c=getchar())!=EOF){
		ungetc(c,stdin);
		if(judge(c)==2||judge(c)==0) is_letter();
		else if(judge(c)==1) is_digit();
		else if(c=='/') is_comment();
		else is_other();
    }
}

int main() {
    freopen("in.txt","r",stdin);
    init();
    work();
    return 0;
}


原文地址:https://www.cnblogs.com/wzzkaifa/p/7039331.html