简单的C语言编译器--词法分析器

1. 定义词法单元Tag

首先要将可能出现的词进行分类，分类方式可以有多种。例如“多符一类”：将所有逗号、分号、括号等都归为同一类；或者“一符一类”：每个符号单独归为一类。我这里采用的是一符一类的方式。C++代码如下：

    #ifndef TAG_H
    #define TAG_H
    
    // Integer codes identifying each kind of lexical unit the lexer can emit.
    // Every symbol gets its own code ("one symbol, one class").
    namespace Tag {
    	// Reserved words
    	constexpr int INT = 1;
    	constexpr int BOOL = 2;
    	constexpr int MAIN = 3;
    	constexpr int IF = 4;
    	constexpr int ELSE = 5;
    	constexpr int FOR = 6;
    	constexpr int WHILE = 7;
    	constexpr int FALSE = 8;
    	constexpr int BREAK = 9;
    	constexpr int RETURN = 10;
    	constexpr int TRUE = 11;
    
    	// Operators
    	constexpr int NOT = 20;        // !
    	constexpr int NE = 21;         // !=
    	constexpr int AUTOMINUS = 22;  // --
    	constexpr int MINUS = 23;      // -
    	constexpr int AUTOADD = 24;    // ++
    	constexpr int ADD = 25;        // +
    	constexpr int OR = 26;         // ||
    	constexpr int AND = 27;        // &&
    	constexpr int MUTIPLY = 28;    // *
    	constexpr int DIVIDE = 29;     // /
    	constexpr int MOD = 30;        // %
    	constexpr int EQ = 31;         // ==
    	constexpr int ASSIN = 32;      // =
    	constexpr int GE = 33;         // >=
    	constexpr int GT = 34;         // >
    	constexpr int LE = 35;         // <=
    	constexpr int LS = 36;         // <
    
    	// Delimiters
    	constexpr int COMMA = 40;      // ,
    	constexpr int SEMICOLON = 41;  // ;
    	constexpr int LLBRACKET = 42;  // (
    	constexpr int RLBRACKET = 43;  // )
    	constexpr int LMBRACKET = 44;  // [
    	constexpr int RMBRACKET = 45;  // ]
    	constexpr int LGBRACKET = 46;  // {
    	constexpr int RGBRACKET = 47;  // }
    
    	// Integer constant
    	constexpr int NUM = 50;
    
    	// Identifier
    	constexpr int ID = 60;
    
    	// Lexical error
    	constexpr int ERROR = 404;
    
    	// Nothing to report (no token)
    	constexpr int EMPTY = 70;
    
    }

    #endif

2. 具体步骤

逐个字符地扫描测试代码，忽略空白字符；遇到换行符时，行号加1
要区分标识符（即普通变量名）和保留字
因为将标识符和常数各自归为一类，所以要有算法能够识别出一个完整的常数和一个完整的标识符
加入适当的非法词检测

3. 设计词法分析类

设计一个词法分析器，当然要包括如何存储一个词法单元，如何扫描(scan)测试代码等，直接上代码：

myLexer.h

    #ifndef MYLEXER_H
    #define MYLEXER_H
    
    #include <fstream>
    #include <string>
    #include <unordered_map>
    #include "tag.h"
    
    
    /*
     * 主要是定义基本的词法单元类，
     * 声明了词法分析类
     */
    
    // A lexical unit: the matched text (lexeme) together with its integer tag.
    class Word {
    	public:
    		// Creates a Word with an empty lexeme and tag 0. The tag is given a
    		// definite value so a default-constructed Word never exposes an
    		// indeterminate int through getTag().
    		Word() = default;
    		// Creates a Word holding lexeme `s` and tag `t`.
    		Word(std::string s, int t) : lexeme(s), tag(t) {}
    		// Accessors are const so they can be called on const Words and
    		// on Words reached through const references.
    		std::string getLexeme() const { return lexeme; }
    		int getTag() const { return tag; }
    		void setTag(int t) { tag = t; }
    		void setLexeme(std::string s) { lexeme = s; }
    	private:
    		std::string lexeme;
    		int tag = 0;
    };
    
    // Lexical analyzer: turns the characters of an input file stream into Words.
    class Lexer {
    	public:
    		// Registers the reserved words and resets the scanning state.
    		Lexer();
    		// Adds w to the reserved-word table, keyed by its lexeme.
    		void reserve(Word w);
    		// Reads the next character from `in` into the look-ahead and returns
    		// true if it equals c (used for two-character operators).
    		bool readnext(char c, std::ifstream &in);
    		// Returns the next lexical unit read from `in`.
    		Word scan(std::ifstream &in);
    		// Current line counter; const so it can be queried on a const Lexer.
    		int getLine() const { return line; }
    	private:
    		// One-character look-ahead; a blank means "nothing pending".
    		// Initialised here with the same value the constructor assigns.
    		char peek = ' ';
    		// Reserved words, looked up by lexeme.
    		std::unordered_map<std::string, Word> words;
    		// Line counter, starting at 1 (same value the constructor assigns).
    		int line = 1;
    };
    
    
    #endif

myLexer.cpp

    #include <iostream>
    #include <cctype>
    #include <sstream>
    #include "myLexer.h"
    
    // Stores w in the reserved-word table under its lexeme. If the lexeme is
    // already present, the existing entry is kept.
    void Lexer::reserve(Word w) {
    	const std::string key = w.getLexeme();
    	words.emplace(key, w);
    }
    
    // Starts with a blank look-ahead character on line 1 and registers every
    // reserved word, so that scan() can tell keywords apart from identifiers.
    Lexer::Lexer() : peek(' '), line(1) {
    	const Word keywords[] = {
    		Word("int", Tag::INT),
    		Word("bool", Tag::BOOL),
    		Word("main", Tag::MAIN),
    		Word("if", Tag::IF),
    		Word("else", Tag::ELSE),
    		Word("for", Tag::FOR),
    		Word("while", Tag::WHILE),
    		Word("break", Tag::BREAK),
    		Word("return", Tag::RETURN),
    		Word("true", Tag::TRUE),
    		Word("false", Tag::FALSE),
    	};
    
    	for (const Word &keyword : keywords)
    		reserve(keyword);
    }
    
    // Helper for operators made of two adjacent characters, such as >= or ++.
    //
    // Reads exactly one character from `in` into peek.
    //   - Returns true if that character equals c; the character is consumed
    //     and peek is reset to a blank.
    //   - Returns false otherwise; peek then holds the character that was read
    //     so the next scan() starts from it.
    //   - Returns false with a blank peek when the stream has no more input.
    bool Lexer::readnext(char c, std::ifstream &in) {
    	// get() performs unformatted input: unlike operator>>, it never skips
    	// whitespace regardless of the stream's skipws flag, so "+ +" is not
    	// mistaken for "++" and a newline after an operator is not lost.
    	if (!in.get(peek)) {
    		// A failed read leaves peek unchanged; clear it so the first
    		// character of the operator is not compared against itself.
    		peek = ' ';
    		return false;
    	}
    
    	if (peek == c) {
    		peek = ' ';
    		return true;
    	}
    
    	// Fold other whitespace (e.g. '\r') into a blank, but keep '\n' in peek
    	// so that it can still be counted as a line break.
    	if (peek != '\n' && isspace(static_cast<unsigned char>(peek)))
    		peek = ' ';
    	return false;
    }
    
    
    // Returns the next lexical unit read from `in`.
    //
    // Whitespace is skipped and every '\n' passed over increments the line
    // counter. The result is one of:
    //   - an operator or delimiter Word (see the switch below);
    //   - Tag::NUM for a run of decimal digits (lexeme without leading zeros);
    //   - a reserved word from the keyword table, or Tag::ID for any other
    //     identifier (letters, digits and '_', not starting with a digit);
    //   - Word("error", Tag::ERROR) for a lone '|' or '&', a number containing
    //     a '.', or any unrecognised character;
    //   - Word("empty", Tag::EMPTY) when the input is exhausted.
    // Every call consumes input unless the stream is at its end, so repeated
    // calls always make progress.
    Word Lexer::scan(std::ifstream &in) {
    	// Reads one raw character into peek. get() is unformatted input, so
    	// whitespace is never silently dropped (which would hide newlines and
    	// glue "int x" into "intx"). At end of input peek becomes a blank so a
    	// stale character is never tokenised a second time.
    	auto advance = [this, &in]() {
    		if (!in.get(peek))
    			peek = ' ';
    	};
    	// The <cctype> classifiers need a value representable as unsigned char.
    	auto isSpace = [](char ch) { return isspace(static_cast<unsigned char>(ch)) != 0; };
    	auto isDigit = [](char ch) { return isdigit(static_cast<unsigned char>(ch)) != 0; };
    	auto isAlpha = [](char ch) { return isalpha(static_cast<unsigned char>(ch)) != 0; };
    	auto isAlnum = [](char ch) { return isalnum(static_cast<unsigned char>(ch)) != 0; };
    
    	// Skip whitespace, counting line breaks.
    	while (isSpace(peek)) {
    		if (peek == '\n')
    			++line;
    		if (!in.get(peek)) {
    			peek = ' ';
    			return Word("empty", Tag::EMPTY);
    		}
    	}
    
    	// Operators and delimiters. For two-character operators readnext()
    	// leaves the following character in peek when the pair does not match.
    	switch (peek) {
    		case '!':
    			if (readnext('=', in))
    				return Word("!=", Tag::NE);
    			return Word("!", Tag::NOT);
    		case '-':
    			if (readnext('-', in))
    				return Word("--", Tag::AUTOMINUS);
    			return Word("-", Tag::MINUS);
    		case '+':
    			if (readnext('+', in))
    				return Word("++", Tag::AUTOADD);
    			return Word("+", Tag::ADD);
    		case '|':
    			if (readnext('|', in))
    				return Word("||", Tag::OR);
    			return Word("error", Tag::ERROR);
    		case '&':
    			if (readnext('&', in))
    				return Word("&&", Tag::AND);
    			return Word("error", Tag::ERROR);
    		case '*':
    			advance();
    			return Word("*", Tag::MUTIPLY);
    		case '/':
    			advance();
    			return Word("/", Tag::DIVIDE);
    		case '%':
    			advance();
    			return Word("%", Tag::MOD);
    		case '=':
    			if (readnext('=', in))
    				return Word("==", Tag::EQ);
    			return Word("=", Tag::ASSIN);
    		case '>':
    			if (readnext('=', in))
    				return Word(">=", Tag::GE);
    			return Word(">", Tag::GT);
    		case '<':
    			if (readnext('=', in))
    				return Word("<=", Tag::LE);
    			return Word("<", Tag::LS);
    		case ',':
    			advance();
    			return Word(",", Tag::COMMA);
    		case ';':
    			advance();
    			return Word(";", Tag::SEMICOLON);
    		case '(':
    			advance();
    			return Word("(", Tag::LLBRACKET);
    		case ')':
    			advance();
    			return Word(")", Tag::RLBRACKET);
    		case '[':
    			advance();
    			return Word("[", Tag::LMBRACKET);
    		case ']':
    			advance();
    			return Word("]", Tag::RMBRACKET);
    		case '{':
    			advance();
    			return Word("{", Tag::LGBRACKET);
    		case '}':
    			advance();
    			return Word("}", Tag::RGBRACKET);
    		default:
    			break;
    	}
    
    	// Integer constants. The digits are collected as text rather than
    	// accumulated in an int, so an over-long literal cannot overflow.
    	if (isDigit(peek)) {
    		std::string digits;
    		do {
    			digits += peek;
    			advance();
    		} while (isDigit(peek));
    
    		if (peek != '.') {
    			// Drop leading zeros so the lexeme is the canonical decimal
    			// spelling of the value ("007" -> "7", "000" -> "0").
    			const std::string::size_type firstSignificant = digits.find_first_not_of('0');
    			if (firstSignificant == std::string::npos)
    				digits = "0";
    			else
    				digits.erase(0, firstSignificant);
    			return Word(digits, Tag::NUM);
    		}
    
    		// Only integers are supported: swallow the '.' and any fraction
    		// digits so the malformed literal yields a single error token.
    		do {
    			advance();
    		} while (isDigit(peek));
    		return Word("error", Tag::ERROR);
    	}
    
    	// Identifiers and reserved words. A leading '_' is accepted, matching
    	// the characters allowed in the rest of the name.
    	if (isAlpha(peek) || peek == '_') {
    		std::string name;
    		do {
    			name += peek;
    			advance();
    		} while (isAlnum(peek) || peek == '_');
    
    		// Single lookup: reserved words come back with their own tag.
    		const auto reserved = words.find(name);
    		if (reserved != words.end())
    			return reserved->second;
    		return Word(name, Tag::ID);
    	}
    
    	// Anything else is not part of the language. Consume the character so
    	// the next call does not report the same error again.
    	advance();
    	return Word("error", Tag::ERROR);
    }

设计完成后，自己写一个Main函数，在while循环中调用scan函数，每次打印出Word内容，就能够得到