计算机相关专业的差不多都有学过编译原理吧?今天我班门弄刀,也谈谈我自己对编译原理的认识和理解。当然啦,我主要要谈的是编译器的前端的实现,后端的代码生成我目前还没有研究过。
实现一个编译器有两大步骤:一是词法分析,二是语法分析。应对这两块有很多的工具是可以帮助我们进行这样的工作的(例如:flex、yacc等),但我要说的是怎么完全手工去实现它。语法分析的主要目的是把一个个的字符和字母之类的东西给识别为token,而语法分析的主要作用是去构建分析后的对象或是构建抽象语法树。下面我结合早一段时间开发的JSON的反序列化来聊聊整个的过程。
词法分析
private IList<Token> ReadTokens(char[] chars)
{
IList<Token> tokens = new List<Token>();
StringBuilder sb = new StringBuilder();
CharReader src = new CharReader(chars);
char c = src.Read();
readLoop:
while (c != '\0')
{
switch (c)
{
#region 空白字符
case '\t':
{
c = src.Read();
while (c == '\t') { c = src.Read(); }
break;
}
case ' ':
{
c = src.Read();
while (c == ' ') { c = src.Read(); }
break;
}
case '\r':
{
c = src.Read();
while (c == '\r') { c = src.Read(); }
break;
}
case '\n':
{
c = src.Read();
while (c == '\n') { c = src.Read(); }
break;
}
#endregion
#region 运算符
case '[':
{
tokens.Add(new Token(TokenId.LBracket));
c = src.Read();
break;
}
case ']':
{
tokens.Add(new Token(TokenId.RBracket));
c = src.Read();
break;
}
case '{':
{
tokens.Add(new Token(TokenId.LCurly));
c = src.Read();
break;
}
case '}':
{
tokens.Add(new Token(TokenId.RCurly));
c = src.Read();
break;
}
case ':':
{
tokens.Add(new Token(TokenId.Colon));
c = src.Read();
break;
}
case ',':
{
tokens.Add(new Token(TokenId.Comma));
c = src.Read();
break;
}
#endregion
#region 字符串
case '@':
case '\'':
case '"':
{
bool isVerbatim = false;
if (c == '@')
{
isVerbatim = true;
c = src.Read(); // skip to follow quote
}
sb.Length = 0;
char quote = c;
bool isSingleQuote = (c == '\'');
c = src.Read();
while (!src.Eof)
{
if (c == '\\' && !isVerbatim) // normal escaped chars
{
c = src.Read();
switch (c)
{
case '\0':
{
if (src.Eof)
{
goto readLoop;
}
else
{
sb.Append('\0');
c = src.Read();
break;
}
}
case 'a':
{
sb.Append('\a');
c = src.Read();
break;
}
case 'b':
{
sb.Append('\b');
c = src.Read();
break;
}
case 'f':
{
sb.Append('\f');
c = src.Read();
break;
}
case 'n':
{
sb.Append('\n');
c = src.Read();
break;
}
case 'r':
{
sb.Append('\r');
c = src.Read();
break;
}
case 't':
{
sb.Append('\t');
c = src.Read();
break;
}
case 'v':
{
sb.Append('\v');
c = src.Read();
break;
}
case '\\':
{
sb.Append('\\');
c = src.Read();
break;
}
case '\'':
{
sb.Append('\'');
c = src.Read();
break;
}
case '\"':
{
// strings are always stored as verbatim for now, so the double quote is needed
sb.Append("\"\"");
c = src.Read();
break;
}
default:
{
sb.Append(c);
break;
}
}
}
else if (c == '\"')
{
c = src.Read();
// two double quotes are escapes for quotes in verbatim mode
if (c == '\"' && isVerbatim)// verbatim escape
{
sb.Append("\"\"");
c = src.Read();
}
else if (isSingleQuote)
{
sb.Append('\"');
}
else
{
break;
}
}
else // non escaped
{
if (c == quote)
{
break;
}
sb.Append(c);
c = src.Read();
}
}
if (c != '\0')
{
if (c == quote)
{
c = src.Read(); // skip last quote
}
tokens.Add(new Token(TokenId.String, sb.ToString()));
}
break;
}
#endregion
#region 数字
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '.':
case '+':
case '-':
{
sb.Length = 0;
TokenId numKind = TokenId.Int;
bool isDouble = false;
if (c == '.')
{
c = src.Read();
if (c < '0' || c > '9')
{
break;
}
else
{
sb.Append('.');
numKind = TokenId.Double;
isDouble = true;
}
}
else if (c == '+')
{
c = src.Read();
}
else if (c == '-')
{
sb.Append('-');
c = src.Read();
}
bool isNum = true;
while (isNum)
{
switch (c)
{
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
{
sb.Append(c);
break;
}
case '.':
{
if (isDouble)
{
numKind = TokenId.Double;
tokens.Add(new Token(numKind, sb.ToString()));
goto readLoop;
}
c = src.Read();
if (c < '0' || c > '9')
{
tokens.Add(new Token(numKind, sb.ToString()));
goto readLoop;
}
else
{
sb.Append('.');
sb.Append(c);
numKind = TokenId.Double;
isDouble = true;
}
break;
}
default:
{
isNum = false;
break;
}
}
if (isNum) c = src.Read();
}
tokens.Add(new Token(numKind, sb.ToString()));
isNum = false;
break;
}
#endregion
#region 关键字和变量
default:
{
switch (c)
{
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_':
{
sb.Length = 0;
sb.Append(c);
c = src.Read();
bool endIdent = false;
bool possibleKeyword = true;
while (c != '\0' && !endIdent)
{
switch (c)
{
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
{
sb.Append(c);
c = src.Read();
break;
}
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
{
possibleKeyword = false;
sb.Append(c);
c = src.Read();
break;
}
default:
{
endIdent = true;
break;
}
}
}
string identText = sb.ToString();
sb.Length = 0;
bool isKeyword = possibleKeyword ? _keyWords.ContainsKey(identText) : false;
if (isKeyword)
{
tokens.Add(new Token(_keyWords[identText]));
}
else
{
tokens.Add(new Token(TokenId.Ident, identText));
}
break;
}
default:
{
tokens.Add(new Token(TokenId.Invalid));
c = src.Read();
break;
}
}
break;
}
#endregion
}
}
return tokens;
}
private IList<Token> ReadTokens(char[] chars)
{
IList<Token> tokens = new List<Token>();
StringBuilder sb = new StringBuilder();
CharReader src = new CharReader(chars);
char c = src.Read();
readLoop:
while (c != '\0')
{
switch (c)
{
#region 空白字符
case '\t':
{
c = src.Read();
while (c == '\t') { c = src.Read(); }
break;
}
case ' ':
{
c = src.Read();
while (c == ' ') { c = src.Read(); }
break;
}
case '\r':
{
c = src.Read();
while (c == '\r') { c = src.Read(); }
break;
}
case '\n':
{
c = src.Read();
while (c == '\n') { c = src.Read(); }
break;
}
#endregion
#region 运算符
case '[':
{
tokens.Add(new Token(TokenId.LBracket));
c = src.Read();
break;
}
case ']':
{
tokens.Add(new Token(TokenId.RBracket));
c = src.Read();
break;
}
case '{':
{
tokens.Add(new Token(TokenId.LCurly));
c = src.Read();
break;
}
case '}':
{
tokens.Add(new Token(TokenId.RCurly));
c = src.Read();
break;
}
case ':':
{
tokens.Add(new Token(TokenId.Colon));
c = src.Read();
break;
}
case ',':
{
tokens.Add(new Token(TokenId.Comma));
c = src.Read();
break;
}
#endregion
#region 字符串
case '@':
case '\'':
case '"':
{
bool isVerbatim = false;
if (c == '@')
{
isVerbatim = true;
c = src.Read(); // skip to follow quote
}
sb.Length = 0;
char quote = c;
bool isSingleQuote = (c == '\'');
c = src.Read();
while (!src.Eof)
{
if (c == '\\' && !isVerbatim) // normal escaped chars
{
c = src.Read();
switch (c)
{
case '\0':
{
if (src.Eof)
{
goto readLoop;
}
else
{
sb.Append('\0');
c = src.Read();
break;
}
}
case 'a':
{
sb.Append('\a');
c = src.Read();
break;
}
case 'b':
{
sb.Append('\b');
c = src.Read();
break;
}
case 'f':
{
sb.Append('\f');
c = src.Read();
break;
}
case 'n':
{
sb.Append('\n');
c = src.Read();
break;
}
case 'r':
{
sb.Append('\r');
c = src.Read();
break;
}
case 't':
{
sb.Append('\t');
c = src.Read();
break;
}
case 'v':
{
sb.Append('\v');
c = src.Read();
break;
}
case '\\':
{
sb.Append('\\');
c = src.Read();
break;
}
case '\'':
{
sb.Append('\'');
c = src.Read();
break;
}
case '\"':
{
// strings are always stored as verbatim for now, so the double quote is needed
sb.Append("\"\"");
c = src.Read();
break;
}
default:
{
sb.Append(c);
break;
}
}
}
else if (c == '\"')
{
c = src.Read();
// two double quotes are escapes for quotes in verbatim mode
if (c == '\"' && isVerbatim)// verbatim escape
{
sb.Append("\"\"");
c = src.Read();
}
else if (isSingleQuote)
{
sb.Append('\"');
}
else
{
break;
}
}
else // non escaped
{
if (c == quote)
{
break;
}
sb.Append(c);
c = src.Read();
}
}
if (c != '\0')
{
if (c == quote)
{
c = src.Read(); // skip last quote
}
tokens.Add(new Token(TokenId.String, sb.ToString()));
}
break;
}
#endregion
#region 数字
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '.':
case '+':
case '-':
{
sb.Length = 0;
TokenId numKind = TokenId.Int;
bool isDouble = false;
if (c == '.')
{
c = src.Read();
if (c < '0' || c > '9')
{
break;
}
else
{
sb.Append('.');
numKind = TokenId.Double;
isDouble = true;
}
}
else if (c == '+')
{
c = src.Read();
}
else if (c == '-')
{
sb.Append('-');
c = src.Read();
}
bool isNum = true;
while (isNum)
{
switch (c)
{
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
{
sb.Append(c);
break;
}
case '.':
{
if (isDouble)
{
numKind = TokenId.Double;
tokens.Add(new Token(numKind, sb.ToString()));
goto readLoop;
}
c = src.Read();
if (c < '0' || c > '9')
{
tokens.Add(new Token(numKind, sb.ToString()));
goto readLoop;
}
else
{
sb.Append('.');
sb.Append(c);
numKind = TokenId.Double;
isDouble = true;
}
break;
}
default:
{
isNum = false;
break;
}
}
if (isNum) c = src.Read();
}
tokens.Add(new Token(numKind, sb.ToString()));
isNum = false;
break;
}
#endregion
#region 关键字和变量
default:
{
switch (c)
{
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_':
{
sb.Length = 0;
sb.Append(c);
c = src.Read();
bool endIdent = false;
bool possibleKeyword = true;
while (c != '\0' && !endIdent)
{
switch (c)
{
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
{
sb.Append(c);
c = src.Read();
break;
}
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
{
possibleKeyword = false;
sb.Append(c);
c = src.Read();
break;
}
default:
{
endIdent = true;
break;
}
}
}
string identText = sb.ToString();
sb.Length = 0;
bool isKeyword = possibleKeyword ? _keyWords.ContainsKey(identText) : false;
if (isKeyword)
{
tokens.Add(new Token(_keyWords[identText]));
}
else
{
tokens.Add(new Token(TokenId.Ident, identText));
}
break;
}
default:
{
tokens.Add(new Token(TokenId.Invalid));
c = src.Read();
break;
}
}
break;
}
#endregion
}
}
return tokens;
}
关于JSON的定义请参看http://www.json.org/,要实现JSON的分析,我们要确定其基本的定义,而它的定义相对比较简单,它只有 Object、Array、String、Number和Value五种形式。而根据这几种对象的表示的边界条件,我们知道我们要分析的字符主要有:'{', '}', '[', ']', ':', ',', '"'等,对这些的分析的实现代码如上。
完整代码如下:/Files/afxcn/MiniJSON.rar
下一篇: