SQL分词器1.10版

处理SQL及分词效果:

select * from ( select rownum as rn,tb1.stuid,tb1.summary from ( select stuid,sum(score) as summary from gk_score group by stuid order by summary desc ) tb1 order by tb1.summary desc ) tb2 where rn<11
Index                Type No              Text                 Type Desc            
------------------------------------------------------------------------------------
0                    1                    select               KW:select            
1                    2                    *                    Text                 
2                    4                    from                 KW:from              
3                    18                   (                    (                    
4                    1                    select               KW:select            
5                    2                    rownum               Text                 
6                    13                   as                   KW:as                
7                    2                    rn                   Text                 
8                    3                    ,                    Comma                
9                    2                    tb1.stuid            Text                 
10                   3                    ,                    Comma                
11                   2                    tb1.summary          Text                 
12                   4                    from                 KW:from              
13                   18                   (                    (                    
14                   1                    select               KW:select            
15                   2                    stuid                Text                 
16                   3                    ,                    Comma                
17                   2                    sum                  Text                 
18                   18                   (                    (                    
19                   2                    score                Text                 
20                   19                   )                    )                    
21                   13                   as                   KW:as                
22                   2                    summary              Text                 
23                   4                    from                 KW:from              
24                   2                    gk_score             Text                 
25                   14                   group                KW:group             
26                   10                   by                   KW:by                
27                   2                    stuid                Text                 
28                   9                    order                KW:order             
29                   10                   by                   KW:by                
30                   2                    summary              Text                 
31                   11                   desc                 KW:asc               
32                   19                   )                    )                    
33                   2                    tb1                  Text                 
34                   9                    order                KW:order             
35                   10                   by                   KW:by                
36                   2                    tb1.summary          Text                 
37                   11                   desc                 KW:asc               
38                   19                   )                    )                    
39                   2                    tb2                  Text                 
40                   5                    where                KW:where             
41                   2                    rn                   Text                 
42                   16                   <                    <                    
43                   2                    11                   Text                 

程序:

package com.heyang.easysql.lex10;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

class Token{
    static final int TYPE_SELECT=1;
    static final int TYPE_TEXT=2;
    static final int TYPE_COMMA=3;
    static final int TYPE_FROM=4;
    static final int TYPE_WHERE=5;
    static final int TYPE_AND=6;
    static final int TYPE_EQUAL=7;
    static final int TYPE_OR=8;
    static final int TYPE_ORDER=9;
    static final int TYPE_BY=10;
    static final int TYPE_ASC=11;
    static final int TYPE_DESC=12;
    static final int TYPE_AS=13;
    static final int TYPE_GROUP=14;
    static final int TYPE_HAVING=15;
    static final int TYPE_LESSTHAN=16;
    static final int TYPE_GREATERTHAN=17;
    static final int TYPE_OPEN_PARENTHESIS=18;
    static final int TYPE_CLOSE_PARENTHESIS=19;
    static final int TYPE_CONNECT=20;
    static final int TYPE_LESSTHAN_OR_EQUAL=21;
    static final int TYPE_GREATERTHAN_OR_EQUAL=22;
    static final int TYPE_LESSTHAN_OR_GREATERTHAN=23;
    static final int TYPE_CASE=24;
    static final int TYPE_WHEN=25;
    static final int TYPE_THEN=26;
    static final int TYPE_ELSE=27;
    static final int TYPE_END=28;
    static final int TYPE_IS=29;
    static final int TYPE_NULL=30;
    static final int TYPE_TRUE=31;
    static final int TYPE_FALSE=32;
    static final int TYPE_PLUS=33;
    static final int TYPE_MINUS=34;
    //static final int TYPE_MULTI=35;
    static final int TYPE_DEVIDE=36;
    static final int TYPE_DISTINCT=37;
    static final int TYPE_OVER=38;
    static final int TYPE_STRING_CONCAT=39;
    static final int TYPE_ON=40;
    static final int TYPE_JOIN=41;
    static final int TYPE_INNER=42;
    static final int TYPE_LEFT=43;
    static final int TYPE_RIGHT=44;
    static final int TYPE_OUTER=45;
    static final int TYPE_FULL=46;
    static final int TYPE_WITHIN=47;
    static final int TYPE_PARTITION=48;
    
    int type;
    String text;
    
    public Token(char c,int type) {
        this.text=String.valueOf(c);
        this.type=type;
    }
    
    public Token(String word,int type) {
        this.text=word;
        this.type=type;
    }
    
    public String getTypeStr() {
        if(type==TYPE_SELECT) {
            return "KW:select";
        }else if(type==TYPE_FROM) {
            return "KW:from";
        }else if(type==TYPE_COMMA) {
            return "Comma";
        }else if(type==TYPE_TEXT) {
            return "Text";
        }else if(type==TYPE_WHERE) {
            return "KW:where";
        }else if(type==TYPE_AND) {
            return "KW:and";
        }else if(type==TYPE_EQUAL) {
            return "=";
        }else if(type==TYPE_OR) {
            return "KW:or";
        }else if(type==TYPE_ORDER) {
            return "KW:order";
        }else if(type==TYPE_BY) {
            return "KW:by";
        }else if(type==TYPE_ASC) {
            return "KW:asc";
        }else if(type==TYPE_DESC) {
            return "KW:desc";
        }else if(type==TYPE_AS) {
            return "KW:as";
        }else if(type==TYPE_GROUP) {
            return "KW:group";
        }else if(type==TYPE_HAVING) {
            return "KW:having";
        }else if(type==TYPE_LESSTHAN) {
            return "<";
        }else if(type==TYPE_GREATERTHAN) {
            return ">";
        }else if(type==TYPE_OPEN_PARENTHESIS) {
            return "(";
        }else if(type==TYPE_CLOSE_PARENTHESIS) {
            return ")";
        }else if(type==TYPE_CONNECT) {
            return "KW:connect";
        }else if(type==TYPE_LESSTHAN_OR_EQUAL) {
            return "<=";
        }else if(type==TYPE_GREATERTHAN_OR_EQUAL) {
            return ">=";
        }else if(type==TYPE_LESSTHAN_OR_GREATERTHAN) {
            return "<>";
        }else if(type==TYPE_CASE) {
            return "KW:case";
        }else if(type==TYPE_WHEN) {
            return "KW:when";
        }else if(type==TYPE_THEN) {
            return "KW:then";
        }else if(type==TYPE_ELSE) {
            return "KW:else";
        }else if(type==TYPE_END) {
            return "KW:end";
        }else if(type==TYPE_IS) {
            return "KW:is";
        }else if(type==TYPE_NULL) {
            return "KW:null";
        }else if(type==TYPE_TRUE) {
            return "KW:true";
        }else if(type==TYPE_FALSE) {
            return "KW:false";
        }else if(type==TYPE_PLUS) {
            return "+";
        }else if(type==TYPE_MINUS) {
            return "-";
        }else if(type==TYPE_DEVIDE) {
            return "/";
        }else if(type==TYPE_DISTINCT) {
            return "KW:distinct";
        }else if(type==TYPE_OVER) {
            return "KW:over";
        }else if(type==TYPE_STRING_CONCAT) {
            return "||";
        }else if(type==TYPE_ON) {
            return "KW:on";
        }else if(type==TYPE_JOIN) {
            return "KW:join";
        }else if(type==TYPE_INNER) {
            return "KW:inner";
        }else if(type==TYPE_LEFT) {
            return "KW:left";
        }else if(type==TYPE_RIGHT) {
            return "KW:right";
        }else if(type==TYPE_OUTER) {
            return "KW:outer";
        }else if(type==TYPE_FULL) {
            return "KW:full";
        }else if(type==TYPE_WITHIN) {
            return "KW:within";
        }else if(type==TYPE_PARTITION) {
            return "KW:partition";
        }
        
        return null;
    }
}

public class Lexer {
    private List<Token> tokenList;
    
    public Lexer(String inputSql) {
        String sql=pretreat(inputSql);
        String swallowed="";
        
        tokenList=new ArrayList<Token>();
        for(int i=0;i<sql.length();i++){
            char c=sql.charAt(i);
            
            if(Character.isWhitespace(c)){
                addTextToList(swallowed);
                swallowed="";
            }else if(c==','){
                addTextToList(swallowed);
                swallowed="";
                tokenList.add(new Token(c,Token.TYPE_COMMA));
            }else if(c=='='){
                addTextToList(swallowed);
                swallowed="";
                tokenList.add(new Token(c,Token.TYPE_EQUAL));
            }else if(c=='<'){
                int next=i+1;
                if(next<sql.length() && sql.charAt(next)=='=') {
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token("<=",Token.TYPE_LESSTHAN_OR_EQUAL));
                    i++;
                }else if(next<sql.length() && sql.charAt(next)=='>') {
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token("<>",Token.TYPE_LESSTHAN_OR_GREATERTHAN));
                    i++;
                }else {
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_LESSTHAN));
                }
            }else if(c=='>'){
                int next=i+1;
                if(next<sql.length() && sql.charAt(next)=='=') {
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(">=",Token.TYPE_GREATERTHAN_OR_EQUAL));
                    i++;
                }else {
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_GREATERTHAN));
                }
            }else if(c=='|'){
                int next=i+1;
                if(next<sql.length() && sql.charAt(next)=='|') {
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token("||",Token.TYPE_STRING_CONCAT));
                    i++;
                }
            }else if(c=='('){
                addTextToList(swallowed);
                swallowed="";
                tokenList.add(new Token(c,Token.TYPE_OPEN_PARENTHESIS));
            }else if(c==')'){
                addTextToList(swallowed);
                swallowed="";
                tokenList.add(new Token(c,Token.TYPE_CLOSE_PARENTHESIS));
            }else if(c=='+'){
                addTextToList(swallowed);
                swallowed="";
                tokenList.add(new Token(c,Token.TYPE_PLUS));
            }else if(c=='-'){
                addTextToList(swallowed);
                swallowed="";
                tokenList.add(new Token(c,Token.TYPE_MINUS));
            }else if(c=='/'){
                addTextToList(swallowed);
                swallowed="";
                tokenList.add(new Token(c,Token.TYPE_DEVIDE));
            }else {
                swallowed+=c;
            }
        }
    }
    
    private int findTypeByText(String text) {
        Object[][] arr= {
                            {"select", Token.TYPE_SELECT},
                            {"from",   Token.TYPE_FROM},
                            {"where",  Token.TYPE_WHERE},
                            {"and",    Token.TYPE_AND},
                            {"or",     Token.TYPE_OR},
                            {"order",  Token.TYPE_ORDER},
                            {"by",     Token.TYPE_BY},
                            {"asc",    Token.TYPE_ASC},
                            {"desc",   Token.TYPE_ASC},
                            {"asc",    Token.TYPE_DESC},
                            {"as",     Token.TYPE_AS},
                            {"group",  Token.TYPE_GROUP},
                            {"having", Token.TYPE_HAVING},
                            {"connect", Token.TYPE_CONNECT},
                            {"case",   Token.TYPE_CASE},
                            {"when",   Token.TYPE_WHEN},
                            {"then",   Token.TYPE_THEN},
                            {"else",   Token.TYPE_ELSE},
                            {"end",   Token.TYPE_END},
                            {"is",   Token.TYPE_IS},
                            {"null",   Token.TYPE_NULL},
                            {"true",   Token.TYPE_TRUE},
                            {"false",   Token.TYPE_FALSE},
                            {"distinct",   Token.TYPE_DISTINCT},
                            {"over",   Token.TYPE_OVER},
                            {"on",   Token.TYPE_ON},
                            {"join",   Token.TYPE_JOIN},
                            {"inner",   Token.TYPE_INNER},
                            {"left",   Token.TYPE_LEFT},
                            {"right",   Token.TYPE_RIGHT},
                            {"outer",   Token.TYPE_OUTER},
                            {"full",   Token.TYPE_FULL},
                            {"within",   Token.TYPE_WITHIN},
                            {"partition",   Token.TYPE_PARTITION},
                        };
        
        for(Object[] arrInner:arr) {
            String keyword=String.valueOf(arrInner[0]);
            if(keyword.equalsIgnoreCase(text)) {
                return Integer.parseInt(arrInner[1].toString());
            }
        }
        
        return Token.TYPE_TEXT;
    }
    
    private void addTextToList(String text) {
        int type=findTypeByText(text);
        addToken2List(text,type);
    }
    
    private void addToken2List(String text,int type) {
        if(text.trim().length()>0) {
            tokenList.add(new Token(text,type));
        }
    }
    
    public void printTokenList() {
        final String continuousStar = createRepeatedStr("-", 84);
        final String layout = "%-20s %-20s %-20s %-20s %s";
        StringBuilder sb = new StringBuilder();

        sb.append(String.format(layout, "Index", "Type No","Text","Type Desc","
"));
        sb.append(continuousStar + "
");
        int index=0;
        for(Token token:tokenList) {
            sb.append(String.format(layout, String.valueOf(index),String.valueOf(token.type), token.text,token.getTypeStr(),"
"));
            index++;
        }
        
        System.out.println(sb.toString());
    }
    
    private static String createRepeatedStr(String seed, int n) {
        return String.join("", Collections.nCopies(n, seed));
    }
    
    private String pretreat(String raw) {
        return raw.trim()+" ";
    }
    
    public static void main(String[] args)  throws Exception{
        String sql=removeExtraSpace(readSqlFromFile("c:\temp\12.sql"));
        System.out.println(sql);
        new Lexer(sql).printTokenList();;
    }
    
    private static String readSqlFromFile(String filePath) throws Exception{
        StringBuilder sb=new StringBuilder();
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), "UTF-8"));  
        String line = null;  
        while( ( line = br.readLine() ) != null ) {
            sb.append(line);
        }
        br.close();  
        return sb.toString();
    }
    
    private static String removeExtraSpace(String raw) {
        return raw.replaceAll("\s{2,}", " ");
    }
}

--2020年5月13日 16点07分--

原文地址:https://www.cnblogs.com/heyang78/p/12881770.html