import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang3.StringUtils; import org.json.JSONException; import org.json.JSONObject; public class DFA { /** * 根节点 */ private static TreeNode rootNode = new TreeNode(); /** * 创建关键词树时的临时根节点 */ private static TreeNode rootNodeTemp = new TreeNode(); /** * 关键词缓存 */ private static ByteBuffer keywordBuffer = ByteBuffer.allocate(1024); /** * 关键词编码 */ private static String charset = "utf-8"; /** * 关键词树是否初始化 */ private static boolean treeIsInit = false; /** * 创建DFA树 * @param keywordList * @throws UnsupportedEncodingException */ public static void createKeywordTree(List<String> keywordList) throws UnsupportedEncodingException{ for (String keyword : keywordList) { if(keyword == null) continue; keyword = keyword.trim().toLowerCase(); byte[] bytes = keyword.getBytes(charset); TreeNode tempNode = rootNodeTemp; for (int i = 0; i < bytes.length; i++) { int index = bytes[i] & 0xff; TreeNode node = tempNode.getSubNode(index); if(node == null){ node = new TreeNode(); tempNode.setSubNode(index, node); } tempNode = node; if(i == bytes.length - 1){ tempNode.setKeywordEnd(true); } } } synchronized (DFA.class) { rootNode = rootNodeTemp; if(!treeIsInit)treeIsInit = true; } } /** * 检索关键词 * @param text * @return * @throws UnsupportedEncodingException */ public static JSONObject searchKeyword(String text) throws UnsupportedEncodingException{ if(StringUtils.isEmpty(text)){ JSONObject json = new JSONObject(); try { json.put("islegal", "1"); json.put("deststr", ""); } catch (JSONException e) { e.printStackTrace(); } return json; } if(!treeIsInit)createKeywordTree(); System.out.println("searchKeyword:"+text); return searchKeyword(text.getBytes(charset),text.toLowerCase().getBytes(charset)); } /** * 检索关键词 * @param oldBytes * @param bytes * @return */ public static JSONObject searchKeyword(byte[] oldBytes,byte[] bytes){ String is_legal = "1"; JSONObject json = new JSONObject(); String result = null; if(bytes == null || bytes.length == 0){ result = ""; }else{ synchronized (DFA.class) { TreeNode tempNode = rootNode; int rollback = 0; int position = 0; while (position < bytes.length) { int index = bytes[position] & 0xFF; keywordBuffer.put(bytes[position]); tempNode = tempNode.getSubNode(index); if(tempNode == null){ position = position - rollback; rollback = 0; tempNode = rootNode; keywordBuffer.clear(); } else if(tempNode.isKeywordEnd()){ keywordBuffer.flip(); for (int i = 0; i <= rollback; i++) { bytes[position-i] = 42; oldBytes[position-i] = 42; } keywordBuffer.limit(keywordBuffer.capacity()); rollback = 1; is_legal = "0"; }else{ rollback++; } position++; } } try { result = new String(oldBytes,charset); } catch (Exception e) { e.printStackTrace(); } } try { json.put("islegal", is_legal); json.put("deststr", result); } catch (JSONException e) { // TODO Auto-generated catch block e.printStackTrace(); } return json; } public void setCharset(String charset) { this.charset = charset; } public static void main(String[] args){ try{ List<String> keys = new ArrayList<String>(); keys.add("TMD"); keys.add("변형"); String text ="你真TMD是个변형啊!"; DFA.createKeywordTree(keys); JSONObject a = DFA.searchKeyword(text); System.out.println(a); } catch(Exception ex){ ex.printStackTrace(); } } }
import java.util.ArrayList; import java.util.List; public class TreeNode { private static final int NODE_LEN = 256; /** * true 关键词的终结 ; false 继续 */ private boolean end = false; private List<TreeNode> subNodes = new ArrayList<TreeNode>(NODE_LEN); public TreeNode(){ for (int i = 0; i < NODE_LEN; i++) { subNodes.add(i, null); } } /** * 向指定位置添加节点树 * @param index * @param node */ public void setSubNode(int index, TreeNode node){ subNodes.set(index, node); } public TreeNode getSubNode(int index){ //System.out.println("index:"+index+" node:"+this+"--subNodes:"+subNodes.get(index)); return subNodes.get(index); } public boolean isKeywordEnd() { return end; } public void setKeywordEnd(boolean end) { this.end = end; } }