Dom4j解析XML中遇到的一些问题

最近在用Dom4j解析XML文件，遇到了一些问题，记录如下：

1. BOM头问题，得到的异常是：

Nested exception: org.xml.sax.SAXParseException: Content is not allowed in prolog.

（1）解决办法：http://koti.mbnet.fi/akini/java/unicodereader/ 提供了两个可以识别并跳过BOM头的类（我用了第一个）：

UnicodeInputStream

  1 /**
  2  version: 1.1 / 2007-01-25
  3  - changed BOM recognition ordering (longer boms first)
  4 
  5  Original pseudocode   : Thomas Weidenfeller
  6  Implementation tweaked: Aki Nieminen
  7 
  8  http://www.unicode.org/unicode/faq/utf_bom.html
  9  BOMs in byte length ordering:
 10    00 00 FE FF    = UTF-32, big-endian
 11    FF FE 00 00    = UTF-32, little-endian
 12    EF BB BF       = UTF-8,
 13    FE FF          = UTF-16, big-endian
 14    FF FE          = UTF-16, little-endian
 15 
 16  Win2k Notepad:
 17    Unicode format = UTF-16LE
 18 
 19 This class will help you to autorecognize and skip BOMs. This will support UTF-8 as well.
 20 ***/
 21 
 22 import java.io.*;
 23 
 /**
  * An input stream that recognizes a leading Unicode byte order mark (BOM)
  * and removes it from the data handed to the caller.
  *
  * <p>The BOM is only inspected (and skipped) when {@link #getEncoding()} is
  * called before any of the {@code read(...)} methods. Once a read has
  * happened, no BOM detection is attempted any more.
  *
  * <p>Usage pattern:
  * <pre>
  *   String enc = "ISO-8859-1"; // or null to use the system default
  *   FileInputStream fis = new FileInputStream(file);
  *   UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
  *   enc = uin.getEncoding(); // check and skip possible BOM bytes
  *   InputStreamReader in;
  *   if (enc == null) in = new InputStreamReader(uin);
  *   else in = new InputStreamReader(uin, enc);
  * </pre>
  */
 public class UnicodeInputStream extends InputStream {
     /** Length in bytes of the longest BOM (UTF-32). */
     private static final int BOM_SIZE = 4;

     PushbackInputStream internalIn;
     boolean             isInited = false;
     String              defaultEnc;
     String              encoding;

     /**
      * Wraps the given stream.
      *
      * @param in         stream to read from
      * @param defaultEnc encoding reported by {@link #getEncoding()} when the
      *                   stream carries no BOM; may be {@code null}
      */
     public UnicodeInputStream(InputStream in, String defaultEnc) {
         internalIn = new PushbackInputStream(in, BOM_SIZE);
         this.defaultEnc = defaultEnc;
     }

     /**
      * @return the encoding that was passed to the constructor
      */
     public String getDefaultEncoding() {
         return defaultEnc;
     }

     /**
      * Detects the encoding from the BOM (skipping the BOM bytes) on the first
      * call, provided nothing has been read from this stream yet.
      *
      * @return the encoding named by the BOM, the default encoding when no BOM
      *         was found, or {@code null} if a read happened before detection
      * @throws IllegalStateException if reading the first bytes fails; the
      *         underlying {@link IOException} is attached as the cause
      */
     public String getEncoding() {
         if (!isInited) {
             try {
                 init();
             } catch (IOException ex) {
                 // The original code called initCause() with the exception itself,
                 // which throws IllegalArgumentException and hides the real failure.
                 throw new IllegalStateException("Init method failed.", ex);
             }
         }
         return encoding;
     }

     /**
      * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
      * not part of a BOM are pushed back, so only the BOM itself is skipped.
      *
      * @throws IOException if the underlying stream cannot be read
      */
     protected void init() throws IOException {
         if (isInited) {
             return;
         }

         byte[] bom = new byte[BOM_SIZE];

         // A single read() may return fewer bytes than requested, so keep
         // reading until the buffer is full or the stream ends.
         int n = 0;
         while (n < bom.length) {
             int count = internalIn.read(bom, n, bom.length - n);
             if (count <= 0) {
                 break;
             }
             n += count;
         }

         // Longer BOMs are tested first. Each test also checks that enough
         // bytes were really read, otherwise the zero padding of a short
         // stream could be mistaken for BOM bytes (FF FE -> "UTF-32LE").
         int bomLength;
         if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
             encoding = "UTF-32BE";
             bomLength = 4;
         } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                 && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
             encoding = "UTF-32LE";
             bomLength = 4;
         } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                 && bom[2] == (byte) 0xBF) {
             encoding = "UTF-8";
             bomLength = 3;
         } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
             encoding = "UTF-16BE";
             bomLength = 2;
         } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
             encoding = "UTF-16LE";
             bomLength = 2;
         } else {
             // Unicode BOM mark not found, unread all bytes
             encoding = defaultEnc;
             bomLength = 0;
         }

         if (n > bomLength) {
             internalIn.unread(bom, bomLength, n - bomLength);
         }

         isInited = true;
     }

     /**
      * Closes the underlying stream. BOM detection is disabled afterwards.
      */
     @Override
     public void close() throws IOException {
         isInited = true;
         internalIn.close();
     }

     /**
      * Reads one byte. Calling this before {@link #getEncoding()} disables BOM
      * detection, so a BOM (if any) is returned as ordinary data.
      */
     @Override
     public int read() throws IOException {
         isInited = true;
         return internalIn.read();
     }

     /**
      * Bulk read that delegates to the underlying stream instead of falling
      * back to the byte-at-a-time default of {@link InputStream}. Like
      * {@link #read()}, it disables BOM detection.
      */
     @Override
     public int read(byte[] b, int off, int len) throws IOException {
         isInited = true;
         return internalIn.read(b, off, len);
     }
 }

UnicodeReader

  1 /**
  2  version: 1.1 / 2007-01-25
  3  - changed BOM recognition ordering (longer boms first)
  4 
  5  Original pseudocode   : Thomas Weidenfeller
  6  Implementation tweaked: Aki Nieminen
  7 
  8  http://www.unicode.org/unicode/faq/utf_bom.html
  9  BOMs:
 10    00 00 FE FF    = UTF-32, big-endian
 11    FF FE 00 00    = UTF-32, little-endian
 12    EF BB BF       = UTF-8,
 13    FE FF          = UTF-16, big-endian
 14    FF FE          = UTF-16, little-endian
 15 
 16  Win2k Notepad:
 17    Unicode format = UTF-16LE
 18 
 19 This class will do everything even more transparently. Just instantiate it and read text.
 20 ***/
 21 
 22 import java.io.*;
 23 
 /**
  * Generic Unicode text reader that uses a leading byte order mark (BOM) to
  * pick the character encoding. If no BOM is found, the given default
  * encoding is used, or the system encoding when the default is {@code null}.
  * The BOM itself is never returned to the caller.
  */
 public class UnicodeReader extends Reader {
     /** Length in bytes of the longest BOM (UTF-32). */
     private static final int BOM_SIZE = 4;

     PushbackInputStream internalIn;
     InputStreamReader   internalIn2 = null;
     String              defaultEnc;

     /**
      * @param in         input stream to be read
      * @param defaultEnc default encoding if the stream does not have a BOM
      *                   marker. Give {@code null} to use the system-level
      *                   default.
      */
     public UnicodeReader(InputStream in, String defaultEnc) {
         internalIn = new PushbackInputStream(in, BOM_SIZE);
         this.defaultEnc = defaultEnc;
     }

     /**
      * @return the encoding that was passed to the constructor
      */
     public String getDefaultEncoding() {
         return defaultEnc;
     }

     /**
      * Returns the encoding of the stream, or {@code null} if the reader has
      * not been initialized yet. Call {@link #init()} or a read method first.
      */
     public String getEncoding() {
         if (internalIn2 == null) {
             return null;
         }
         return internalIn2.getEncoding();
     }

     /**
      * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
      * not part of a BOM are pushed back, so only the BOM itself is skipped.
      * Afterwards the decoding reader is created with the detected encoding.
      *
      * @throws IOException if the stream cannot be read or the encoding is
      *         not supported
      */
     protected void init() throws IOException {
         if (internalIn2 != null) {
             return;
         }

         byte[] bom = new byte[BOM_SIZE];

         // A single read() may return fewer bytes than requested, so keep
         // reading until the buffer is full or the stream ends.
         int n = 0;
         while (n < bom.length) {
             int count = internalIn.read(bom, n, bom.length - n);
             if (count <= 0) {
                 break;
             }
             n += count;
         }

         // Longer BOMs are tested first. Each test also checks that enough
         // bytes were really read, otherwise the zero padding of a short
         // stream could be mistaken for BOM bytes (FF FE -> "UTF-32LE").
         String encoding;
         int bomLength;
         if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
             encoding = "UTF-32BE";
             bomLength = 4;
         } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                 && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
             encoding = "UTF-32LE";
             bomLength = 4;
         } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                 && bom[2] == (byte) 0xBF) {
             encoding = "UTF-8";
             bomLength = 3;
         } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
             encoding = "UTF-16BE";
             bomLength = 2;
         } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
             encoding = "UTF-16LE";
             bomLength = 2;
         } else {
             // Unicode BOM mark not found, unread all bytes
             encoding = defaultEnc;
             bomLength = 0;
         }

         if (n > bomLength) {
             internalIn.unread(bom, bomLength, n - bomLength);
         }

         // Use the detected/given encoding, or the system default for null.
         if (encoding == null) {
             internalIn2 = new InputStreamReader(internalIn);
         } else {
             internalIn2 = new InputStreamReader(internalIn, encoding);
         }
     }

     /**
      * Closes the reader and the underlying stream. If nothing has been read
      * yet, the stream is closed directly without trying to detect a BOM, so
      * a stream that cannot be read is still released.
      */
     @Override
     public void close() throws IOException {
         if (internalIn2 != null) {
             internalIn2.close();
         } else {
             internalIn.close();
         }
     }

     /**
      * Reads decoded characters, detecting and skipping the BOM on first use.
      */
     @Override
     public int read(char[] cbuf, int off, int len) throws IOException {
         init();
         return internalIn2.read(cbuf, off, len);
     }

 }

（2）下面的代码演示了产生该异常的另一类原因：XML声明中的编码（declared）与内容实际使用的编码（actual）不一致：

ContentNotAllowedInProlog

import java.io.*;
import java.nio.charset.Charset;
import javax.xml.parsers.*;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Small demo that shows which parser errors appear when the encoding declared
 * in the XML declaration differs from the encoding the bytes were actually
 * written in.
 */
public class ContentNotAllowedInProlog {
  /**
   * Parses the stream with a default SAX parser and a no-op handler; only the
   * success or failure of the parse matters here.
   */
  private static void parse(InputStream stream) throws SAXException,
      ParserConfigurationException, IOException {
    SAXParserFactory.newInstance().newSAXParser().parse(stream,
        new DefaultHandler());
  }

  /**
   * Tries every (actual, declared) pair of different encodings and prints one
   * line per pair: either the parser's error message, or "HIDDEN ERROR!" when
   * the mismatch went unnoticed by the parser.
   */
  public static void main(String[] args) {
    String[] encodings = { "UTF-8", "UTF-16", "ISO-8859-1" };
    for (String actual : encodings) {
      for (String declared : encodings) {
        // Compare by value; reference comparison only worked because both
        // loops iterate over the very same array elements.
        if (!actual.equals(declared)) {
          String xml = "<?xml version='1.0' encoding='" + declared
              + "'?><x/>";
          byte[] encoded = xml.getBytes(Charset.forName(actual));
          try {
            parse(new ByteArrayInputStream(encoded));
            System.out.println("HIDDEN ERROR! actual:" + actual + " " + xml);
          } catch (Exception e) {
            System.out.println(e.getMessage() + " actual:" + actual + " xml:"
                + xml);
          }
        }
      }
    }
  }
}

2. 注释问题，得到的异常是：

Nested exception: org.xml.sax.SAXParseException: The string "--" is not permitted within comments.

产生这个异常的原因有很多。其中一个是注释中间有“--”字符；另外一个是“-->”前面不是空格，如“abc-->”会抛出异常，而“abc -->”则不会。（按XML规范，注释内容中不允许出现“--”，因此注释内容也不能以“-”结尾。）

我的解决方式就是：删掉所有注释！

针对这两个问题，写了个简单工具类：

XmlUtil

  1 import java.io.BufferedReader;
  2 import java.io.FileInputStream;
  3 import java.io.FileNotFoundException;
  4 import java.io.FileOutputStream;
  5 import java.io.IOException;
  6 import java.io.InputStream;
  7 import java.io.InputStreamReader;
  8 import java.io.OutputStreamWriter;
  9 import java.io.PushbackInputStream;
 10 import java.io.UnsupportedEncodingException;
 11 import java.util.HashMap;
 12 import java.util.List;
 13 import org.dom4j.DocumentHelper;
 14 import org.dom4j.Element;
 15 import org.dom4j.XPath;
 16 import org.slf4j.Logger;
 17 import org.slf4j.LoggerFactory;
 18 
 19 public class XmlUtil extends InputStream {
 20     private static final Logger log = (Logger) LoggerFactory
 21             .getLogger(XmlUtil.class);
 22     private static final int BOM_SIZE = 4;
 23     PushbackInputStream internalIn;
 24     boolean isInited = false;
 25     String defaultEnc;
 26     String encoding;
 27 
 28     public XmlUtil(InputStream in, String defaultEnc) {
 29         internalIn = new PushbackInputStream(in, BOM_SIZE);
 30         this.defaultEnc = defaultEnc;
 31     }
 32 
 33     public String getDefaultEncoding() {
 34         return defaultEnc;
 35     }
 36 
 37     /**
 38      * Read-ahead four bytes and check for BOM marks. Extra bytes are unread
 39      * back to the stream, only BOM bytes are skipped.
 40      */
 41     protected void initXmlBOM() throws IOException {
 42         if (isInited)
 43             return;
 44 
 45         byte bom[] = new byte[BOM_SIZE];
 46         int n, unread;
 47         n = internalIn.read(bom, 0, bom.length);
 48 
 49         if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00)
 50                 && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
 51             encoding = "UTF-32BE";
 52             unread = n - 4;
 53         } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)
 54                 && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
 55             encoding = "UTF-32LE";
 56             unread = n - 4;
 57         } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
 58                 && (bom[2] == (byte) 0xBF)) {
 59             encoding = "UTF-8";
 60             unread = n - 3;
 61         } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
 62             encoding = "UTF-16BE";
 63             unread = n - 2;
 64         } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
 65             encoding = "UTF-16LE";
 66             unread = n - 2;
 67         } else {
 68             // Unicode BOM mark not found, unread all bytes
 69             encoding = defaultEnc;
 70             unread = n;
 71         }
 72         // log.info("read=" + n + ", unread=" + unread);
 73 
 74         if (unread > 0)
 75             internalIn.unread(bom, (n - unread), unread);
 76 
 77         isInited = true;
 78     }
 79 
 80     public String getEncoding() {
 81         if (!isInited) {
 82             try {
 83                 initXmlBOM();
 84             } catch (IOException ex) {
 85                 IllegalStateException ise = new IllegalStateException(
 86                         "Init method failed.");
 87                 ise.initCause(ise);
 88                 throw ise;
 89             }
 90         }
 91         return encoding;
 92     }
 93 
 94     public static void removeXmlBomAndComment(String filePath) {
 95         XmlUtil uins = null;
 96         BufferedReader bufr = null;
 97         OutputStreamWriter osw = null;
 98         String enc = "ISO-8859-1";
 99 
100         String fileContent = "";
101         String leftBracket = "<!--";
102         String rightBracket = "-->";
103         int leftBracketIndex = 0;
104         int rightBracketIndex = 0;
105 
106         String line = "";
107         StringBuffer fileContentBuffer = new StringBuffer();
108         try {
109             // 根据BOM Mark编码方式，对文件进行重新编码
110             uins = new XmlUtil(new FileInputStream(filePath), enc);
111             enc = uins.getEncoding();
112 
113             if (enc == null) {
114                 bufr = new BufferedReader(new InputStreamReader(uins));
115             } else {
116                 bufr = new BufferedReader(new InputStreamReader(uins, enc));
117             }
118 
119             while ((line = bufr.readLine()) != null) {
120                 fileContentBuffer.append(line);
121             }
122             uins.close();
123             bufr.close();
124 
125             // 删除"<!-- -->"格式的注释
126             fileContent = fileContentBuffer.toString();
127             leftBracketIndex = fileContent.indexOf(leftBracket);
128             rightBracketIndex = fileContent.indexOf(rightBracket);
129             while (leftBracketIndex < rightBracketIndex
130                     && rightBracketIndex != 0) {
131                 fileContent = fileContent.substring(0, leftBracketIndex)
132                         + fileContent.substring(rightBracketIndex + 3,
133                                 fileContent.length());
134                 leftBracketIndex = fileContent.indexOf(leftBracket);
135                 rightBracketIndex = fileContent.indexOf(rightBracket);
136             }
137 
138             // 将处理过的内容，写入文件
139             osw = new OutputStreamWriter(new FileOutputStream(filePath));
140             osw.write(fileContent);
141             osw.flush();
142             osw.close();
143 
144         } catch (FileNotFoundException e) {
145             e.printStackTrace();
146         } catch (UnsupportedEncodingException e) {
147             e.printStackTrace();
148         } catch (IOException e) {
149             e.printStackTrace();
150         } finally {
151             if (uins != null) {
152                 try {
153                     uins.close();
154                 } catch (IOException e) {
155                     e.printStackTrace();
156                 }
157             }
158 
159             if (bufr != null) {
160                 try {
161                     bufr.close();
162                 } catch (IOException e) {
163                     e.printStackTrace();
164                 }
165             }
166 
167             if (osw != null) {
168                 try {
169                     osw.close();
170                 } catch (IOException e) {
171                     e.printStackTrace();
172                 }
173             }
174         }
175     }
176 
177     /**
178      * 如果根元素有声明命名空间，通过xpath匹配子元素时，需要特殊处理。
179      * */
180     public static List<Element> getNameSpaceElement(Element root, String node) {
181         // 获得节点的命名空间
182         HashMap<String, String> map = new HashMap<String, String>();
183         map.put("mvn", root.getNamespaceURI());
184         XPath xpath = DocumentHelper.createXPath("//mvn:" + node);
185         xpath.setNamespaceURIs(map);
186 
187         @SuppressWarnings("unchecked")
188         List<Element> selectedNodes = (List<Element>) xpath.selectNodes(root
189                 .getDocument());
190         return selectedNodes;
191     }
192 
193     @Override
194     public void close() throws IOException {
195         // init();
196         isInited = true;
197         internalIn.close();
198     }
199 
200     @Override
201     public int read() throws IOException {
202         // init();
203         isInited = true;
204         return internalIn.read();
205     }
206 }