Htmlparser是一个分析html内容的很好用的工具。
htmlparser是一个纯java编写的html解析库,它不依赖于其它的java库文件,主要用于改造或提取html。
它能超高速解析html,而且不会出错。现在htmlparser最新版本为2.0。
毫不夸张地说,htmlparser就是目前最好的html解析和分析的工具。
无论你是想抓取网页数据还是改造html的内容,用了htmlparser绝对会忍不住称赞。
附上一点儿代码,留个纪念。(⊙_⊙)
package home.study.htmlparser.main; import java.io.IOException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.HasChildFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.visitors.TextExtractingVisitor; public class HtmlParserStudy { private static String URL_ADDRESS = "http://localhost:8080/htmlparsertest/HTMLParserTester.html"; public static void main(String[] args) { try { Parser parser = createParser(); getEverythingInTags(parser); getNodeInformation(parser); getNodeInfoByFilter(parser); } catch (ParserException e) { e.printStackTrace(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } /** * Normal way to get a parser * * @return parser * @throws ParserException * @throws MalformedURLException * @throws IOException */ public static Parser createParser() throws ParserException, MalformedURLException, IOException { Parser parser = new Parser((HttpURLConnection) (new URL(URL_ADDRESS)).openConnection()); return parser; } /** * The normal way to visit the content in page * * @param parser * @throws ParserException */ public static void getEverythingInTags(Parser parser) throws ParserException { // create a visitor TextExtractingVisitor visitor = new TextExtractingVisitor(); // visit all the nodes in the parser given html page // there are several kinds of NodeVisitors parser.visitAllNodesWith(visitor); // get the text inside the tags String textInPage = visitor.getExtractedText(); System.out.println(textInPage); } /** * Get the node then use it * * @param parser * @throws ParserException */ public static void getNodeInformation(Parser parser) throws ParserException { NodeList nodesHaveChildren = new NodeList(); while 
(parser.elements().hasMoreNodes()) { Node currentNode = parser.elements().nextNode(); NodeFilter aFilter = new TagNameFilter("a"); NodeFilter hasChild = new HasChildFilter(aFilter); currentNode.collectInto(nodesHaveChildren, hasChild); } System.out.println(nodesHaveChildren.toHtml()); } /** * NodeFilter contains AndFilter, OrFilter, CssSelectorNodeFilter, * HasAttributeFilter, HasChildFilter, HasParentFilter, HasSiblingFilter, * TagNameFilter and so on * * @param parser * @throws ParserException */ public static void getNodeInfoByFilter(Parser parser) throws ParserException { TagNameFilter trTag = new TagNameFilter("tr"); NodeList trNodeList = parser.extractAllNodesThatMatch(trTag); for (int i = 0; i < trNodeList.size(); i++) { System.out.println(i + " node :"); System.out.println(trNodeList.elementAt(i).toHtml()); } } }