HTTPCLIENT抓取网页内容

通过 HttpClient 抓取网页信息,并使用 jsoup 解析抓取到的 HTML。

/**
 * Small example of fetching a web page with Commons HttpClient and pulling
 * table cell text out of the returned HTML with jsoup.
 */
public class SnippetHtml{

	/**
	 * Downloads the page at the given URL with an HTTP GET and returns its body.
	 *
	 * <p>This is a best-effort fetch: I/O and protocol failures are reported on
	 * standard error and an empty string is returned instead of propagating the
	 * exception. The HTTP status code is not inspected, so the body of an error
	 * page is returned like any other body.
	 *
	 * @param url address of the page to fetch; {@code null} or empty yields an empty string
	 * @return the response body as text, never {@code null}; empty when nothing could be fetched
	 */
	public String parseHtml (String url) {
		if (url == null || url.isEmpty()) {
			// Nothing to request; keep the "empty string means no content" contract.
			return "";
		}
		HttpClient client = new HttpClient();
		HttpMethod method = new GetMethod(url);
		try {
			client.executeMethod(method);
			// The library may hand back null when there is no response body;
			// normalise that so callers always receive a string.
			String body = method.getResponseBodyAsString();
			return body != null ? body : "";
		} catch (IOException e) {
			// HttpException is a subclass of IOException, so this single handler
			// covers both protocol-level and transport-level failures.
			System.err.println("Failed to fetch " + url);
			e.printStackTrace();
			return "";
		} finally {
			// Always hand the connection back, whether or not the request succeeded.
			method.releaseConnection();
		}
	}

	/**
	 * Parses the given HTML and prints, one per line on standard output, the
	 * text of every {@code td} element found inside elements whose
	 * {@code class} attribute value is {@code news-table}.
	 *
	 * <p>Despite its name, this method does not build or return any bean
	 * objects; it only prints the cell text.
	 *
	 * @param html HTML markup to parse; {@code null} or empty input is ignored
	 */
	public void getHtmlEarthBean (String html) {
		if (html == null || html.isEmpty()) {
			return;
		}
		Document doc = Jsoup.parse(html);
		// Select the containers whose class attribute value is "news-table".
		Elements tables = doc.getElementsByAttributeValue("class", "news-table");
		for (Element table : tables) {
			// Print the text of each td cell under the matched container.
			for (Element cell : table.getElementsByTag("td")) {
				System.out.println(cell.text());
			}
		}
	}
}


需要下载以下两个 jar 包:commons-httpclient-3.1.jar(用于抓取网页)和 jsoup-1.6.1.jar(用于解析 HTML)。

原文地址:https://www.cnblogs.com/jiangu66/p/3181722.html