MinerHtmlThread.java 爬取页面线程

MinerHtmlThread.java 爬取页面线程

package com.iteye.injavawetrust.miner;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Page-fetching worker thread.
 *
 * <p>Until {@code MinerMonitorThread.done} becomes true, the thread repeatedly takes a
 * {@link MinerUrl} from the unvisited queue, downloads the page with jsoup and hands the
 * resulting {@link Html} to the store queue and to the queue of pages waiting for URL
 * extraction.
 *
 * @author InJavaWetrust
 */
public class MinerHtmlThread extends Thread {
	
	private static final Log LOG = LogFactory.getLog(MinerHtmlThread.class);
	
	/** Substring a URL must contain before a fetch is attempted. */
	private static final String HTTP_MARKER = "http";
	
	/** User-Agent header sent with every request so the crawler looks like a browser. */
	private static final String USER_AGENT =
			"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13";
	
	/** Crawl settings; only the maximum depth is read by this class. */
	private final MinerConfig config;
	
	/**
	 * @param config crawl configuration supplying the maximum crawl depth
	 */
	public MinerHtmlThread(MinerConfig config) {
		this.config = config;
	}
	
	/**
	 * Keeps fetching pages until {@code MinerMonitorThread.done} is set.
	 */
	@Override
	public void run() {
		// NOTE(review): if unVisitedPoll() returns immediately on an empty queue this loop
		// spins without pausing — confirm against MinerQueue.
		while (!MinerMonitorThread.done) {
			minerHtml();
		}
	}
	
	/**
	 * Fetches one page from the unvisited queue.
	 *
	 * <p>The entry is skipped when it is {@code null}, has a blank URL, is deeper than
	 * {@code config.getMaxDepth()}, or has a URL that does not contain {@code "http"}.
	 * Otherwise the page is downloaded and queued for storage and for link extraction.
	 * Any exception raised while doing so is logged and not rethrown.
	 *
	 * <p>The method is {@code synchronized} on this thread instance, as in the original code.
	 */
	public synchronized void minerHtml() {
		MinerUrl minerUrl = MinerQueue.unVisitedPoll(); // take the next URL off the unvisited queue
		if (null == minerUrl) {
			return;
		}
		// Read once so the failure log below cannot itself fail on a second getter call.
		String url = null;
		try {
			url = minerUrl.getUrl();
			// Skip blank URLs and pages beyond the configured crawl depth.
			if (MinerUtil.isBlank(url) || minerUrl.getDepth() > config.getMaxDepth()) {
				return;
			}
			// Skip URLs that do not contain "http". The original test was written the wrong
			// way round ("http".contains(url)), so it never rejected a real non-http URL.
			if (!url.contains(HTTP_MARKER)) {
				LOG.info("MinerHtmlThread当前爬取URL[" + url + "]没有http");
				return;
			}
			LOG.info("MinerHtmlThread当前爬取页面[" + url + "]爬取深度[" + minerUrl.getDepth() + "] 当前线程 [" + Thread.currentThread().getName() + "]");
			Connection conn = Jsoup.connect(url);
			conn.header("User-Agent", USER_AGENT); // present the request as coming from a browser
			Document doc = conn.get();
			
			Html html = new Html();
			html.setUrl(url);
			html.setHtml(doc.html());
			html.setDepth(minerUrl.getDepth());
			
			// Queue the page for storage.
			MinerQueue.addStore(html);
			
			// Queue the fetched page for URL extraction.
			MinerQueue.addWaitingMisering(html);
			
		} catch (Exception e) {
			// A failed fetch must not kill the worker; log it with the stack trace and move on.
			LOG.warn("MinerHtmlThread爬取页面失败 URL [" + url + "]");
			LOG.warn("MinerHtmlThreadError info [" + e.getMessage() + "]", e);
		}
		
	}

}

返回列表

原文地址:https://www.cnblogs.com/new0801/p/6146688.html