提取新闻下一页

package com.unbank.robotspider.util;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SmartNextPageFecther {

	public static void main(String[] args) {
		String url = "http://focus.stockstar.com/SS2014061700001351.shtml";
		Document document = JsoupUtil.readUrl(url);
		Map<Integer, String> pageList = new SmartNextPageFecther()
				.getNextPageUrl(document, url);

		for (int i = 0; i < pageList.size() + 3; i++) {
			String nextUrl = pageList.get(i);
			if (nextUrl != null) {
				System.out.println(nextUrl);
			}

		}

	}

	public Map<Integer, String> getNextPageUrl(Document doc, String baseurl) {
		Document document = doc.clone();
		Map<Integer, String> map = new HashMap<Integer, String>();
		Elements a_elements = document.getElementsByTag("a");
		int prePageNum = 5;
		int pageNum = 0;
		for (Element e : a_elements) {
			String uu = e.attr("href");
			uu = UrlTools.getFullUrl(baseurl, uu);
			if (uu == null || uu.trim().isEmpty()) {
				continue;
			}
			String a_text = e.text();
			// 是否是下一页的
			boolean bl = checkText(a_text);

			if (bl) {
				int cu = checkUrl(baseurl, uu);
				if (cu != -1) {
					pageNum = pageNum > cu ? pageNum : cu;
					prePageNum = prePageNum < cu ? prePageNum : cu;
					map.put(cu, uu);
				}
			}
		}
		if (map.size() >= 2) {
			// 说明是3页了
			String second = null;
			String third = null;
			if (prePageNum == 0) {
				second = map.get(0);
				third = map.get(1);
			} else if (prePageNum == 1) {
				second = map.get(1);
				third = map.get(2);
			} else if (prePageNum == 2) {
				second = map.get(2);
				third = map.get(3);
			}
			String urlRule = UrlRuleUtil.getURlRule(second, third);
			for (int i = prePageNum; i <= pageNum; i++) {
				if (map.get(i) == null) {
					String page = UrlRuleUtil.getcheckURL(urlRule, i);
					map.put(i, page);
				}
			}

		}

		return map;
	}

	public boolean checkText(String text) {
		String[] texts = { "首页", "第一页", "下一页", "末页", "最后一页", "尾页" };
		for (int i = 0; i < texts.length; i++) {
			if (texts[i].equals(text)) {
				return true;
			}
		}
		if (text.matches("\d{1,2}")) {
			return true;
		}
		return false;
	}

	public int checkUrl(String url1, String url2) {
		int l1 = url1.length();
		int l2 = url2.length();
		if (l1 == 0 || l2 == 0) {
			return -1;
		}

		String longStr = l1 > l2 ? url1 : url2;
		String shortStr = l1 < l2 ? url1 : url2;
		int j = 0;
		StringBuffer sb = new StringBuffer();
		for (int i = 0; i < longStr.length() - 1; i++) {
			if (longStr.charAt(i) != shortStr.charAt(j)) {
				sb.append(longStr.charAt(i));
			} else {
				j++;
				if (j == shortStr.length()) {
					break;
				}
			}
		}
		if (sb.length() == 0) {
			return -1;
		}
		String variances = sb.toString();
		String numStr = variances.replaceAll("_", "").replaceAll("=", "")
				.replaceAll("index", "").replaceAll("page", "")
				.replaceAll("p", "").replaceAll("-", "");
		if (numStr.matches("\d{1,2}")) {
			return Integer.valueOf(numStr);
		} else {
			return -1;
		}
	}
}
原文地址:https://www.cnblogs.com/tomcattd/p/3808550.html