Java爬虫搜索原理实现

新人国庆没事做，又研究了一下爬虫搜索，花了两三天时间，总算把原理弄得差不多了，基本实现了爬虫搜索的原理。本次实现还是两个程序，分别按广度优先和深度优先完成。广度优先没啥问题；深度优先请慎用，有极大的概率会造成死循环，下面深度优先的测试网站就造成了死循环……好吧，我承认是我人品不太好……下面有请代码君出场～

1.广度优先

/**
 * 完成广度优先搜索
 */
package net.meteor.java;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author 魏诗尧
 * @version 1.8
 * @email inwsy@hotmail.com
 */
public class SearchCrawlerBreadth {

	/** File that holds the source of the page currently being parsed. */
	private static final String HTML_FILE = "src/html.txt";

	/** File that accumulates every accepted link; {@link #search()} reads it back as the crawl queue. */
	private static final String HREF_FILE = "src/href.txt";

	/**
	 * Matches an anchor tag whose first attribute is {@code href} and captures
	 * the text up to the next {@code "}, {@code |} or {@code >}.
	 * Compiled once here instead of once per input line.
	 */
	static final Pattern LINK_PATTERN = Pattern.compile(
			"<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Decides whether a captured href value is worth queueing.
	 *
	 * @param href the raw text captured by {@link #LINK_PATTERN}; may be null
	 * @return false for null/blank values, in-page anchors ({@code #...}),
	 *         values starting with {@code /} or a single quote, and
	 *         {@code mailto:} / {@code javascript:} links; true otherwise
	 */
	static boolean isCrawlable(String href) {
		if (href == null) {
			return false;
		}
		String link = href.trim();
		if (link.isEmpty()) {
			return false;
		}
		char first = link.charAt(0);
		if (first == '#' || first == '/' || first == '\'') {
			return false;
		}
		// Only the scheme prefix is checked, so an ordinary URL that merely
		// contains the word "javascript" in its path is still accepted.
		return !hasPrefixIgnoreCase(link, "mailto:")
				&& !hasPrefixIgnoreCase(link, "javascript:");
	}

	/** Case-insensitive startsWith that does not depend on the default locale. */
	private static boolean hasPrefixIgnoreCase(String text, String prefix) {
		return text.regionMatches(true, 0, prefix, 0, prefix.length());
	}

	/**
	 * Downloads the page at {@code urlstr} into {@code htmltxt} and, if the
	 * download succeeded, extracts its links into the link file.
	 *
	 * @param urlstr  address of the page to fetch
	 * @param htmltxt path of the local file that receives the page source
	 */
	private void downHTML(String urlstr, String htmltxt) {
		HttpURLConnection con = null;
		boolean downloaded = false;

		try {
			URL url = new URL(urlstr);
			con = (HttpURLConnection) url.openConnection();
			con.connect();

			// Both streams are closed before the page is parsed, even on failure.
			try (InputStream in = con.getInputStream();
					FileOutputStream out = new FileOutputStream(htmltxt)) {
				byte[] buffer = new byte[1024];
				int len;
				while ((len = in.read(buffer)) != -1) {
					out.write(buffer, 0, len);
				}
			}
			downloaded = true;
		} catch (Exception e) {
			// Covers malformed URLs, non-HTTP URLs (failed cast) and I/O errors.
			System.out.println("未知主机！！ " + urlstr + " (" + e + ")");
		} finally {
			if (con != null) {
				con.disconnect();
			}
		}

		if (downloaded) {
			// Parse the file that was just written, not a hard-coded path.
			readTxt(htmltxt, HREF_FILE);
		}
	}

	/**
	 * Parses the default HTML file and appends its links to {@code hreftxt}.
	 *
	 * @param hreftxt path of the file that links are appended to
	 */
	private void readTxt(String hreftxt) {
		readTxt(HTML_FILE, hreftxt);
	}

	/**
	 * Scans {@code htmltxt} line by line, prints every accepted link and
	 * appends it (one per line) to {@code hreftxt}.
	 *
	 * NOTE: no visited set is kept, so the same URL can be written and
	 * crawled more than once.
	 *
	 * @param htmltxt path of the downloaded page source
	 * @param hreftxt path of the file that links are appended to
	 */
	private void readTxt(String htmltxt, String hreftxt) {
		// The reader uses the platform default charset, as the writer does.
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(htmltxt)));
				FileWriter file = new FileWriter(hreftxt, true)) {

			String line;
			// Read each line exactly once; the previous double readLine()
			// call skipped every other line.
			while ((line = br.readLine()) != null) {
				Matcher matcher = LINK_PATTERN.matcher(line);
				while (matcher.find()) {
					String link = matcher.group(1).trim();
					if (!isCrawlable(link)) {
						continue;
					}
					System.out.println(link);
					file.write(link + "\n");
				}
			}
		} catch (IOException e) {
			System.out.println("无效链接！！ " + e);
		}
	}

	/**
	 * Second stage of the breadth-first crawl: reads the link file line by
	 * line and downloads each listed page. Because every download appends
	 * newly found links to the same file, the file acts as the crawl queue.
	 */
	private void search() {
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(HREF_FILE)))) {

			String line;
			while ((line = br.readLine()) != null) {
				if (line.trim().isEmpty()) {
					continue;
				}
				downHTML(line, HTML_FILE);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Crawls the start page, then follows the collected links.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) throws Exception {
		// Seed the link file from the start page.
		new SearchCrawlerBreadth().downHTML("http://www.hao123.com/", HTML_FILE);
		// Then work through the queued links.
		new SearchCrawlerBreadth().search();
	}
}

上面的广度优先没啥问题。本人昨天凌晨 3 点多做了测试，15 分钟左右的时间，这只小爬虫爬到了 30 万+ 条链接，能力还是蛮强大的嘛。顺便提一下，白天测试的时候会非常非常慢，推荐各位测试君在晚上 12 点以后做测试……虽然不太人道。

下面是深度优先的代码，测试的时候每次都会造成死循环……好吧，我承认我没有人品。其实基本方法和广度优先没啥区别：每个页面爬出来的链接，我只拿第一个去爬下一个页面；总共爬多少层我懒得定义，就是想看看最多能爬到哪，结果每次都悲剧地死循环了。我明明也设置了跳出的方法：我有判断有效链接的逻辑，但这个判断并不完善，所以我把跳出写在了 catch 中，只要遇到一个无效链接就可以跳出来……可今天凌晨全都是死循环，无奈了。下面请代码君上场～

/**
 * 完成深度优先搜索
 * 爬虫进行深度优先很有可能会出现死循环的情况
 */
package net.meteor.java;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author 魏诗尧
 * @version 1.8
 * @email inwsy@hotmail.com
 */
public class SearchCrawlerDepth {
	/** Every link the crawler has chosen to follow; also used to avoid revisiting a page. */
	private static HashSet<String> set = new HashSet<String>();

	/** File that {@link #searchDepth()} appends the followed links to. */
	private static final String HREF_FILE = "src/hrefdepth1.txt";

	/**
	 * Matches an anchor tag whose first attribute is {@code href} and captures
	 * the text up to the next {@code "}, {@code |} or {@code >}.
	 * Compiled once here instead of once per input line.
	 */
	static final Pattern LINK_PATTERN = Pattern.compile(
			"<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Decides whether a captured href value is worth following.
	 *
	 * @param href the raw text captured by {@link #LINK_PATTERN}; may be null
	 * @return false for null/blank values, in-page anchors ({@code #...}),
	 *         values starting with {@code /} or a single quote, and
	 *         {@code mailto:} / {@code javascript:} links; true otherwise
	 */
	static boolean isCrawlable(String href) {
		if (href == null) {
			return false;
		}
		String link = href.trim();
		if (link.isEmpty()) {
			return false;
		}
		char first = link.charAt(0);
		if (first == '#' || first == '/' || first == '\'') {
			return false;
		}
		// Only the scheme prefix is checked, so an ordinary URL that merely
		// contains the word "javascript" in its path is still accepted.
		return !hasPrefixIgnoreCase(link, "mailto:")
				&& !hasPrefixIgnoreCase(link, "javascript:");
	}

	/** Case-insensitive startsWith that does not depend on the default locale. */
	private static boolean hasPrefixIgnoreCase(String text, String prefix) {
		return text.regionMatches(true, 0, prefix, 0, prefix.length());
	}

	/**
	 * Returns the first link in {@code line} that passes
	 * {@link #isCrawlable(String)} and is not yet in {@code visited}.
	 *
	 * @param line    one line of HTML source
	 * @param visited links that must not be returned again
	 * @return the trimmed link, or null when the line has no such link
	 */
	static String firstNewLink(String line, HashSet<String> visited) {
		Matcher matcher = LINK_PATTERN.matcher(line);
		while (matcher.find()) {
			String link = matcher.group(1).trim();
			if (isCrawlable(link) && !visited.contains(link)) {
				return link;
			}
		}
		return null;
	}

	/**
	 * Follows a single chain of pages starting at {@code urlstr}: download
	 * the page, pick its first not-yet-followed link, and repeat. The chain
	 * ends when a download fails or a page offers no new link.
	 *
	 * Written as a loop rather than recursion so that a long chain neither
	 * grows the call stack nor keeps one open file per level.
	 *
	 * @param urlstr  address of the first page
	 * @param htmltxt path of the local file that receives each page source
	 */
	private void downHTMLDepth(String urlstr, String htmltxt) {
		String current = urlstr;
		while (current != null) {
			if (!fetch(current, htmltxt)) {
				break;
			}
			current = readTxtDepth(htmltxt);
			if (current != null) {
				System.out.println(current);
				// Recording the link here is what prevents A -> B -> A cycles.
				set.add(current);
			}
		}
	}

	/**
	 * Downloads {@code urlstr} into {@code htmltxt}.
	 *
	 * @return true when the whole page was written, false on any failure
	 */
	private boolean fetch(String urlstr, String htmltxt) {
		HttpURLConnection con = null;
		try {
			URL url = new URL(urlstr);
			con = (HttpURLConnection) url.openConnection();
			con.connect();

			try (InputStream in = con.getInputStream();
					FileOutputStream out = new FileOutputStream(htmltxt)) {
				byte[] buffer = new byte[1024];
				int len;
				while ((len = in.read(buffer)) != -1) {
					out.write(buffer, 0, len);
				}
			}
			return true;
		} catch (Exception e) {
			// Covers malformed URLs, non-HTTP URLs (failed cast) and I/O errors.
			System.out.println("未知主机！！，爬行结束！！ " + urlstr + " (" + e + ")");
			return false;
		} finally {
			if (con != null) {
				con.disconnect();
			}
		}
	}

	/**
	 * Scans the downloaded page for the next link to follow.
	 *
	 * @param htmltxt path of the downloaded page source
	 * @return the first acceptable link not followed before, or null when
	 *         there is none or the file cannot be read
	 */
	private String readTxtDepth(String htmltxt) {
		// The reader uses the platform default charset.
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(htmltxt)))) {

			String line;
			// Read each line exactly once; the previous double readLine()
			// call skipped every other line.
			while ((line = br.readLine()) != null) {
				String link = firstNewLink(line, set);
				if (link != null) {
					return link;
				}
			}
		} catch (IOException e) {
			System.out.println("无效链接！！本次爬行结束！！ " + e);
		}
		return null;
	}

	/**
	 * Prints every followed link and appends it, one per line, to the link
	 * file. Iteration order is that of the underlying HashSet, not the order
	 * in which the links were followed.
	 */
	public void searchDepth() {
		try (FileWriter file = new FileWriter(HREF_FILE, true)) {
			// for-each advances the iterator; the earlier hasNext()-only loop
			// never did and so never terminated.
			for (String link : set) {
				System.out.println(link);
				file.write(link + "\n");
			}
		} catch (IOException e) {
			System.out.println("无效链接，本次爬行结束！！");
			e.printStackTrace();
		}
	}

	/**
	 * Runs one depth-first chain from the start page, then dumps the
	 * followed links.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		new SearchCrawlerDepth().downHTMLDepth("http://www.hao123.com", "src/htmldepth1.txt");
		new SearchCrawlerDepth().searchDepth();
	}
}

上面这两段代码本身是十分不完善的。由于时间原因，我基本只实现了最基本的原理，能改动、增加的地方还有很多，主要是增加——很多地方都可以增加代码来增强程序的健壮性。比如有效链接判断的地方，我们从 href 属性中取出来的内容，除了我写的几条判断以外，还有好多情况都没有处理掉，这个地方还是能增加很多东西的。