Writing my own web crawler to grab all the links of a site

    I previously worked on full-text search plus a web crawler that targeted the whole Internet. We used the open-source crawler Heritrix; I studied its source code, but only tweaked small parts of it to meet our business needs. The project was later shelved, and that work with it, so afterwards I wrote my own crawler on similar principles. Mine is, of course, very simple, and the code is not written to any standard.

    Now I had a task: load every administrative region of Hunan province into a database. So I dug out the crawler I had written before, modified it a little, and it just about runs; measured against the requirement, the task is done.

    The principle is to use jsoup to download the pages to local disk (a minimal sketch of the idea follows the list below):

  • Recursively parse out all the links and assemble the proper http paths. You could also skip the local download, but then every test run would re-fetch the pages from the network, which performs badly, so I chose to download them to local disk.
  • Then use the htmlparser toolkit to parse out the wanted data and import it into the database.
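
As a quick illustration of the principle (a minimal sketch, not the project code; the URL and selector are the same ones used below), jsoup can fetch a page and resolve relative links by itself via absUrl("href"):

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupLinkDemo {
    public static void main(String[] args) throws IOException {
        // fetch the entry page of the region list
        Document doc = Jsoup.connect("http://www.stats.gov.cn/tjbz/cxfldm/2012/43.html")
                .timeout(30000).get();
        // absUrl("href") resolves each link against the page URL,
        // so relative paths like "43/4301.html" come back absolute
        for (Element a : doc.select(".citytable a[href]")) {
            System.out.println(a.absUrl("href"));
        }
    }
}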

   

import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import org.htmlparser.*;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Starts the crawler through a thread pool, which improves throughput.
 */
public class ChoeseClass {
    public static String url = "http://www.stats.gov.cn/tjbz/cxfldm/2012/43.html";// entry page of the site
    public static String chose = ".citytable";// the selector: an element on the page with this CSS class
    static BlockingQueue<String> queue = new LinkedBlockingQueue<String>(10);

    public static void main(String[] args) {
        ExecutorService service = Executors.newCachedThreadPool();
        service.execute(new CatchURL(queue));// link extraction runs single-threaded
        for (int i = 0; i < 10; i++) {// page downloads run multi-threaded
            service.execute(new DownLoadHtml(queue));// one worker instance per thread
        }
        service.shutdown();// workers exit on their own once the queue stays empty
    }
}

   

   

class DownLoadHtml implements Runnable {
    private BlockingQueue<String> queue;

    public DownLoadHtml(BlockingQueue<String> queue) {
        this.queue = queue;
    }

    @Override
    public void run() {
        try {
            while (true) {
                System.out.println(Thread.currentThread().getName());
                // wait up to 10 seconds for the next link; once the queue
                // stays empty that long, the crawl is done and the worker exits
                String link = this.queue.poll(10, TimeUnit.SECONDS);
                if (link == null)
                    break;
                new MirroCraw(link, "G://test//local");// mirror the page to local disk
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
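
The MirroCraw class comes from the project zip and isn't listed in this post. For readers without the download, here is a minimal sketch of what such a page-mirroring class might look like, assuming its constructor simply fetches the url and writes the raw html under the target directory (the real class may well differ):

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import org.jsoup.Jsoup;

// Hypothetical reconstruction: fetch a page and mirror it to local disk.
class MirroCraw {
    public MirroCraw(String url, String localDir) {
        try {
            // keep the site's own file name, e.g. ".../43/4301.html" -> "4301.html"
            File out = new File(localDir, url.substring(url.lastIndexOf('/') + 1));
            out.getParentFile().mkdirs();
            String html = Jsoup.connect(url).timeout(30000).get().outerHtml();
            FileOutputStream fos = new FileOutputStream(out);
            fos.write(html.getBytes("GB18030"));// the site pages are GB-encoded
            fos.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}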

   

/**
 * Extracts link addresses and adds them to a thread-safe queue.
 * (I also wrote my own thread-safe queue backed by a doubly linked list,
 * but its de-duplication made it very slow, so I switched to the queue
 * that ships with the JDK.)
 */

class CatchURL implements Runnable {
    private BlockingQueue<String> queue;

    public CatchURL(BlockingQueue<String> queue) {
        this.queue = queue;
    }

    @Override
    public void run() {
        try {
            choClass(ChoeseClass.url, ChoeseClass.chose);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void choClass(String url, String chose) throws IOException, InterruptedException {
        Document doc = Jsoup.connect(url).timeout(30000).get();
        Elements el = doc.select(chose);
        List<String> links = getPageLinks(el.toString());
        // drop duplicate links, keeping their original order
        List<String> tempLinks = new ArrayList<>();
        for (String string : links) {
            if (!tempLinks.contains(string)) {
                tempLinks.add(string);
            }
        }
        // assemble absolute urls
        for (String string : tempLinks) {
            if (!(string.equals("#"))) {
                if (!string.startsWith("http://")) {
                    if (string.startsWith("/")) {
                        // root-relative path: prefix scheme and host
                        string = assemblyUrl(url) + string;
                    } else if (string.startsWith("../")) {
                        // "../" means one directory level above the current page
                        string = string.replace("../", "");
                        string = url.substring(0,
                                url.substring(0, url.lastIndexOf("/"))
                                        .lastIndexOf("/") + 1)
                                + string;
                    } else {
                        // plain relative path: resolve against the current directory
                        string = url.substring(0, url.lastIndexOf("/") + 1) + string;
                    }
                }
                if (!string.contains(url)) {
                    this.queue.put(string);// put() blocks when the queue is full; add() would throw
                    System.out.println("added: " + Thread.currentThread().getName() + " " + string);
                    // the '/' count of the url tells which level the page belongs to
                    int i = ContainChar.ContainCharByStr(string);
                    if (i == 7) {// city page: recurse into its county table
                        choClass(string, ".countytable");
                    }
                    if (i == 8) {// county page: recurse into its town table
                        choClass(string, ".towntable");
                    }
                    if (i == 9) {
                        choClass(string, ".towntable");
                    }
                }
            }
        }
    }

   

    protected static List<String> getPageLinks(String html) {
        Parser parser = null;
        NodeList nodeList = null;
        NodeFilter filter = null;
        List<String> urlList = new ArrayList<String>();
        try {
            // create a Parser for the given encoding
            parser = Parser.createParser(html, "GB18030");
            // filter that accepts only <a> tags
            filter = new TagNameFilter("A");
            // collect the matching nodes
            nodeList = parser.extractAllNodesThatMatch(filter);
            int size = nodeList.size();
            for (int i = 0; i < size; i++) {
                LinkTag tag = (LinkTag) nodeList.elementAt(i);
                urlList.add(tag.getLink());
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return urlList;
    }

    /**
     * Assembles the base url: returns the scheme-and-host part, e.g.
     * "http://www.stats.gov.cn/tjbz/..." becomes "http://www.stats.gov.cn".
     *
     * @param url
     */
    public static String assemblyUrl(String url) {
        if (url.startsWith("http://")) {
            String newUrl = url.substring(0, url.indexOf("/", 8));
            return newUrl;
        }
        return url;
    }
}
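
ContainChar is another helper that ships in the project zip rather than in this post. Judging from how its return value is used (7, 8 and 9 select ever deeper pages), it presumably counts the '/' characters in the url; a sketch under that assumption:

// Hypothetical reconstruction: count the '/' occurrences in a string to
// infer page depth, e.g. "http://www.stats.gov.cn/tjbz/cxfldm/2012/43/4301.html"
// contains 7 slashes and is therefore treated as a city page.
class ContainChar {
    public static int ContainCharByStr(String str) {
        int count = 0;
        for (int i = 0; i < str.length(); i++) {
            if (str.charAt(i) == '/') {
                count++;
            }
        }
        return count;
    }
}

Incidentally, the manual path assembly in choClass could also be done with java.net.URI: URI.create(url).resolve(link).toString() handles absolute links, leading "/" and "../" prefixes uniformly.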

   

   

   

The part below is the batch-parsing code: it reads the mirrored pages from disk and turns them into database rows.

import java.io.*;
import java.util.*;
import org.htmlparser.*;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class MainAction {

    public static void main(String[] args) throws IOException {
        // the directory the crawler mirrored the site into
        // (note the escaped backslashes in the Windows path)
        File dir = new File("G:\\test\\local\\www.stats.gov.cn");
        File[] files = readFiles(dir);
        for (File file : files) {
            startAction(file);
        }
    }

      

    public static void startAction(File file) throws IOException {
        FileReader fr = null;
        StringBuffer sb = new StringBuffer();
        try {
            // read the whole mirrored html file into memory
            fr = new FileReader(file);
            char[] buf = new char[1024];
            int len = 0;
            while ((len = fr.read(buf)) != -1) {
                sb.append(new String(buf, 0, len));
            }
        } catch (FileNotFoundException e1) {
            e1.printStackTrace();
        } finally {
            if (fr != null)// guard against a NullPointerException when the file failed to open
                fr.close();
        }
        // the length of the file name tells the administrative level:
        // "4301.html" (9) lists counties, "430102.html" (11) lists towns,
        // "430102001.html" (14) lists villages
        int fileName = file.getName().length();
        String fileNametemp[] = file.getName().split("\\.");
        String anaylisisHtml = null;
        String type = null;
        if (fileName == 9) {
            anaylisisHtml = anayli(sb.toString(), ".countytable");
            type = ".countytable";
        } else if (fileName == 11) {
            anaylisisHtml = anayli(sb.toString(), ".towntable");
            type = ".towntable";
        } else if (fileName == 14) {
            anaylisisHtml = anayli(sb.toString(), ".villagetr");
            type = ".villagetr";
        }
        if (type == null)// not a page we know how to parse
            return;

        List<String> list = getPageLinks(anaylisisHtml, type);
        List<ParamModel> models = new ArrayList<>();
        if (type.equals(".villagetr")) {
            // village rows hold three cells: code, urban-rural code, name;
            // we keep the first and the third
            for (int i = 0; i < list.size(); i = i + 3) {
                System.out.println(fileNametemp[0] + " " + list.get(i) + " " + list.get(i + 2));
                ParamModel model = new ParamModel();
                model.setFileName(fileNametemp[0]);
                model.setObj1(list.get(i));
                model.setObj2(list.get(i + 2));
                models.add(model);
            }
        } else {
            // the other levels come in pairs of links: code, then name
            for (int i = 0; i < list.size(); i = i + 2) {
                System.out.println(fileNametemp[0] + " " + list.get(i) + " " + list.get(i + 1));
                ParamModel model = new ParamModel();
                model.setFileName(fileNametemp[0]);
                model.setObj1(list.get(i));
                model.setObj2(list.get(i + 1));
                models.add(model);
            }
        }
        try {
            //DateUtil.save(models);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

      

    /**
     * Returns all files in the given directory.
     *
     * @param dir
     */
    public static File[] readFiles(File dir) {
        File[] files = dir.listFiles();
        return files;
    }

    /**
     * Filters the html by the given CSS class and returns everything inside
     * the matching elements.
     *
     * @param html
     * @param choClass
     * @return
     */
    public static String anayli(String html, String choClass) {
        Document doc = Jsoup.parse(html);
        Elements el = doc.select(choClass);
        return el.toString();
    }

        

      

    protected static List<String> getPageLinks(String html, String type) {
        Parser parser = null;
        NodeList nodeList = null;
        NodeFilter filter = null;
        List<String> urlList = new ArrayList<String>();
        try {
            // create a Parser for the given encoding
            parser = Parser.createParser(html, "GB18030");
            // village rows carry plain <td> cells; the other levels carry <a> links
            if (type.equals(".villagetr")) {
                filter = new TagNameFilter("td");
            } else {
                filter = new TagNameFilter("A");
            }
            // collect the matching nodes
            nodeList = parser.extractAllNodesThatMatch(filter);
            int size = nodeList.size();
            for (int i = 0; i < size; i++) {
                if (type.equals(".villagetr")) {
                    Tag tag = (Tag) nodeList.elementAt(i);
                    urlList.add(tag.getFirstChild().toHtml());// the text inside the cell
                } else {
                    LinkTag tag = (LinkTag) nodeList.elementAt(i);
                    urlList.add(tag.getChildrenHTML());// the text inside the link
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return urlList;
    }
}
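
ParamModel (and the commented-out DateUtil.save) are also only in the project zip. Judging from the setters used in startAction, ParamModel is presumably a plain holder along these lines (field names inferred, so treat this as a sketch):

public class ParamModel {
    // fileName carries the code of the parent region (taken from the file name);
    // obj1 and obj2 carry the child region's code and name
    private String fileName;
    private String obj1;
    private String obj2;

    public String getFileName() { return fileName; }
    public void setFileName(String fileName) { this.fileName = fileName; }
    public String getObj1() { return obj1; }
    public void setObj1(String obj1) { this.obj1 = obj1; }
    public String getObj2() { return obj2; }
    public void setObj2(String obj2) { this.obj2 = obj2; }
}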

 

 

Project download: http://files.cnblogs.com/wxwall/Mycrawler.zip

Original post: https://www.cnblogs.com/wxwall/p/3326038.html