webmagic学习之路-3:采集安居客经纪人详情页

这里希望安居客的同行轻喷!!单纯是做测试,玩玩而已。

就这么糟践你们的服务器了!!!sorry!

这次学会了在 webmagic 中设置允许进入处理流程的 HTTP 响应状态码。之前返回 404 的页面根本不会进入 process,纳闷了很久,也百度了半天。

看了好半天源码,才知道原来可以通过 Site 的 setAcceptStatCode 方法设置哪些状态码的页面能够进入 process。促使我下决心看源码的,是我用 logger 打印出来的内容:它告诉我 webmagic 其实已经拿到了 404,只是没有处理而已。

同时也学会了使用 scheduler。

package com.action;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.management.JMException;
import javax.swing.plaf.synth.SynthSpinnerUI;

import org.apache.commons.collections.bag.SynchronizedSortedBag;
import org.apache.log4j.Logger;
import org.bson.Document;

import com.model.AgentListByNumModel;
import com.model.AgentListModel;
import com.model.Model_AnjukeList;
import com.mongodb.BasicDBObject;
import com.util.Constants;
import com.util.GetDate;
import com.util.MysqlUtils;
import com.util.MD5With32;
import com.util.MongoDBUtil;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.PriorityScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic {@link PageProcessor} that visits Anjuke broker shop pages addressed by a numeric id
 * and stores the extracted broker details through {@code MysqlUtils.InsertAnjukeAgentByNum}.
 *
 * <p>Pages answered with HTTP 404 are accepted on purpose (see {@link #getSite()}) so that
 * {@link #process(Page)} can record a placeholder row for ids that do not exist.
 *
 * <p>{@link #main(String[])} runs the spider with 25 threads, so {@link #process(Page)} keeps all
 * of its working data in local variables and only touches shared state under synchronization.
 */
public class GetAnjukeAgentByNum implements PageProcessor {

    static Logger logger = Logger.getLogger(GetAnjukeAgentByNum.class);

    // The four fields below are no longer used by this class. They are package-visible, so they
    // are kept in case code outside this file still refers to them.
    static AgentListByNumModel anjukeList;
    static List<String> list = new ArrayList<String>();
    static List<AgentListByNumModel> list_insert = new ArrayList<AgentListByNumModel>();
    static BasicDBObject doc = null;

    /** Number of captcha ("access verification") pages seen so far; update via {@link #countBlocked()}. */
    static int num = 0;

    /** XPath of the breadcrumb container on a broker shop page. */
    private static final String CRUMBS = "//div[@class='p_1180 p_crumbs']";

    /** XPath of the "service" section holding company, store and service-area links. */
    private static final String SERVICE = "//div[@class='section service']";

    private final Site site = buildSite();

    /**
     * Builds the crawl configuration once. Status 404 is accepted in addition to 200 so that
     * missing pages are still passed to {@link #process(Page)}.
     */
    private static Site buildSite() {
        Set<Integer> acceptStatCode = new HashSet<Integer>();
        acceptStatCode.add(200);
        acceptStatCode.add(404);
        return Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8").setUserAgent(
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setAcceptStatCode(acceptStatCode);
    }

    /**
     * @return the site configuration shared by all requests of this processor
     */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Extracts broker data from one downloaded page and writes it to the database.
     *
     * <ul>
     *   <li>404 pages, "shop temporarily closed" pages and pages whose city breadcrumb is empty
     *       are stored as a placeholder row that carries only the URL.</li>
     *   <li>Captcha pages are counted and skipped; nothing is stored for them.</li>
     *   <li>Pages without the expected breadcrumb are logged and skipped.</li>
     *   <li>Otherwise one row is stored per service-area entry, or a single row built from the
     *       community address when the page lists no service areas.</li>
     * </ul>
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl() + "";

        // Decide on the status code before touching the HTML of a missing page.
        if (page.getStatusCode() == 404) {
            savePlaceholder(url);
            return;
        }

        Html html = page.getHtml();
        String rawHtml = html + "";

        // A captcha page carries no broker data; parsing it would only store garbage.
        if (rawHtml.contains("访问验证-安居客")) {
            System.out.println("被封次数 : " + countBlocked());
            return;
        }
        if (rawHtml.contains("经纪人店铺暂时关闭")) {
            savePlaceholder(url);
            return;
        }

        String cityCrumb = html.xpath(CRUMBS + "/a[2]/text()").get();
        if (cityCrumb == null) {
            // Unknown layout: do not store anything so the URL is not mistaken for a dead shop.
            logger.warn("breadcrumb not found, page skipped: " + url);
            return;
        }
        String city = cityCrumb.replace("经纪人", "");
        if (city.equals("")) {
            savePlaceholder(url);
            return;
        }

        String name = removeOrEmpty(html.xpath(CRUMBS + "/a[4]/text()").get(), "的店铺");
        String staffNo = url;
        String company = html.xpath(SERVICE + "/dl/dd/p[1]/a/text()").get();
        String company_url = html.xpath(SERVICE + "/dl/dd/p[1]/a/@href").get();
        String store = html.xpath(SERVICE + "/dl/dd/p[2]/a/text()").get();
        String store_url = html.xpath(SERVICE + "/dl/dd/p[2]/a/@href").get();
        // List.toString() without its brackets gives a comma separated list of community names.
        String comms = (html.xpath("//dl[@class='item last']/dd/a/text()").all() + "")
                .replace("[", "").replace("]", "");
        String contact = extractPhone(html.xpath("//head/meta[3]/@content").get());

        // Rows are collected in a local list: a shared list would be corrupted by parallel calls.
        List<AgentListByNumModel> rows = new ArrayList<AgentListByNumModel>();
        List<String> areas = html.xpath(SERVICE + "/dl[3]/dd/a/text()").all();
        if (areas == null || areas.isEmpty()) {
            String title = html.xpath("//div[@class='details-item']/span[@class='comm-address']/@title").get();
            String[] zoneStreet = parseBracketedArea(title);
            rows.add(new AgentListByNumModel("", city, zoneStreet[0], zoneStreet[1], "anjuke", GetDate.getDay0(),
                    url, name, staffNo, company, company_url, store, store_url, contact, comms));
        } else {
            for (String area : areas) {
                // Parsed per entry so that an entry without "-" does not inherit the previous values.
                String[] zoneStreet = splitArea(area + "");
                rows.add(new AgentListByNumModel("", city, zoneStreet[0], zoneStreet[1], "anjuke", GetDate.getDay0(),
                        url, name, staffNo, company, company_url, store, store_url, contact, comms));
            }
        }
        MysqlUtils.InsertAnjukeAgentByNum(rows);
    }

    /** Stores a row that carries only the URL, marking the id as visited but without broker data. */
    private static void savePlaceholder(String url) {
        List<AgentListByNumModel> rows = new ArrayList<AgentListByNumModel>();
        rows.add(new AgentListByNumModel("", "", "", "", "anjuke", GetDate.getDay0(),
                url, "", url, "", "", "", "", "", ""));
        MysqlUtils.InsertAnjukeAgentByNum(rows);
    }

    /** Increments the captcha counter atomically and returns the new value. */
    private static synchronized int countBlocked() {
        num = num + 1;
        return num;
    }

    /** Returns {@code value} with every occurrence of {@code token} removed, or "" for null. */
    private static String removeOrEmpty(String value, String token) {
        if (value == null) {
            return "";
        }
        return value.replace(token, "");
    }

    /** Returns the first match of {@code Constants.reg_phone} in {@code text}, or "" if none. */
    private static String extractPhone(String text) {
        if (text == null) {
            return "";
        }
        Matcher matcher = Pattern.compile(Constants.reg_phone).matcher(text);
        if (matcher.find()) {
            return matcher.group(0);
        }
        return "";
    }

    /**
     * Reads "zone-street" from an address title: the text between "[" and the first space
     * inside the brackets is split at "-".
     *
     * @return a two element array {zone, street}; both are "" when the title is null or does
     *         not have the bracketed form
     */
    private static String[] parseBracketedArea(String title) {
        String[] none = new String[] { "", "" };
        if (title == null) {
            return none;
        }
        int open = title.indexOf('[');
        if (open < 0) {
            return none;
        }
        int close = title.indexOf(']', open + 1);
        if (close < 0) {
            return none;
        }
        String inner = title.substring(open + 1, close);
        int space = inner.indexOf(' ');
        if (space < 0) {
            return none;
        }
        return splitArea(inner.substring(0, space));
    }

    /**
     * Splits "zone-street" at "-".
     *
     * @return a two element array {zone, street}; missing parts are "" (String.split drops
     *         trailing empty parts, so "abc-" yields only a zone)
     */
    private static String[] splitArea(String text) {
        String zone = "";
        String street = "";
        if (text != null && text.contains("-")) {
            String[] parts = text.split("-");
            if (parts.length > 0) {
                zone = parts[0];
            }
            if (parts.length > 1) {
                street = parts[1];
            }
        }
        return new String[] { zone, street };
    }

    /**
     * Queues the broker pages for ids 0..99999, leaving out ids that are keys of
     * {@code Constants.map_id}, and runs the spider with 25 threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        MysqlUtils.SelectSpiderID();
        PriorityScheduler scheduler = new PriorityScheduler();
        Spider spider = Spider.create(new GetAnjukeAgentByNum()).setScheduler(scheduler)
                .addPipeline(new ConsolePipeline());
        for (int n = 0; n < 100000; n++) {
            if (Constants.map_id.containsKey(n + "")) {
                System.out.println("contain : " + n);
                continue;
            }
            String url = "https://junranfangchan.anjuke.com/gongsi-jjr-" + n + "/";
            scheduler.push(new Request(url), spider);
        }
        System.out.println("total task num :" + scheduler.getTotalRequestsCount(spider));
        spider.thread(25).run();
    }
}
原文地址:https://www.cnblogs.com/tnsay/p/10895366.html