webmagic学习之路-1:采集安居客列表页测试

---恢复内容开始---

package com.action;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.management.JMException;

import org.bson.Document;

import com.model.Model_AnjukeList;
import com.mongodb.BasicDBObject;
import com.util.Constants;
import com.util.GetDate;
import com.util.MD5With32;
import com.util.MongoDBUtil;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} for Anjuke listing pages.
 *
 * <p>For every downloaded page it searches the HTML with the regular expression
 * {@code Constants.anjuke_Reg_Found}, strips the {@code "found":} prefix and any
 * commas from each match, and combines that value with the metadata registered for
 * the page URL in {@code Constants.map_urls} into a {@link BasicDBObject}. The
 * documents are accumulated in {@link #list_insert} and handed to
 * {@code MongoDBUtil.saveMany} once the crawl has finished.
 */
public class GetAnjukeListNum implements PageProcessor {

    public static Model_AnjukeList anjukeList;
    public static List<String> list = new ArrayList<String>();

    /**
     * Documents produced by {@link #process(Page)}. Appends are synchronized on this
     * list because {@link #main(String[])} runs the spider with 30 threads and
     * {@link ArrayList} is not safe for concurrent modification.
     */
    public static List<BasicDBObject> list_insert = new ArrayList<BasicDBObject>();

    private Site site = Site.me()
            .setSleepTime(1000)
            .setRetryTimes(3)
            .setCharset("UTF-8")
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

    /**
     * Returns the crawl settings (sleep time, retry count, charset, user agent).
     *
     * @return the site configuration used by this processor
     */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Queues the task URLs, then extracts every "found" value from the page and
     * records one document per match.
     *
     * <p>Pages whose HTML is 100 characters or shorter, and pages whose URL hash is
     * not a key of {@code Constants.map_urls}, produce no documents.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(Constants.list_urls);
        System.out.println("code:"+page.getStatusCode());

        String html = page.getHtml().toString();
        if (html.length() <= 100) {
            return;
        }

        Matcher matcher = Pattern.compile(Constants.anjuke_Reg_Found).matcher(html);

        // The id depends only on the page URL, so it is resolved once per page
        // instead of once per regex match.
        String id = MD5With32.encryption(page.getUrl().toString());
        if (!Constants.map_urls.containsKey(id)) {
            return;
        }
        Model_AnjukeList model = Constants.map_urls.get(id);

        while (matcher.find()) {
            // Keep only the value: drop the "found": key and any commas in the match.
            String found = matcher.group(0).replace("\"found\":", "").replace(",", "");
            BasicDBObject doc = toDocument(id, model, found);
            // Serialize appends: process() is invoked from several spider threads.
            synchronized (list_insert) {
                list_insert.add(doc);
            }
        }
    }

    /** Builds the MongoDB document for one match from the page id, its metadata and the extracted value. */
    private static BasicDBObject toDocument(String id, Model_AnjukeList model, String found) {
        return new BasicDBObject("_id", id)
                .append("city", model.getCity())
                .append("towards", model.getTowards())
                .append("zone_urls", model.getZone_urls())
                .append("zone", model.getZone())
                .append("site", model.getSite())
                .append("decoration", model.getDecoration())
                .append("flag", model.getFlag())
                .append("street", model.getStreet())
                .append("type", model.getType())
                .append("page", model.getPage())
                .append("urls", model.getUrls())
                .append("found", found)
                .append("update_time", model.getUpdate_time());
    }

    /**
     * Loads the task URLs for the configured city, runs the spider from the Anjuke
     * Beijing sale listing with 30 threads, then saves the collected documents.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String city = "北京";
        String urls = "https://beijing.anjuke.com/sale/";

        MongoGetUrls.GetMongoUrls(city);
        System.out.println("任务总数:"+Constants.list_urls.size());

        // run() is the blocking variant, so list_insert is complete when it returns.
        Spider.create(new GetAnjukeListNum())
                .addUrl(urls)
                .addPipeline(new ConsolePipeline())
                .thread(30)
                .run();

        // NOTE(review): the first argument to saveMany is elided ("...") in the
        // original post and cannot be recovered from this file; supply the real
        // value (presumably the target collection - confirm against MongoDBUtil)
        // before compiling.
        MongoDBUtil.saveMany(..., list_insert);
    }
}

第一次用 WebMagic,很多东西不懂,也没有重写。

很多功能都是用纯 Java 实现的。

让我们慢慢发现webmagic的强大吧!

原文地址:https://www.cnblogs.com/tnsay/p/10895284.html