小小爬虫

const request=require('request');
const cheerio=require('cheerio');
(function () {
    /**
     * Page info => summary data.
     *
     * Requests one page of the blog's post list and, for every `.postTitle`
     * entry found, builds a `{bid, title, desc}` summary, logs it, and calls
     * `getInforDetail(bid)` to fetch the full post.
     *
     * @param {number|string} i - Value placed in the list URL's `page` query parameter.
     * @returns {void} Nothing is returned; work happens in the request callback.
     */
    const getInfo = function (i) {
        const url = `http://www.cnblogs.com/flyings/default.html?page=${i}`;
        request(url, function (err, res, body) {
            // A failed request has no usable body; report it instead of parsing undefined.
            if (err || !body) {
                console.error(`getInfo: failed to fetch ${url}`, err);
                return;
            }

            const $ = cheerio.load(body);
            // Select once; the i-th title and the i-th `.postCon` are paired by index.
            const titles = $('.postTitle');
            const summaries = $('.postCon');

            for (let idx = 0; idx < titles.length; idx++) {
                const link = titles.eq(idx).find('a');
                const href = link.attr('href');

                // The blog id is the run of digits directly before ".html" in the link.
                const idMatch = /(\d+)\.html/.exec(href || '');
                if (!idMatch) {
                    console.error('getInfo: no post id found in href', href);
                    continue;
                }

                const bid = idMatch[1];
                const title = link.text();
                // Strip the "阅读全文" and "摘要:" labels from the summary text.
                const desc = summaries.eq(idx).find('.c_b_p_desc').text()
                    .replace(/阅读全文/ig, '')
                    .replace(/摘要:/ig, '');
                const blog = { bid: bid, title: title, desc: desc };

                console.log(blog);

                // Persistence hook (not wired up): upsert `blog` keyed by bid, e.g.
                //   blogModel.saveOrUpdate(
                //       { where: { bid: blog.bid }, data: blog, option: { upsert: true } },
                //       function (error) {});

                // Crawl the detail page that belongs to this id.
                getInforDetail(bid);
            }
        });
    };

    /**
     * Page info => detail data.
     *
     * Requests a single post page and extracts its title and HTML body into a
     * `{bid, title, content}` record, which is logged.
     *
     * @param {number|string} j - Post id; used as the file name in the post URL
     *     and copied into the record as `bid`.
     * @returns {void} Nothing is returned; work happens in the request callback.
     */
    const getInforDetail = function (j) {
        const url = `http://www.cnblogs.com/flyings/p/${j}.html`;
        request(url, function (err, res, body) {
            // Do not build (or later persist) an empty record when the fetch failed.
            if (err || !body) {
                console.error(`getInforDetail: failed to fetch ${url}`, err);
                return;
            }

            const $ = cheerio.load(body);
            const blogDetail = {
                bid: j,
                title: $('.postTitle').find('a').text(),
                // Inner HTML of the post body container.
                content: $('#cnblogs_post_body').html(),
            };
            console.log(blogDetail);

            // Persistence hook (not wired up): upsert `blogDetail` keyed by bid, e.g.
            //   blogDetailModel.saveOrUpdate(
            //       { where: { bid: blogDetail.bid }, data: blogDetail, option: { upsert: true } },
            //       function (error) {});
        });
    };


    /**
     * Determines the total number of list pages of the blog and starts
     * crawling each of them via `getInfo`.
     *
     * The total is read from the text of the second `.pager` element on list
     * page 2, which is expected to contain "共<N>页".
     *
     * @returns {void} Nothing is returned; work happens in the request callback.
     */
    const getInit = function () {
        const probeUrl = 'http://www.cnblogs.com/flyings/default.html?page=2';
        request(probeUrl, function (err, res, body) {
            if (err || !body) {
                console.error(`getInit: failed to fetch ${probeUrl}`, err);
                return;
            }

            const $ = cheerio.load(body);
            const pagerText = $('.pager').eq(1).text();

            // Capture the digits between "共" and "页" (the total page count).
            const countMatch = /共(\d+)页/.exec(pagerText);
            if (!countMatch) {
                console.error('getInit: could not find the page count in the pager text');
                return;
            }

            const pages = Number.parseInt(countMatch[1], 10);
            // NOTE(review): assumes the site numbers list pages from 1 — confirm.
            for (let page = 1; page <= pages; page++) {
                getInfo(page);
            }
        });
    };

    // Entry point: start the crawl when the enclosing function runs.
    getInit()


})()

原文地址:https://www.cnblogs.com/dshvv/p/8016639.html