Node.js 爬虫

初学 Node.js 爬虫,记录一下:

var request = require('request');
var iconv = require('iconv-lite'); //转码
var cheerio = require('cheerio');
var fs = require("fs");

// Site-relative paths of the pages to scrape. Each entry has the form
// 'categories/<id>.html'; only the numeric ids are listed here and the
// surrounding path is attached by the map() below. Order is significant
// only in that it fixes the order in which requests are issued.
var hrefArr = [
    '2017627968', '2017627967', '2017627966', '2017627965', '2017627964',
    '2017627963', '2017627962', '2017627961', '2017627960',
    '2017627977', '2017627976', '2017627975', '2017627974', '2017627973',
    '2017627972', '2017627971', '2017627970', '2017627969',
    '2017627978', '2017627983', '2017627986', '2017627985', '2017627984',
    '2017627982', '2017627981', '2017627980', '2017627979',
    '2017627994', '2017627993', '2017627992', '2017627991', '2017627990',
    '2017627989', '2017627988', '2017627987',
    '2017627950',
    '20176281003', '20176281002', '20176281001', '20176281000', '2017628999',
    '2017628998', '2017628997', '2017628996', '2017628995',
    '20176281012', '20176281011', '20176281010', '20176281009', '20176281008',
    '20176281007', '20176281006', '20176281005', '20176281004',
    '20176281022', '20176281021', '20176281020', '20176281019', '20176281018',
    '20176281017', '20176281016', '20176281015', '20176281014',
    '20176281031', '20176281030', '20176281029', '20176281028', '20176281027',
    '20176281026', '20176281025', '20176281024', '20176281023',
    '2017627959', '2017627958', '2017627957', '2017627956', '2017627955',
    '2017627954', '2017627952', '2017627951', '2017627949'
].map(function (pageId) {
    return 'categories/' + pageId + '.html';
});

// Per-request time budget in milliseconds. The original script waited a flat
// 6000 ms before writing the output file; the same figure is reused here as
// the limit for each individual request so the run stays bounded.
var REQUEST_TIMEOUT_MS = 6000;

// Records scraped so far, in the order the responses arrive.
var viewInfo = [];

/**
 * Build one record from a downloaded page.
 *
 * @param {string} href - Absolute URL of the page; all of its digits,
 *     concatenated, become the record id.
 * @param {Buffer} body - Raw response bytes (requested with encoding:null),
 *     decoded here as gb2312.
 * @returns {Object} Record with id plus nine positional fields.
 */
function parseViewInfo(href, body) {
    var $ = cheerio.load(iconv.decode(body, 'gb2312'));
    var data = [];

    // Every element carrying height="22" is read as a "label:value" cell and
    // only the part between the first and second ':' is kept.
    // NOTE(review): a value that itself contains ':' (e.g. a time of day) is
    // truncated by this split — confirm against the real pages.
    $("[height='22']").each(function(index, element){
        var info = $(element).text().trim();
        var splitInfo = info.split(":");
        data.push(splitInfo[1]);
    });

    // The field names are bound purely by cell position; presumably this
    // matches the page layout — verify against the site if it changes.
    return {
        id: href.replace(/[^0-9]/ig,""),
        name: data[0],
        tel: data[1],
        qq: data[2],
        date: data[3],
        area: data[4],
        email: data[5],
        location: data[6],
        ip: data[7],
        phone: data[8],
    };
}

/**
 * Serialize everything collected in viewInfo to hrefInView.json.
 */
function writeViewInfo() {
    var data = JSON.stringify(viewInfo);
    console.log("准备写入文件");
    fs.writeFile('hrefInView.json', data, function(err) {
        if (err) {
            return console.error(err);
        }
        console.log("数据写入成功!");
    });
}

// Number of requests that have not yet finished (successfully or not). The
// file is written once this reaches zero instead of after a fixed delay, so
// slow responses are no longer dropped and fast runs do not wait needlessly.
var pending = hrefArr.length;

function onPageSettled() {
    pending -= 1;
    if (pending === 0) {
        writeViewInfo();
    }
}

if (pending === 0) {
    // Nothing to fetch: still produce the (empty) output file.
    writeViewInfo();
}

// forEach gives every request its own `href` binding. With the previous
// `for` + `var`, all callbacks shared one variable and therefore all
// records received the id of the last URL in the list.
hrefArr.forEach(function(path){
    var href = "http://www.yinghexinxi.cn/" + path;
    request.get({url:href,encoding:null,timeout:REQUEST_TIMEOUT_MS},function(err,response,body){
        if (err) {
            // A failed request has no body to decode; report it and move on.
            console.error(err);
        } else {
            try {
                viewInfo.push(parseViewInfo(href, body));
            } catch (parseErr) {
                // One malformed page must not abort the whole run.
                console.error(parseErr);
            }
        }
        onPageSettled();
    });
});
原文地址:https://www.cnblogs.com/muou2125/p/9400958.html