Node.js 实现的图片抓取爬虫脚本 —— crawler.js

仅做了必要的注释(我太懒了)。目前只是一个雏形,只实现了基本的需求,有时间会再修改得完善一些。

/*
 * @Author: jiahaiLiu
 * @Date:   2017-07-17 10:44:03
 * @Last Modified by:   jiahaiLiu
 * @Last Modified time: 2017-07-17 18:53:48
 * @Usage: node crawler [100]
 */

'use strict';

/*
 * Request is designed to be the simplest way possible to make http calls. 
 * It supports HTTPS and follows redirects by default.
 */
const request = require('request');
// cheerio: a fast, flexible and lean implementation of core jQuery designed specifically for the server.
const cheerio = require('cheerio');
/*
 * Async is a utility module which provides straight-forward,
 * powerful functions for working with asynchronous JavaScript.
 */
const async = require('async');
const path = require('path');
const fs = require('fs');
const url = require('url');
/**
 * Parse the optional download count given on the command line.
 *
 * @param {string|undefined} raw - Raw command-line value, e.g. process.argv[2].
 * @returns {number} The parsed positive integer, or 100 when the value is
 *     missing, not numeric, or not greater than zero.
 */
function parseTargetAmount(raw) {
    const parsed = Number.parseInt(raw, 10);
    return Number.isInteger(parsed) && parsed > 0 ? parsed : 100;
}

// Number of images to download (`node crawler 250`); defaults to 100.
// Kept as a real number so later comparisons never depend on string coercion.
const targetAmount = parseTargetAmount(process.argv[2]);
// Example of a complete request URL equivalent to what urlObj produces:
// http://image.so.com/j?q=%E7%BE%8E%E5%A5%B3&src=srp&correct=%E7%BE%8E%E5%A5%B3&sn=61&pn=60&sid=7e73fad3c0eb8367ede610dcf2784c0e&ran=0&ras=0
// Folder the downloaded pictures are stored in.
const collect_pic_dir = './collect_pic/';

const imgList = []; // collected image links
let start = 0; // running index used to name the downloaded files
const t1 = new Date().getTime(); // script start time, used for the elapsed-time report
// Legacy URL object passed to url.format(); `query.sn` and `query.sid` are
// rewritten between requests to move on to the next result page.
const urlObj = {
    protocol: 'http:',
    slashes: true,
    auth: null,
    host: 'image.so.com',
    port: null,
    hostname: 'image.so.com',
    hash: null,
    query: {
        q: '美女',
        src: 'srp',
        correct: '美女',
        sn: '0',
        pn: '60',
        sid: '7e73fad3c0eb8367ede610dcf2784c0e',
        ran: '0',
        ras: '0'
    },
    pathname: '/j',
};
// Most recently requested URL and most recently parsed response.
let urlLink,
    resObj;

// Make sure the output folder exists before any download starts.
if (!fs.existsSync(collect_pic_dir)) {
    fs.mkdirSync(collect_pic_dir);
    console.log('The ' + collect_pic_dir + ' folder has been created!');
}

// Collect the image links first, then download them.
loop(asyncDownload);

/**
 * Page through the image-search JSON endpoint described by `urlObj`,
 * collecting image links into `imgList`, and call `cb` once collection stops.
 *
 * Collection stops when `targetAmount` links are available (any surplus from
 * the last page is dropped), when the source reports `end`, when a page yields
 * no usable link, or when a request fails. In the last three cases the reason
 * is logged and `cb` is still called if at least one link was collected.
 *
 * The fields read from each response look like this (abridged):
 *   {
 *       total: 1500,
 *       end: false,
 *       sid: "6b57a007f19740b44d562f6e0ec6e050",
 *       lastindex: 121,
 *       list: [{
 *           index: 61,
 *           width: "1000",
 *           height: "1504",
 *           imgtype: "JPEG",
 *           img: "http://img165.poco.cn/mypoco/myphoto/20111030/05/54704062201110300502223689419360167_010.jpg",
 *           thumb: "http://p0.so.qhmsg.com/t01da6596eb67097425.jpg"
 *       }]
 *   }
 *
 * @param {function()} cb - Invoked with no arguments when collection is over.
 */
function loop(cb) {
    // Log why paging stopped, then continue with whatever was collected.
    function stopPaging(reason) {
        console.log(reason);
        if (imgList.length > 0) {
            cb();
        }
    }

    urlLink = url.format(urlObj);
    console.log(urlLink);
    request(urlLink, function(err, res, body) {
        if (err) {
            stopPaging(err);
            return;
        }
        if (res.statusCode !== 200) {
            stopPaging('unexpected status code ' + res.statusCode + ' from ' + urlLink);
            return;
        }
        try {
            resObj = JSON.parse(body);
        } catch (parseErr) {
            // An HTML error page or truncated body must not crash the script.
            stopPaging(parseErr);
            return;
        }

        // Tolerate a missing list and entries without an image link.
        const list = resObj && Array.isArray(resObj.list) ? resObj.list : [];
        let added = 0;
        list.forEach(function(item) {
            if (item && item.img) {
                imgList.push(item.img);
                added++;
            }
        });

        // targetAmount may still be a command-line string; compare numerically.
        const wanted = Number(targetAmount);
        if (imgList.length >= wanted) {
            // Pages arrive in fixed-size batches; keep only what was asked for.
            imgList.splice(Math.max(0, Math.floor(wanted)));
            cb();
            return;
        }

        // A page without usable links is treated like the end of the data,
        // otherwise the same request could be repeated forever.
        if (added === 0 || resObj.end) {
            stopPaging('no more datas from source url');
            return;
        }

        // Continue after the last index the server reported, in the same session.
        urlObj.query.sn = resObj.lastindex + 1;
        urlObj.query.sid = resObj.sid;
        loop(cb);
    });
}

// download picture
/**
 * Start a download for every link in `imgList`, one at a time.
 *
 * Each file is saved in `collect_pic_dir` as `<n>.jpg`, where `n` is the
 * module-level counter `start`, advanced once per link. A 400 ms pause
 * precedes each download so the requests are spaced out.
 *
 * NOTE: downloadPic() is called here without any completion hook, so the
 * series moves on as soon as a download has been started. The final message
 * and elapsed time therefore mark when the last download was kicked off, not
 * necessarily when the last file was fully written.
 */
function asyncDownload() {
    console.log('图片总数:', imgList.length);
    async.mapSeries(imgList, function(item, callback) {
        setTimeout(function() {
            // Reserve the file index before signalling completion, so the
            // next item can never reuse the same file name.
            const target = collect_pic_dir + start + '.jpg';
            start++;
            try {
                downloadPic(item, target);
            } catch (downloadErr) {
                // One unusable link must not abort the remaining downloads.
                console.log(downloadErr);
            }
            callback(null, item);
        }, 400);
    }, function(err) {
        if (err) {
            console.log(err);
            return;
        }
        const t2 = new Date().getTime();
        console.log('全部完成,总耗时:', (t2 - t1) + 'ms');
    });
}

    /**
     * Stream one remote image into a local file.
     *
     * A download counts as failed when the request emits an error, the
     * response status is not 200, or the file cannot be written. Failures are
     * logged and the partially written file is removed, so no empty or
     * error-page ".jpg" is left behind.
     *
     * @param {string} src - URL of the image to fetch.
     * @param {string} dest - Path of the file to write.
     * @param {function(?Error)} [done] - Optional. Called exactly once: with
     *     `null` after the file has been completely written, or with the error
     *     once a failed download has been cleaned up.
     */
    function downloadPic(src, dest, done) {
        const out = fs.createWriteStream(dest);
        let settled = false;

        // Forward the outcome to the optional callback.
        function notify(err) {
            if (typeof done === 'function') {
                done(err);
            }
        }

        // Record the first outcome only; later events for the same download
        // (e.g. a write error after an HTTP error) are ignored.
        function settle(err) {
            if (settled) {
                return;
            }
            settled = true;
            if (!err) {
                notify(null);
                return;
            }
            console.log(err);
            // Wait until the file handle is closed before deleting the file.
            out.once('close', function() {
                fs.unlink(dest, function() {
                    // Best effort: the file may never have been created.
                    notify(err);
                });
            });
            out.destroy();
        }

        out.on('error', settle);
        out.on('finish', function() {
            settle(null);
        });

        request
            .get(src)
            .on('response', function(response) {
                if (response.statusCode !== 200) {
                    settle(new Error('Download failed with status ' + response.statusCode + ': ' + src));
                }
            })
            .on('error', settle)
            .pipe(out);
    }
原文地址:https://www.cnblogs.com/xiaohaifengke/p/7698913.html