node爬虫之图片下载

背景:有些玩家想换头像,却不知道该用什么头像。作为一名代码爱好者,能用程序解决的,就不用手动一张张去找。说干就干,于是整理了下面这个小项目。

效果图

环境配置

  • 安装node环境
  • node -v
  • node版本最好在8.11.1以上

项目结构

assets是存放所下载的图片

static是静态资源页面

eg.js是下载图片示例(node eg.js)

img.json是网页所获取的json数据

index.js属于服务端

安装依赖

npm init ( 会生成一个package.json)

npm i express --save-dev

npm i cheerio --save-dev

npm i superagent --save-dev

npm i superagent-charset --save-dev

npm i request --save-dev

代码区

1. eg.js

var fs = require('fs');
var request = require("request");
var path = require('path');
// Demo: stream one remote image to disk.
// NOTE(review): the source URL ends in .jpg but the file is saved as .png;
// the target name is kept as-is — confirm whether the extension is intended.
var src = "https://pic.qqtn.com/up/2019-6/2019061811092772406.jpg";
var target = './assets/aa.png';

// createWriteStream emits an ENOENT error when the folder is missing,
// so make sure it exists before opening the file.
if (!fs.existsSync('./assets')) {
    fs.mkdirSync('./assets');
}

var writeStream = fs.createWriteStream(target);
var readStream = request(src)
readStream.pipe(writeStream);

readStream.on('end', function() {
    console.log('文件下载成功');
});
// The handler must declare `err`; the original referenced an undefined name
// and threw a ReferenceError instead of reporting the failure.
readStream.on('error', function(err) {
    console.log("错误信息:" + err)
    // Release the file handle without emitting a misleading 'finish'.
    writeStream.destroy();
})
// Without this listener a disk error would crash the process as an
// unhandled 'error' event.
writeStream.on('error', function(err) {
    console.log("错误信息:" + err)
});
// 'finish' fires after the stream has already ended, so no extra end() call
// is needed here.
writeStream.on("finish", function() {
    console.log("文件写入成功");
});

2.index.js

var superagent = require('superagent');
var charset = require('superagent-charset');
charset(superagent);
var express = require('express');
var baseUrl = 'https://www.qqtn.com/';
const cheerio = require('cheerio');
var request = require("request");
var fs = require('fs')
var path = require('path')

// Express application; everything under ./static is served as static files.
var app = express();
app.use(express.static('static'));

// Snapshot, taken once at startup, of whether the "assets" folder exists.
var checkDir = fs.existsSync("assets");
/**
 * Turn a scraped image title into a file name that stays inside ./assets.
 * Path separators and characters that are illegal in file names are replaced
 * with "_"; an empty or dot-only title falls back to `fallback`.
 *
 * @param {*} title - title attribute scraped from the page (may be undefined)
 * @param {string} fallback - name to use when the title is unusable
 * @returns {string} file name ending in ".jpg"
 */
function toAssetFileName(title, fallback) {
    const cleaned = String(title == null ? '' : title).replace(/[\\/:*?"<>|\x00-\x1f]/g, '_');
    const usable = cleaned.replace(/[.\s]/g, '') !== '';
    return (usable ? cleaned : fallback) + '.jpg';
}

/**
 * Make sure the "assets" folder exists, then call `done(ok)`.
 * An already existing folder counts as success.
 *
 * @param {function(boolean)} done - receives true when the folder is usable
 */
function ensureAssetsDir(done) {
    fs.mkdir('assets', function (error) {
        if (error && error.code !== 'EEXIST') {
            console.log(error);
            done(false);
            return;
        }
        if (!error) {
            console.log('创建目录成功');
        }
        done(true);
    });
}

/**
 * Add `{ route, items }` to the JSON cache file unless the route is already
 * stored, then call `done(pages)` with the full list of cached pages.
 * `done` is only called after the write has finished, so callers may respond
 * to the client knowing the file is in place.
 *
 * @param {string} cacheFile - absolute path of img.json
 * @param {string} route - page route used as the cache key
 * @param {Array<Object>} items - scraped entries for that route
 * @param {function(Array<Object>)} done - receives the cached pages
 */
function cachePage(cacheFile, route, items, done) {
    fs.readFile(cacheFile, function (readErr, raw) {
        if (readErr && readErr.code !== 'ENOENT') {
            // The file exists but cannot be read: leave it untouched.
            console.log(readErr);
            done([{ route, items }]);
            return;
        }

        let pages = [];
        if (!readErr) {
            try {
                const parsed = JSON.parse(raw.toString());
                if (Array.isArray(parsed)) {
                    pages = parsed;
                }
            } catch (parseErr) {
                // A corrupt cache is rebuilt from scratch instead of crashing the server.
                console.log(parseErr);
            }
        }

        const alreadyCached = pages.some(function (page) {
            return page && page.route === route;
        });
        if (alreadyCached) {
            done(pages);
            return;
        }

        const merged = pages.concat([{ route, items }]);
        fs.writeFile(cacheFile, JSON.stringify(merged), function (writeErr) {
            if (writeErr) {
                console.log(writeErr);
            } else {
                console.log('保存成功');
            }
            done(merged);
        });
    });
}

/**
 * Download every thumbnail of every cached page into ./assets.
 * Failures are logged per image and never abort the remaining downloads.
 *
 * @param {Array<Object>} pages - entries shaped like { route, items }
 */
function downloadThumbnails(pages) {
    pages.forEach(function (page, pageIdx) {
        const items = page && Array.isArray(page.items) ? page.items : [];
        items.forEach(function (item, itemIdx) {
            if (!item || !item.thumbSrc) {
                return;
            }
            // The ".jpg" suffix is fixed; it could be derived from the URL instead.
            const target = './assets/' + toAssetFileName(item.title, 'image_' + pageIdx + '_' + itemIdx);
            try {
                // request() can fail synchronously on a malformed URL, hence the try/catch.
                const download = request(item.thumbSrc);
                const out = fs.createWriteStream(target);
                out.on('error', function (err) {
                    console.log(err);
                });
                download.on('error', function (err) {
                    console.log(err);
                    out.destroy();
                });
                download.pipe(out);
            } catch (err) {
                console.log(err);
            }
        });
    });
}

/**
 * GET /index?type=<name>&page=<n>
 * Scrapes one avatar list page, stores the result in img.json, answers with
 * the scraped items and then downloads the thumbnails in the background.
 * Success: { code: 200, msg: "", data: items }.
 * Failure: { code: 400, msg: <error>, sets: [] }.
 */
app.get('/index', function (req, res) {
    // CORS response headers. Both allowed request headers go into one value:
    // setting the header twice keeps only the last call.
    res.header("Access-Control-Allow-Origin", "*");
    res.header('Access-Control-Allow-Methods', 'PUT, GET, POST, DELETE, OPTIONS');
    res.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type");

    console.log(req.query, '类型')
    // Avatar category and page number, with defaults.
    var type = req.query.type || 'weixin';
    var page = req.query.page || '1';
    // Both values end up in the upstream URL and in the cache key, so only
    // plain word characters / digits are accepted.
    if (!/^\w+$/.test(type) || !/^\d+$/.test(page)) {
        res.json({ code: 400, msg: 'invalid type or page', sets: [] });
        return;
    }

    var route = `tx/${type}tx_${page}.html`
    var cacheFile = path.join(__dirname, 'img.json');

    // The target site is encoded as gb2312, hence .charset('gb2312');
    // a utf-8 page would use .charset('utf-8').
    superagent.get(baseUrl + route)
        .charset('gb2312')
        .end(function (err, sres) {
            var items = [];
            if (err) {
                console.log('ERR: ' + err);
                res.json({ code: 400, msg: err, sets: items });
                return;
            }

            var $ = cheerio.load(sres.text);
            $('div.g-main-bg ul.g-gxlist-imgbox li a').each(function (idx, element) {
                var $element = $(element);
                items.push({
                    title: $element.attr('title'),
                    href: $element.attr('href'),
                    thumbSrc: $element.find('img').attr('src')
                });
            });

            // Respond only once the cache is on disk, so a follow-up /show
            // request sees this page.
            cachePage(cacheFile, route, items, function (pages) {
                res.json({ code: 200, msg: "", data: items });
                ensureAssetsDir(function (ok) {
                    if (ok) {
                        downloadThumbnails(pages);
                    }
                });
            });
        })
});
/**
 * GET /show
 * Returns the content of img.json as a JSON-encoded string (the browser
 * script parses the string a second time to get the page list).
 * When the file cannot be read, an empty list "[]" is returned in the same
 * string-wrapped format so the request never hangs; any failure other than
 * "file does not exist yet" is reported with status 500.
 */
app.get('/show', (req, res) => {
    fs.readFile(path.join(__dirname, 'img.json'), (err, data) => {
        if (err) {
            console.log(err)
            if (err.code !== 'ENOENT') {
                res.status(500)
            }
            res.json('[]')
            return
        }
        res.json(data.toString())
    })
})
// Start the HTTP server on port 8081 and report the bound address once it is
// listening (the original computed host and port but never used them).
var server = app.listen(8081, function () {
    var address = server.address();
    console.log('Server listening at http://%s:%s', address.address, address.port);
})
View Code

3.static文件夹下index.html

<!DOCTYPE html>
<!-- Host page for the image list; it contains no markup of its own. -->
<html lang="en">
<head>
  <meta charset="UTF-8">
  <!-- Render at device width on mobile browsers. -->
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <!-- Ask Internet Explorer to use its newest rendering engine. -->
  <meta http-equiv="X-UA-Compatible" content="ie=edge">
  <title>Document</title>
</head>
<body>
  <!-- Loads index.js from the same folder as this page. -->
  <script src="./index.js"></script>
</body>
</html>

4.static文件夹下index.js

/**
 * Tell whether `src` resolves to an http(s) URL. Scraped values are
 * untrusted, so anything else (e.g. "javascript:") is not rendered.
 *
 * @param {*} src - candidate image URL
 * @returns {boolean}
 */
function isHttpUrl(src) {
  try {
    const protocol = new URL(src, window.location.href).protocol;
    return protocol === 'http:' || protocol === 'https:';
  } catch (err) {
    return false;
  }
}

/**
 * Build `<a href><img width="200" height="200"></a>` through the DOM API so
 * the URL is never parsed as HTML.
 *
 * @param {string} src - image URL used for both the link and the thumbnail
 * @returns {HTMLAnchorElement}
 */
function createThumbnailLink(src) {
  const link = document.createElement('a');
  link.href = src;
  const img = document.createElement('img');
  img.src = src;
  img.width = 200;
  img.height = 200;
  link.appendChild(img);
  return link;
}

// 1. Ask the server to scrape a page; 2. load the cached page list;
// 3. replace the body content with one thumbnail per cached item.
fetch('/index', {
  method: 'GET'
}).then(res => {
  return res.json()
}).then(result => {
  if (Number(result.code) !== 200) {
    throw new Error('GET /index failed with code ' + result.code)
  }
  return fetch('/show', {
    method: 'GET'
  })
}).then(res => {
  return res.json()
}).then(payload => {
  // /show wraps the list in a JSON string, so it needs a second parse.
  const pages = typeof payload === 'string' ? JSON.parse(payload) : payload
  console.log(pages, pages.length)

  const fragment = document.createDocumentFragment()
  pages.forEach(page => {
    console.log(page)
    page.items.forEach(item => {
      if (isHttpUrl(item.thumbSrc)) {
        fragment.appendChild(createThumbnailLink(item.thumbSrc))
      }
    })
  })
  document.body.textContent = ''
  document.body.appendChild(fragment)
}).catch(err => {
  // A single handler covers network errors, bad JSON and a non-200 result.
  console.error(err)
})

总结

写到这里基本就结束了。对于 node 我还是怀着一颗敬畏的心,摸摸索索终于把这个 demo 写完了。项目也已经传到 GitHub 了,如有需要可私信。

原文地址:https://www.cnblogs.com/gaoht/p/11303611.html