使用 Node.js 抓取网页图片

const https = require('https')
const http = require('http') /* 方式二时使用*/
const fs = require('fs')
const cheerio = require('cheerio')
const request = require('request')
const path = require('path');

// Absolute path of the `img` folder located next to this script.
const imgDir = path.join(__dirname, 'img');

// Address of a single page; it is the same page as the second `list1` entry.
let url = 'https://www.3dmgame.com/gl/3749617.html';

// Pages to scan. Each entry pairs a page `url` with `title`, the directory
// (relative to the current working directory) passed along with it to getImg.
const list1 = [
  {
    url: "https://www.3dmgame.com/gl/3748911.html",
    title: "./img/战士",
  },
  {
    url: "https://www.3dmgame.com/gl/3749617.html",
    title: "./img/猎人",
  },
  {
    url: "https://www.3dmgame.com/gl/3749938.html",
    title: "./img/机器人",
  },
];

/**
 * Download the filtered images referenced by a web page into a directory.
 *
 * The page at `url` is fetched over HTTPS and parsed with cheerio. Every
 * `<img>` whose `src` contains the 3dmgame news-image prefix and does not end
 * in ".jpg" is streamed to `<title>/<n>_<name>.png`, where `n` is a per-page
 * counter starting at 1 and `name` is the six characters that precede the
 * URL's last four characters.
 *
 * An alternative to streaming with `request(...).pipe(...)` is to fetch each
 * image with `http.get`, collect the body with binary encoding and save it
 * with `fs.writeFile`; the streaming approach is the one used here.
 *
 * @param {string} url - HTTPS address of the page to scan.
 * @param {string} title - Directory that receives the images; it is created
 *   recursively when missing.
 * @returns {void} Progress and failures are only reported on the console.
 */
const getImg = (url, title) => {
  const newsImagePrefix = "https://img.3dmgame.com/uploads/images/news";

  // Create the target directory synchronously before any download starts.
  fs.mkdirSync(`${title}`, { recursive: true });

  https.get(url, (res) => {
    // Sanity-check the response before reading its body.
    const { statusCode } = res;
    const contentType = res.headers['content-type'];
    console.log(statusCode, contentType);

    let err = null;
    if (statusCode !== 200) {
      err = new Error('请求状态错误');
    } else if (!/^text\/html/.test(contentType)) {
      err = new Error('请求类型错误');
    }

    if (err) {
      console.log(err);
      res.resume(); // Discard the unread body so the socket can be released.
      return;
    }

    // Let the stream decode UTF-8 itself so that a multi-byte character split
    // across two chunks is not corrupted.
    res.setEncoding('utf8');
    let resData = '';
    res.on('data', (chunk) => {
      resData += chunk;
    });
    res.on('end', () => {
      const $ = cheerio.load(resData);
      const images = $('img');
      console.error(images.length);

      let id = 0;
      images.each((index, el) => {
        const imgUrl = $(el).attr('src');
        // Returning undefined only skips this element; the iteration goes on.
        if (!imgUrl || !imgUrl.includes(newsImagePrefix)) {
          return;
        }

        const ext = imgUrl.slice(-4);
        console.error(`ext=${ext}`);
        if (ext === ".jpg") {
          return;
        }
        console.error(imgUrl);

        id += 1;
        const fileName = `${id}_${imgUrl.slice(-10, -4)}.png`;
        const writeStream = fs.createWriteStream(path.join(`${title}`, fileName));
        const readStream = request(imgUrl);
        readStream.pipe(writeStream);

        readStream.on('end', () => {
          console.log('文件下载成功');
        });
        readStream.on('error', (downloadErr) => {
          console.log('错误信息:', downloadErr);
          // pipe() does not close the destination when the source fails.
          writeStream.destroy();
        });
        writeStream.on('finish', () => {
          console.log("文件写入成功");
        });
        // Without a listener a write failure would be thrown as an uncaught
        // 'error' event and terminate the process.
        writeStream.on('error', (writeErr) => {
          console.log('文件写入失败', writeErr);
        });
      });
      console.log('数据传输完毕');
    });
  }).on('error', (err) => {
    console.log('请求错误', err);
  });
};

// Start one scrape per configured page; getImg returns before its downloads
// complete, so the pages are processed concurrently.
for (const { url: pageUrl, title: targetDir } of list1) {
  getImg(pageUrl, targetDir);
}

这里主要是解析网页上的所有 img 标签,先按图片地址过滤,再把符合条件的图片下载到本地。

原文地址:https://www.cnblogs.com/gongzhuiau/p/15241474.html