node

cheerio 模块

安装

cnpm install cheerio

使用方法

// Load an HTML fragment; load() returns the `$` function used below to query it.
const cheerio = require('cheerio')
const markup = '<h2 class="title">Hello world</h2>'
const $ = cheerio.load(markup)

// Replace the heading text, then add an extra class to every <h2>.
const heading = $('h2.title')
heading.text('Hello there!')
$('h2').addClass('welcome')

// Serialize the modified document back to an HTML string.
$.html()
//=> <html><head></head><body><h2 class="title welcome">Hello there!</h2></body></html>

request模块

var request = require('request');

// Callback invoked by request with (error, response, body); logs each of them.
const reportResult = (error, response, body) => {
  // Print the error if one occurred.
  console.log('error:', error);
  // Print the response status code if a response was received.
  const statusCode = response && response.statusCode;
  console.log('statusCode:', statusCode);
  // Print the HTML for the Google homepage.
  console.log('body:', body);
};

request('http://www.google.com', reportResult);

爬虫系统 request cheerio

爬取一个网站的内容信息
分析内容
储存数据、下载图片

简单的爬虫

//发起服务端请求 请求一个网页 
const request = require('request')
const  fs= require('fs')
const path= require('path') 
const cheerio = require('cheerio')
// Use the Baidu home page as the example target.
const url = 'https://www.baidu.com/';
// Downloaded images are stored in an img folder next to this script, like baidu.html.
const imgDir = path.join(__dirname, 'img');

/**
 * Resolve the raw `src` attribute of an <img> against the page URL.
 *
 * Protocol-relative (`//host/a.png`) and relative (`/a.png`) values are
 * resolved against `pageUrl`; values that already carry a scheme are kept.
 *
 * @param {string|undefined} src - Raw attribute value; undefined when the tag has no src.
 * @param {string} pageUrl - URL of the page the tag was found on.
 * @returns {string|null} Absolute http(s) URL, or null when src is missing,
 *   cannot be parsed, or uses another scheme (for example `data:`).
 */
function resolveImgSrc(src, pageUrl) {
  if (!src) {
    return null;
  }
  try {
    const resolved = new URL(src, pageUrl);
    return /^https?:$/.test(resolved.protocol) ? resolved.href : null;
  } catch (parseErr) {
    // new URL throws on a value it cannot parse; such an entry is skipped.
    return null;
  }
}

request(url, (err, response, body) => {
  if (err) {
    // Without a body there is nothing to save or parse.
    console.log(err);
    return;
  }

  // Save the fetched HTML to baidu.html beside this script.
  fs.writeFile(path.join(__dirname, './baidu.html'), body, (writeErr) => {
    if (writeErr) {
      console.log('爬取失败');
    } else {
      console.log('爬取成功');
    }
  });

  // Collect the absolute URL of every <img> on the page.
  const $ = cheerio.load(body);
  const imgs = [];
  $('img').each((index, ele) => {
    const src = resolveImgSrc($(ele).attr('src'), url);
    if (src !== null) {
      imgs.push(src);
    }
  });

  // Create the target folder up front so the write streams below do not fail
  // when it does not exist yet.
  fs.mkdirSync(imgDir, { recursive: true });

  // Download every collected image whose URL contains "png"; files are named
  // after the image's position in the collected list.
  imgs.forEach((src, index) => {
    if (!src.includes('png')) {
      return;
    }
    const target = path.join(imgDir, `${index}.png`);
    request(src)
      // Log a failed download instead of letting the 'error' event go unhandled.
      .on('error', (downloadErr) => console.log(downloadErr))
      .pipe(fs.createWriteStream(target))
      .on('error', (fileErr) => console.log(fileErr));
  });
});