使用 Node.js 爬虫爬取网站内容

要会的知识点

jQuery 的 $ 函数的第二个参数

1.jQuery(selector, [context])

这种用法相当于 $(context).find(selector);当 context 本身已经是 jQuery 对象时,也可以直接写成 context.find(selector)。

依赖的库、需要用到的东西

const url = require('url');
const util = require('util');
// bluebird: third-party Promise implementation that is compatible with the
// native Promise and adds extra utilities. Note that this binding shadows the
// global Promise for the rest of the module.
const Promise = require('bluebird');
const request = require('request');  // HTTP client for issuing requests from the server side
const iconv = require('iconv-lite');  // handles encodings Node does not support natively, e.g. GBK
const cheerio = require('cheerio');  // jQuery-style core API for working with the DOM on the server
const sample = require('lodash/sample');  // returns one random element from a collection
// isEmpty reports whether a value is an empty object, collection, map or set:
//   isEmpty(null) => true, isEmpty(true) => true, isEmpty(1) => true,
//   isEmpty([1, 2, 3]) => false, isEmpty({ 'a': 1 }) => false
const isEmpty = require('lodash/isEmpty');
const Logger = require('../Logger');  // log output helper
const configs = require('../configs');  // environment-specific settings
const logger = new Logger('namespace');

// Desktop browser User-Agent strings, used to make requests look like they
// come from a regular browser.
const USER_AGENT = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
];

Logger.js 日志输出文件

/** 日志输出接口 **/

const debug = require('debug'); // 按命名空间(namespace)开关输出的轻量级调试日志库

/**
 * Namespaced logger built on the `debug` package.
 *
 * Every message is written through a debug instance named
 * `app:<level>:<namespace>`, where <level> is `error`, `info` or `debug`.
 * `info`/`log`/`debug` route their output through `console.log`; `error`
 * keeps whatever output function the debug instance already has.
 */
class Logger {
    /**
     * @param {string} namespace - Suffix appended to each level's debug namespace.
     */
    constructor(namespace) {
        this.namespace = namespace;
        this.logger = debug('app');
        // Extended debug instances, keyed by "<level>:<namespace>".
        this.channels = new Map();
    }

    /**
     * Internal helper: returns the debug instance for a level, creating it on
     * first use and reusing it afterwards.
     *
     * Reuse matters because each debug instance tracks its own previous-call
     * timestamp; building a fresh instance per message would always report a
     * zero time diff and would allocate two instances for every message.
     * The current namespace is part of the cache key, so reassigning
     * `this.namespace` still takes effect on the next call.
     *
     * @param {string} level - Level segment of the namespace.
     * @param {boolean} useStdout - When true, output goes through `console.log`.
     * @returns {Function} The debug instance to call with the message arguments.
     */
    channel(level, useStdout) {
        const key = `${level}:${this.namespace}`;
        let instance = this.channels.get(key);
        if (instance === undefined) {
            instance = this.logger.extend(level).extend(this.namespace);
            if (useStdout) {
                instance.log = console.log.bind(console);
            }
            this.channels.set(key, instance);
        }
        return instance;
    }

    /**
     * Logs at the `error` level without overriding the output function.
     * @param {...*} arg - Values forwarded to the debug instance.
     */
    error(...arg) {
        this.channel('error', false)(...arg);
    }

    /**
     * Logs at the `info` level through `console.log`.
     * @param {...*} arg - Values forwarded to the debug instance.
     */
    info(...arg) {
        this.channel('info', true)(...arg);
    }

    /**
     * Alias for {@link Logger#info}.
     * @param {...*} arg - Values forwarded to the debug instance.
     */
    log(...arg) {
        this.info(...arg);
    }

    /**
     * Logs at the `debug` level through `console.log`.
     * @param {...*} arg - Values forwarded to the debug instance.
     */
    debug(...arg) {
        this.channel('debug', true)(...arg);
    }
}

module.exports = Logger;

configs.js 配置服务器

const md5 = require('md5');
const get = require('lodash/get');
// Settings used for every environment other than production.
const development = {
    // API root, plus the default user and password used to log in when calling the API.
    apiRoot: 'http://127.0.0.1/xxx',
    sessionUser: 'admin',
    sessionPassword: md5('test1234567'),
};

// Settings used when NODE_ENV is exactly 'production'.
const production = {
    // API root, plus the default user and password used to log in when calling the API.
    apiRoot: 'http://127.0.0.1/xxx',
    sessionUser: 'admin',
    sessionPassword: md5('test1234567'),
};

// Active environment name. The arguments to get() are (object, path, defaultValue),
// so 'development' is used when NODE_ENV is not set.
const env = get(process.env, 'NODE_ENV', 'development');

// Export exactly one settings object for the active environment.
if (env === 'production') {
    module.exports = production;
} else {
    module.exports = development;
}
原文地址:https://www.cnblogs.com/it-Ren/p/13897240.html