Node 简单爬虫

以爬慕课网Hadoop进阶课程为例，用Node写一个简单的爬虫：

先抓取这个网站的源码：

var http = require('http');
var url = 'http://www.imooc.com/learn/890';

http.get(url, function(res) {
    var html = '';

    res.on('data', function(data) {
        html += data;
    })

    res.on('end', function() {
        console.log(html);
    })
}).on('error', function() {
    console.log('获取课程数据出错！')
})

然后分析这个页面的Dom，如图：

每大章节都被一个chapter包围，抓取下来就是一个数组，对每个item，这张的大标题在strong里面，每章的小章节在video标签里，然后小标题就是J-media-item的text，id就用video的编号，字符串截取下来。

代码：

var http = require('http');
var cheerio = require('cheerio')
var url = 'http://www.imooc.com/learn/890';

function filterChapters(html) {
    var $ = cheerio.load(html)
    var chapters = $('.chapter')

    // [{
    //     chapterTitle: "",
    //     videos: [
    //         title: "",
    //         id: ""
    //     ]
    // }]

    var courseData = []
    chapters.each(function(item) {
        var chapter = $(this)
        var chapterTitle = chapter.find('strong').text()
        var videos = chapter.find('.video').children('li')
        var chapterData = {
            chapterTitle: chapterTitle,
            videos: []
        }

        videos.each(function(item) {
            var video = $(this).find('.J-media-item')
            var videoTitle = video.text()
            var id = video.attr('href').split('video/')[1]

            chapterData.videos.push({
                title: videoTitle,
                id: id
            })
        })
        courseData.push(chapterData)
    })
    return courseData
}

function printCourseInfo(courseData) {
    courseData.forEach(function(item) {
        var chapterTitle = item.chapterTitle
        console.log(chapterTitle + '
')

        item.videos.forEach(function(video) {
            console.log(video.id + video.title + '
');
        })
    })
}

http.get(url, function(res) {
    var html = '';

    res.on('data', function(data) {
        html += data;
    })

    res.on('end', function() {
        var courseData = filterChapters(html)
        printCourseInfo(courseData)
    })
}).on('error', function() {
    console.log('获取课程数据出错！')
})

当然爬下来的结果格式有点杂乱无章：

可以在加点代码格式化一下，也可以用IO写进磁盘文件。