Node.js 小爬虫 imooc 2016.03.06

爬虫目标:获取http://www.imooc.com/learn/348网页中的章节标题和视频信息。

var http = require('http');
var cheerio = require('cheerio');
// Address of the course page that the script downloads and parses.
var url = 'http://www.imooc.com/learn/348';

/**
 * Extract the chapter titles and video entries from the course page markup.
 *
 * @param {string} html - Markup of the course page, handed to cheerio.load().
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *   One entry per `.chapter` element, in document order. A video's `id` is
 *   the part of its link's href after "video/", or undefined when the link
 *   or its href is missing or does not contain "video/".
 */
function filterChapters(html) {
	var $ = cheerio.load(html);
	var courseData = [];

	// Walk every chapter; cheerio passes (index, element) to the callback.
	$('.chapter').each(function(chapterIndex, chapterElement) {
		var chapter = $(chapterElement);
		var chapterData = {
			'chapterTitle': chapter.find('strong').text(),
			'videos': []
		};

		// Walk every video row of this chapter.
		chapter.find('.video').children('li').each(function(videoIndex, videoElement) {
			var video = $(videoElement).find('.studyvideo');
			// attr() yields undefined when the row has no .studyvideo link or
			// the link has no href; guard so one odd row cannot abort the
			// whole scrape with a TypeError.
			var href = video.attr('href');
			var id = typeof href === 'string' ? href.split('video/')[1] : undefined;

			chapterData.videos.push({
				'title': video.text(),
				'id': id
			});
		});

		// Add this chapter's collected data to the course listing.
		courseData.push(chapterData);
	});
	return courseData;
}

/**
 * Print the scraped course listing to the console.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: *}>}>} courseData
 *   Chapters to print, in order.
 */
function printCourseInfo(courseData) {
	// One video per console.log call: "  [id]title" plus a trailing newline.
	function logVideo(entry) {
		var line = '  [' + entry.id + ']' + entry.title + '\n';
		console.log(line);
	}

	// Chapter heading first, then each of its videos in order.
	function logChapter(section) {
		var heading = section.chapterTitle;
		console.log(heading + '\n');
		section.videos.forEach(logVideo);
	}

	courseData.forEach(logChapter);
}

// Download the course page, then parse it and print the chapter/video listing.
http.get(url, function(res) {
	// A redirect or error page would otherwise be parsed as if it were the
	// course page and silently produce an empty listing.
	if (res.statusCode !== 200) {
		console.error('error!! unexpected HTTP status ' + res.statusCode);
		// Discard the body so the response is consumed and the socket freed.
		res.resume();
		return;
	}

	var html = '';

	// Decode as UTF-8 in the stream so a multi-byte character that is split
	// across two chunks is not corrupted by per-chunk Buffer-to-string
	// conversion.
	res.setEncoding('utf8');

	res.on('data', function(chunk) {
		html += chunk;
	});

	res.on('end', function() {
		var courseData = filterChapters(html);
		printCourseInfo(courseData);
	});
}).on('error', function(err) {
	// Include the underlying reason instead of a bare marker.
	console.error('error!! ' + err.message);
});

原文地址:https://www.cnblogs.com/daisykoo/p/5522297.html