Node2.js

Node.js 简单爬虫示例:代码是跟着慕课网的课程写的,因为网站结构有一点点改动,所以做了相应调整,贴上来方便以后复习。

// Node's built-in HTTP client, used below to download the course page.
var http = require('http')
// cheerio is used by filterChapters to load the HTML and query it with selectors.
var cheerio = require('cheerio')
// Address of the course page that gets fetched and parsed.
var url = 'http://www.imooc.com/learn/348'


/**
 * Extracts the chapter/video outline from a course page's HTML.
 *
 * Every element matching `.chapter` yields one entry. The chapter title is the
 * text of the chapter's `h3`; the videos are the `li` children of `.video`,
 * each read from the `.J-media-item` link inside the list item.
 *
 * Result shape:
 *   [{ chapterTitle: '', videos: [{ title: '', id: '' }] }]
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array} One object per chapter, in document order. A video's `id`
 *   is whatever follows '/video/' in the link's href (undefined when the href
 *   does not contain that segment).
 */
function filterChapters(html){
    var $ = cheerio.load(html)

    // The page's text nodes carry indentation and line breaks from the markup;
    // collapse them so titles come back as single clean lines.
    function cleanText(text){
        return text.replace(/\s+/g, ' ').trim()
    }

    var courseData = []

    $('.chapter').each(function(){
        var chapter = $(this)
        var chapterData = {
            chapterTitle: cleanText(chapter.find('h3').text()),
            videos: []
        }

        chapter.find('.video').children('li').each(function(){
            var video = $(this).find('.J-media-item')
            var href = video.attr('href')

            // A list item without a media link has no href; skip it instead of
            // crashing on `undefined.split(...)`.
            if (typeof href !== 'string') {
                return
            }

            chapterData.videos.push({
                title: cleanText(video.text()),
                id: href.split('/video/')[1]
            })
        })

        courseData.push(chapterData)
    })

    return courseData
}




/**
 * Prints the course outline to stdout: each chapter title, followed by one
 * line per video in the form `【id】title`.
 *
 * @param {Array} courseData - Chapter objects with `chapterTitle` and a
 *   `videos` array of `{ title, id }` entries.
 */
function printCourseInfo(courseData){
    courseData.forEach(function(item){
        // The explicit '\n' adds a blank line after each entry, on top of the
        // newline console.log already appends.
        console.log(item.chapterTitle + '\n')

        item.videos.forEach(function(video){
            console.log('【' + video.id + '】' + video.title + '\n')
        })
    })
}




// Download the course page, then parse it and print the chapter outline.
http.get(url, function(res){
    // Anything other than 200 has no course page to parse; report it and
    // drain the response so the socket is released.
    if (res.statusCode !== 200) {
        console.log('获取课程数据出错 (HTTP ' + res.statusCode + ')')
        res.resume()
        return
    }

    // Decode as UTF-8 at the stream level so a multi-byte character split
    // across two chunks is not corrupted by per-chunk string conversion.
    res.setEncoding('utf8')

    var html = ''

    res.on('data', function(chunk){
        html += chunk
    })

    res.on('end', function(){
        var courseData = filterChapters(html)
        printCourseInfo(courseData)
    })
}).on('error', function(err){
    // Include the underlying reason instead of discarding the error object.
    console.log('获取课程数据出错', err.message)
})

效果

数据还有一点没整理好,得日后再弄。

就是把不想要的内容也抓取回来了,现在还不懂怎么把数据清洗干净,先记下来。

原文地址:https://www.cnblogs.com/ironSheet-SRS/p/9990640.html