JavaScript之从浏览器一键获取教务处个人课程信息【插件】

博主的个人网站(http://www.johnnyzen.cn/)每学期都需要更新所展示课程的静态信息。由于课程量多,手动抓取又很繁琐,一直想把这件事自动化。今天终于有点时间,于是把之前写 Node.js 爬虫的思路迁移到前端 JS 上,同时改进了抓取数据的算法,比之前的实现更加灵活高效。

声明:如读者需引用,必须在文章显著处声明或者与博主取得联系,以示尊重劳动成果,非常感谢 0.0

/**
 * Course scrapes timetable cells from the current page and parses them
 * into plain course objects.
 *
 * Each timetable cell is expected to hold one or more "course units": runs of
 * consecutive text lines (name, schedule, teacher, location, and optionally two
 * extra lines) separated by blank lines or by adjustment markers containing
 * "(换" or "(调".
 *
 * @constructor
 * @param {string} seletorForTds CSS selector matching the timetable cells,
 *                               e.g. "#Table1 td".
 */
var Course = function(seletorForTds){
    // Cells whose text is not longer than this are treated as empty/header cells.
    var MIN_CELL_TEXT_LENGTH = 20;

    // Lookup table for single Chinese numerals.
    var CHINESE_DIGITS = {
        "零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9
    };

    /**
     * Converts a single Chinese numeral character to its numeric value.
     *
     * @param {string} chiNum One of 零一二三四五六七八九.
     * @returns {number|undefined} The digit, or undefined for any other input.
     * @throws {Error} When chiNum is null or undefined.
     */
    var chineseToEnglishNumber = function(chiNum){
        if(chiNum == undefined) throw new Error("$ don't load html!");
        return Object.prototype.hasOwnProperty.call(CHINESE_DIGITS, chiNum)
            ? CHINESE_DIGITS[chiNum]
            : undefined;
    };

    // A "stop" line ends the current course unit: a blank line or a swap/adjust marker.
    function isStopLine(line){
        return line === "" || line.indexOf("(换") !== -1 || line.indexOf("(调") !== -1;
    }

    /**
     * Splits one cell's text into raw course units (arrays of lines), i.e. the
     * runs of consecutive non-stop lines. The units are not parsed yet.
     *
     * @param {{innerText: string}} cell A timetable cell.
     * @returns {string[][]} Non-empty runs of lines, in document order.
     */
    function generateCelltoCourseRawUnits(cell){
        var lines = cell.innerText.split(/\r?\n/); // split on line breaks
        var units = [];
        var current = [];
        for(var i = 0; i < lines.length; i++){
            if(isStopLine(lines[i])){
                if(current.length > 0){
                    units.push(current);
                    current = [];
                }
            } else {
                current.push(lines[i]);
            }
        }
        if(current.length > 0){
            units.push(current);
        }
        return units;
    }

    /**
     * Parses one raw unit into a course object.
     * Line 0 is the name, line 1 the schedule (weekday, periods, week range),
     * line 2 the teacher and line 3 the location.
     */
    function parseCourseUnit(unit){
        var schedule = unit[1];
        var course = {};

        course.name = unit[0];

        // Weekday: "周" followed by a Chinese numeral 一..七.
        var dayMatch = /周([一二三四五六七])/.exec(schedule);
        course.week_index = dayMatch ? chineseToEnglishNumber(dayMatch[1]) : undefined;

        // Periods: "第1,2节" -> [1, 2]. Left unset when the schedule has no period list.
        var periodMatch = /第(\d+(?:,\d+)*)节/.exec(schedule);
        if(periodMatch){
            course.course_index = periodMatch[1].split(",").map(function(period){
                return parseInt(period, 10);
            });
        } else {
            console.log('[generateCourses] error.message:', 'no period list found in "' + schedule + '"');
        }

        // Week range: "第1-16周" -> start 1, end 16; "第5周" -> start 5, end undefined.
        var weekMatch = /第(\d+)(?:-(\d+))?周/.exec(schedule);
        if(weekMatch){
            course.week_start = parseInt(weekMatch[1], 10);
            course.week_end = weekMatch[2] !== undefined ? parseInt(weekMatch[2], 10) : undefined;
        } else {
            // Do not abort the whole scrape because one cell has an unexpected format.
            console.log('[generateCourses] error.message:', 'no week range found in "' + schedule + '"');
        }

        course.teacher = unit[2];
        course.location = unit[3];
        return course;
    }

    /**
     * Appends a course object to `courses` for every raw unit that has exactly
     * 4 or 6 lines; units of any other length are ignored.
     *
     * @param {string[][]} CourseRawUnits Raw units from one cell.
     * @param {Object[]} courses Accumulator; mutated and returned.
     * @returns {Object[]} The same `courses` array.
     */
    var generateCourses = function(CourseRawUnits, courses){
        for(var i = 0; i < CourseRawUnits.length; i++){
            var size = CourseRawUnits[i].length;
            if(size === 4 || size === 6){
                courses.push(parseCourseUnit(CourseRawUnits[i]));
            }
        }
        return courses;
    };

    /**
     * Parses every sufficiently long cell into course objects.
     *
     * @param {ArrayLike<{innerText: string}>} $tds Timetable cells.
     * @returns {Object[]} Parsed courses in cell order.
     * @throws {Error} When $tds is null or undefined.
     */
    var tdsItemsToCourses = function($tds){
        if($tds == undefined)
            throw new Error("$tds is not defined!");

        var courses = [];
        for(var j = 0; j < $tds.length; j++){
            // Skip empty cells and row/column headers, whose text is short.
            if($tds[j].innerText.length > MIN_CELL_TEXT_LENGTH){
                generateCourses(generateCelltoCourseRawUnits($tds[j]), courses);
            }
        }
        return courses;
    };

    /**
     * Queries the page with the selector given to the constructor and returns
     * the parsed courses.
     *
     * @returns {Object[]} Objects with name, week_index, course_index,
     *                     week_start, week_end, teacher and location.
     */
    this.load = function(){
        return tdsItemsToCourses(document.querySelectorAll(seletorForTds));
    };
}

/**
 * Student reads a student's profile labels and timetable from the current page.
 *
 * @constructor
 * @param {Object} seletorOption CSS selectors keyed by sno, sname, college,
 *                               profession, clazz and courseTable.
 */
var Student = function(seletorOption){
    var that = this;

    // Text of the first element matching `selector`, trimmed, with the label removed.
    function readField(selector, labelPattern){
        var node = document.querySelectorAll(selector)[0];
        return node.innerText.trim().replace(labelPattern, "");
    }

    /**
     * Collects the profile fields and the parsed courses.
     *
     * @returns {Object} Object with sno, sname, college, profession, clazz and courses.
     */
    that.load = function(){
        var profile = {};
        profile.sno = readField(seletorOption.sno, /学号:/gi);
        profile.sname = readField(seletorOption.sname, /姓名:/gi);
        profile.college = readField(seletorOption.college, /学院:/gi);
        profile.profession = readField(seletorOption.profession, /专业:/gi);
        profile.clazz = readField(seletorOption.clazz, /行政班:/gi);
        // The timetable is delegated to Course, built from the courseTable selector.
        var timetable = new Course(seletorOption.courseTable);
        profile.courses = timetable.load();
        return profile;
    };

    /**
     * @returns {string} JSON text of a fresh load() result.
     */
    that.stringify = function(){
        var snapshot = that.load();
        return JSON.stringify(snapshot);
    };
}


// Demo: build the selector map and create a Student from it.
// NOTE(review): the "#Label5".."#Label9" and "#Table1" ids presumably match the
// target timetable page; verify against the actual page markup.
var seletorOption = {};
seletorOption.sno = "#Label5";
seletorOption.sname = "#Label6";
seletorOption.college = "#Label7";
seletorOption.profession = "#Label8";
seletorOption.clazz = "#Label9";
seletorOption.courseTable = "#Table1 td";

var student = new Student(seletorOption);

原文地址:https://www.cnblogs.com/johnnyzen/p/7980969.html