UI 爬虫工具(未完成)

抓取页面中的一块 UI,将属于这块 UI 的 HTML 和 CSS 分离出来;HTML 需要配合浏览器机器人抓取。

const TinyCss=require('./utils/TinyCss')
var getCssText = require("./utils/getCssText");
var getText = require("./utils/getText");
var setText = require("./utils/setText");
//UI crawler: entry point of the demo script.
/**
 * Reads ./src/test.html and ./src/test.css, keeps only the CSS that TinyCss
 * reports as used by the `.note-list-wrapper` block, and writes the result
 * to ./src/testmin.css.
 *
 * @returns {Promise<void>} settles once the output has been handed to setText
 */
async function init() {
    //The two reads do not depend on each other, so start them together.
    const [htmlText,cssText]=await Promise.all([
        getText('./src/test.html'),
        getText('./src/test.css'),
    ]);
    //Alternative CSS source kept from the original draft:
    // const cssText=await getCssText('https://cloud.baidu.com/product/bcd/search.html?keyword=%E5%85%AB%E6%88%92%E7%AE%97%E5%91%BD','html')
    const app=new TinyCss([htmlText],cssText);
    const css=app.getTinyAst('.note-list-wrapper').toString();
    //Awaited so that a write failure rejects init() instead of being lost.
    //NOTE(review): assumes setText returns a promise like getText does; awaiting a plain value is harmless.
    await setText('./src/testmin.css',css);
}
//Never leave the promise floating: report the failure and mark the process as failed.
init().catch((err)=>{
    console.error(err);
    process.exitCode=1;
});

TinyCss.js

//TinyCss.js
const Api=require('./Api');
//解析成语法树
const compiler = require('vue-template-compiler');
const postcss  = require('postcss');
const querySelectorList=require('./querySelectorList')

//Builds one CSS syntax tree and one HTML syntax tree per page, then analyses which CSS is actually used.
class TinyCss{
    /**
     * @param {string[]} htmlTextArr HTML source of every page to analyse
     * @param {string} cssText CSS source, parsed once with postcss
     */
    constructor(htmlTextArr,cssText){

        //HTML sources; each one is compiled into its own syntax tree on demand
        this.htmlTextArr=htmlTextArr;

        //The single CSS syntax tree
        this.cssAst=postcss.parse(cssText);
        //Every style rule except the ones nested inside an @keyframes at-rule
        this.cssList=Api.depthSearch(this.cssAst,'nodes').filter(function (node) {
            return node.type==='rule'&&!/keyframes/.test(node.parent.name);
        })

        //One selector matcher per page, built lazily by getSelectorLists()
        this.selectorLists=null;

        //Cached outputs
        this.bigMap=null;
        this.map=null;
        this.uiMap=null;            //most recent getUiMap() result
        this.uiMapCache=new Map();  //selector -> getUiMap() result
        this.data=null;
        this.emptyCss=null;
        this.emptyKeyFrames=null;


    }

    /**
     * Removes the first occurrence of `item` from `arr`, in place.
     * Does nothing when `item` is not in the array.
     */
    removeObj(item,arr){
        const index=arr.indexOf(item);
        if(index>-1){
            arr.splice(index,1);
        }
    }

    /**
     * Compiles every HTML source once and wraps it in a querySelectorList
     * bound to this instance's rule list. The result is reused by all the
     * map builders so the HTML is not recompiled for each of them.
     * @returns {Array} one querySelectorList per entry of htmlTextArr
     */
    getSelectorLists(){
        if(!this.selectorLists){
            this.selectorLists=this.htmlTextArr.map((htmlText)=>{
                const htmlAst=compiler.compile(htmlText).ast;
                return new querySelectorList(htmlAst,this.cssList);
            });
        }
        return this.selectorLists;
    }

    /**
     * Full hit matrices.
     * @returns {number[][][]} bigMap[page][rule][element] is 1 when the rule hits the element
     */
    getBigMap(){
        if(this.bigMap){
            return this.bigMap;
        }
        const map=this.getSelectorLists().map((ccRect)=>ccRect.analysis());
        this.bigMap=map;
        return map;
    }

    /**
     * Condensed hit counts.
     * @returns {number[][]} map[rule][page] is the number of elements of that page hit by the rule
     */
    getMap(){
        if(this.map){
            return this.map;
        }
        const map=[];
        this.getSelectorLists().forEach((ccRect)=>{
            ccRect.analysis().forEach((row,j)=>{
                if(!map[j])map[j]=[];
                //Initial value 0: a page without any element column yields an empty row
                map[j].push(row.reduce((x,y)=>x+y,0));
            });
        });
        this.map=map;
        return map;
    }

    /**
     * Hit counts restricted to one UI block.
     * The result is cached per selector, so asking for a second block does
     * not return the numbers of the first one.
     * @param {string} selector selector of the UI block; passed to querySelectorAndChild
     * @returns {number[][]} map[rule][page] is the number of elements that are both hit by the rule and marked by querySelectorAndChild(selector)
     */
    getUiMap(selector){
        if(this.uiMapCache.has(selector)){
            return this.uiMapCache.get(selector);
        }
        const map=[];
        this.getSelectorLists().forEach((ccRect)=>{
            const uiArr=ccRect.querySelectorAndChild(selector);
            ccRect.analysis().forEach((row,j)=>{
                let count=0;
                for(let k=0;k<row.length;k++){
                    if(row[k]===1&&uiArr[k]===1){
                        count++;
                    }
                }
                if(!map[j])map[j]=[];
                map[j].push(count);
            });
        });
        this.uiMapCache.set(selector,map);
        this.uiMap=map;
        return map;
    }

    /**
     * Removes unused rules from the CSS syntax tree.
     * The tree is mutated, so the result of the first call is cached and
     * returned by every later call, whatever selector is passed then.
     * @param {string} [selector] when given, only hits inside that UI block count as usage
     * @returns {string[]} selectors of the removed rules
     */
    getEmptyCss(selector){
        if(this.emptyCss){
            return this.emptyCss;
        }
        const cssList=this.cssList;
        const data=[];
        const map=selector?this.getUiMap(selector):this.getMap();
        for(let i=0;i<map.length;i++){
            //A count above 0 on any page means the rule is used; all zeros means it is dead
            if(map[i].every((n)=>n===0)){
                //Detach the rule from the syntax tree
                this.removeObj(cssList[i],cssList[i].parent.nodes);
                data.push(cssList[i].selector);
            }
        }

        this.emptyCss=data;
        return data;
    }

    /**
     * Removes @keyframes at-rules that no remaining animation declaration refers to.
     * @returns {string[]} the removed at-rules, formatted as '@<name> <params>'
     */
    getEmptyKeyFrames(){
        if(this.emptyKeyFrames){
            return this.emptyKeyFrames;
        }
        const keyframesList=Api.depthSearch(this.cssAst,'nodes').filter(function (node) {
            return node.type==='atrule'&&/keyframes/.test(node.name);
        })
        const vals=Api.depthSearch(this.cssAst,'nodes').filter(function (node) {
            return node.type==='decl'&&/animation/.test(node.prop);
        })
        //Collect every word used in an animation value. Splitting on commas as well as
        //whitespace keeps names from comma separated lists such as "spin 1s,fade 2s".
        const usedWords=new Set();
        vals.forEach((decl)=>{
            String(decl.value).split(/[\s,]+/).forEach((word)=>usedWords.add(word));
        });
        const delArr=keyframesList.filter((node)=>!usedWords.has(node.params));
        const emptyKeyFrames=[];
        delArr.forEach( (node) =>{
            //Detach the at-rule from the syntax tree
            this.removeObj(node,node.parent.nodes);
            emptyKeyFrames.push('@'+node.name+' '+node.params)
        })
        this.emptyKeyFrames=emptyKeyFrames;
        return emptyKeyFrames;
    }

    /**
     * Removes every comment node from the CSS syntax tree.
     */
    removeComment(){
        const commentArr=Api.depthSearch(this.cssAst,'nodes').filter(function (node) {
            return node.type==='comment';
        })
        commentArr.forEach((node)=>{
            this.removeObj(node,node.parent.nodes);
        })
    }

    /**
     * Runs the whole clean-up (unused rules, unused keyframes, comments).
     * @param {string} [selector] optional UI block selector forwarded to getEmptyCss
     * @returns {Object} the (mutated) postcss syntax tree
     */
    getTinyAst(selector){
        this.getEmptyCss(selector);
        this.getEmptyKeyFrames();
        this.removeComment();

        return this.cssAst;
    }
}
module.exports=TinyCss;
querySelectorList.js
//querySelectorList.js
const Api=require('./Api');
//Selector hit testing

/*CSS rule matrix, e.g. 3*6
rows correspond to selectors ['.id','.class1','.class2']
columns correspond to HTML nodes ['body','body div','body div div','body div p','body div span','body div span a']
[
    [0,0,0,0,1,0],
    [0,0,0,0,1,0],
    [0,0,0,0,1,0]
]
*/
class querySelectorList{

    /**
     * @param {Object} htmlAst root node of the HTML syntax tree
     * @param {Array} cssList rule nodes; only their `selector` text is read here
     */
    constructor(htmlAst,cssList){

        //History of selector lookups: selector text -> matched nodes.
        //Created without a prototype so a selector such as "constructor" cannot hit an inherited key.
        this.selectotCache=Object.create(null);

        //HTML syntax tree and the flat node list that defines the bitmap columns (nodes with type 1 only)
        this.htmlAst=htmlAst;
        this.htmlList=Api.depthSearch(this.htmlAst).filter(function (node) {
            return node.type===1;
        })
        //node -> column index, so marking a hit does not need a linear indexOf
        this.htmlIndex=new Map();
        this.htmlList.forEach((node,i)=>{
            this.htmlIndex.set(node,i);
        });

        //CSS rules that define the bitmap rows
        this.cssList=cssList;
    }

    /**
     * Builds the hit matrix.
     * @returns {number[][]} one 0/1 row per rule of cssList, one column per entry of htmlList
     */
    analysis(){
        return this.cssList.map((rule)=>this.querySelector(rule.selector));
    }

    /**
     * Marks the elements matched by `selector` together with all their descendants.
     * NOTE(review): relies on Api.depthSearch(node) returning the node itself followed by its
     * descendants in the same order as htmlList, so a subtree occupies consecutive columns — confirm in Api.js.
     * @param {string} selector
     * @returns {number[]} 0/1 flag per entry of htmlList
     */
    querySelectorAndChild(selector){
        const arr=this.querySelector(selector).slice();
        for(let i=0;i<arr.length;i++){
            if(arr[i]===1){
                //Size of the subtree rooted at the matched element (column i, not column arr[i])
                const cLen=Api.depthSearch(this.htmlList[i]).filter(function (node) {
                    return node.type===1;
                }).length;
                for(let k=1;k<cLen&&i+1<arr.length;k++){
                    i++;
                    arr[i]=1;
                }
            }
        }
        return arr;
    }

    /**
     * Matches a selector that may be a comma separated list; an element is
     * flagged when at least one part of the list matches it.
     * @param {string} selector
     * @returns {number[]} 0/1 flag per entry of htmlList
     */
    querySelector(selector){
        if(!/,/.test(selector)){
            return this.queryOneSelector(selector);
        }
        const data=new Array(this.htmlList.length).fill(0);
        selector.split(',').forEach((part)=>{
            this.queryOneSelector(part).forEach((hit,k)=>{
                //Only ever raise a flag: a later part must not clear the hits of an earlier one
                if(hit===1){
                    data[k]=1;
                }
            });
        });
        return data;
    }

    /**
     * Matches one selector without commas.
     * @param {string} selector
     * @returns {number[]} 0/1 flag per entry of htmlList
     */
    queryOneSelector(selector){
        //Trim and collapse whitespace so line breaks inside a selector act as a single space
        const text=selector.trim().replace(/\s+/g,' ');

        //Split into [compound, combinator, compound, combinator, ...].
        //A run of " >~+" is a combinator unless a digit follows (e.g. "2n+1") or it is followed by spaces and ':'.
        const selectorArr=[]
        text.replace(/(.+?)([ >~+]+(?!\d)(?! *:)|$)/ig,function (m,p1,p2) {
            selectorArr.push(p1,p2);
        })
        this.selectorArr=selectorArr;

        //Resolve the selector from left to right, caching every prefix
        let preSelector='';
        for(let i=0;i<selectorArr.length;i=i+2){
            const exec=selectorArr[i-1]||'';
            const curSelector=selectorArr[i];

            this.setSelectotCache(preSelector,exec,curSelector);
            preSelector=preSelector+exec+curSelector
        }

        const arr=new Array(this.htmlList.length).fill(0);
        //The last prefix is the key of the final result; an empty selector has no entry at all
        const hits=this.selectotCache[preSelector]||[];
        hits.forEach( (node) =>{
            const index=this.htmlIndex.get(node);
            if(index!==undefined){
                arr[index]=1;
            }
        })
        return arr;
    }

    /**
     * Resolves `preSelector + exec + curSelector` against the HTML tree and stores the
     * matched nodes in selectotCache under that key.
     * @param {string} preSelector already resolved prefix ('' for the first compound)
     * @param {string} exec combinator text between the prefix and curSelector
     * @param {string} curSelector compound selector to match next
     */
    setSelectotCache(preSelector,exec,curSelector){

        const nextSelector=preSelector+exec+curSelector;
        //Already cached
        if(this.selectotCache[nextSelector]){return;}
        if(!preSelector&&!exec){
            this.selectotCache[curSelector]=this.breadthHit(curSelector,this.htmlAst)
            return;
        }
        const parents=this.selectotCache[preSelector]||[];

        let search=null;
        if(/^ +$/.test(exec)){
            //Descendant: the context node itself is never its own descendant
            search=(node)=>this.breadthHit(curSelector,node).filter((hit)=>hit!==node);
        }else if(/^ *> *$/.test(exec)){
            search=(node)=>this.childHit(curSelector,node);
        }else if(/^ *\+ *$/.test(exec)){
            search=(node)=>this.sublingHit(curSelector,node);
        }else if(/^ *~ *$/.test(exec)){
            search=(node)=>this.sublingsHit(curSelector,node);
        }else{
            console.log('exec异常:'+exec)
        }

        //A Set keeps each node once even when several context nodes lead to it
        const found=new Set();
        if(search){
            parents.forEach((node)=>{
                search(node).forEach((hit)=>found.add(hit));
            })
        }
        this.selectotCache[nextSelector]=Array.from(found);

    }
    //css_rule:element+element
    sublingHit(tag,astNode){
        if(!astNode.parent){
            //NOTE(review): a node without a parent is tested against the selector itself; kept from the original — confirm this is intended
            return [astNode].filter( (node) =>{
                return this.hitNode(tag,node);
            })
        }
        return Api.nextSublingSearch(astNode,astNode.parent).filter( (node) =>{
            //Non-element siblings are skipped; they have no column in the bitmap
            return node.type===1&&this.hitNode(tag,node);
        })
    }
    //css_rule:element~element
    sublingsHit(tag,astNode){
        if(!astNode.parent){
            //No parent means no siblings to search
            return [];
        }
        return Api.nextSublingsSearch(astNode,astNode.parent).filter( (node) =>{
            return node.type===1&&this.hitNode(tag,node);
        })
    }
    //css_rule:element element
    breadthHit(tag,astNode){
        return Api.breadthSearch(astNode).filter( (node)=> {
            return node.type===1&&this.hitNode(tag,node);
        })
    }
    //css_rule:element>element
    childHit(tag,astNode){
        return Api.childSearch(astNode).filter( (node)=> {
            return node.type===1&&this.hitNode(tag,node);
        })
    }

    /**
     * Tests one compound selector against one node.
     * Checked: tag name, every class, id, and the presence of every attribute named in [...].
     * Not checked: attribute values and anything from the first pseudo-class/element on,
     * so those parts always count as a hit.
     * @param {string} selector compound selector without combinators
     * @param {Object} astNode node to test; reads `tag` and `attrsMap`
     * @returns {boolean} true when every checked part matches
     */
    hitNode(selector,astNode) {

        if(selector==='*'){
            return true;
        }
        if(/:root/.test(selector)){
            return astNode.tag==='html';
        }

        const attrsMap=astNode.attrsMap||{};
        const checks=[];

        //Cut the selector at the first ':' outside of [...]; the pseudo part is not evaluated,
        //so class or attribute text inside e.g. :not(...) must not be read as a requirement
        let depth=0;
        let pseudoAt=-1;
        for(let i=0;i<selector.length;i++){
            const ch=selector[i];
            if(ch==='['){
                depth++;
            }else if(ch===']'){
                depth--;
            }else if(ch===':'&&depth<=0){
                pseudoAt=i;
                break;
            }
        }
        const base=pseudoAt===-1?selector:selector.slice(0,pseudoAt);

        //Attributes: only the presence of the attribute is checked
        const attrRe=/\[\s*([\w-]+)[^\]]*\]/g;
        let attrMatch=attrRe.exec(base);
        while(attrMatch!==null){
            checks.push(Object.prototype.hasOwnProperty.call(attrsMap,attrMatch[1]));
            attrMatch=attrRe.exec(base);
        }
        //Drop the [...] parts so dots or hashes inside attribute values are not read as class or id
        const simple=base.replace(/\[[^\]]*\]/g,'');

        //Tag (digits allowed after the first letter, e.g. h1)
        const tagMatch=/^[a-z][\w-]*/i.exec(simple);
        if(tagMatch){
            checks.push(astNode.tag===tagMatch[0]);
        }
        //Classes: every class of the compound selector has to be present
        const classNames=simple.match(/\.[\w-]+/g)||[];
        if(classNames.length>0){
            const owned=String(attrsMap.class||'').split(/\s+/);
            classNames.forEach((name)=>{
                checks.push(owned.indexOf(name.slice(1))>-1);
            });
        }
        //Id
        const idMatch=/#([\w-]+)/.exec(simple);
        if(idMatch){
            checks.push(attrsMap.id===idMatch[1]);
        }
        //Pseudo-class / pseudo-element: always counted as a hit
        if(pseudoAt!==-1){
            checks.push(true);
        }
        if(checks.length===0){
            //Nothing recognisable in the selector: report it; every() on the empty list still yields true
            console.log(selector,this.selectorArr,'css 解析异常')
        }
        return checks.every((item)=>item);
    }
}
module.exports=querySelectorList;
原文地址:https://www.cnblogs.com/caoke/p/11269720.html