文本分割-隐马尔可夫模型

/**
 文字分词 隐马尔可夫模型
 共4种状态S B M E
 AMap 为状态转移概率矩阵 4*4,表示从{S B M E}到{S B M E}的概率
 BMap 为当前字属于某种状态{S B M E}的概率
 * */
//有限状态
const S=['S','B','Mn','E']

const mekflink={
  empty:{S:1/16},
  AMap:{
    'S-S':1000,
    'S-B':1000,
    'E-B':1000,
    'E-S':1000,
  },
  BMap:{},
  AMapGl:{},
  BMapGl:{},
  add(text){
    if(text.length>1){
      for(let i=0;i<text.length;i++){
        if(i===0){
          this.push(text[i],'B')
        }else if(i===text.length-1){
          this.push(text[i],'E')
          if(text.length>2){
            this.pushState('M'+(i-1),'E')
          }else{
            this.pushState('B','E')
          }
        }else if(i===1){
          this.push(text[i],'M'+i)
          this.pushState('B','M'+i)
        }else{
          this.push(text[i],'M'+i)
          this.pushState('M'+(i-1),'M'+i)
        }
      }
    }else{
      this.push(text,'S')
    }
  },
  pushState(t0N,t1N){
    const AMap=this.AMap;
    const key=t0N+'-'+t1N
    if(!AMap[key]){
      AMap[key]=0
    }
    AMap[key]++;
  },
  push(key,state){
    const BMap=this.BMap
    if(!BMap[key]){
      BMap[key]={}
    }
    if(!BMap[key][state]){
      BMap[key][state]=0
    }
    BMap[key][state]++;
  },
  //生成模型
  makeGl() {
    const AMap=this.AMap;
    const BMap=this.BMap;
    const AMapGl=this.AMapGl;
    const BMapGl=this.BMapGl;
    //统计A
    const AMapT={}
    for(let key in AMap){
      const [t0,t1]=key.split('-')
      if(!AMapT[t0]){
        AMapT[t0]=0;
      }
      AMapT[t0]=AMapT[t0]+AMap[key];
    }
    for(let key in AMap){
      const [t0,t1]=key.split('-')
      AMapGl[key]=this.chu(AMap[key],AMapT[t0])
    }
    //统计B
    for(let key in BMap){
      let t=0;
      for(let k in BMap[key]){
        t=t+BMap[key][k]
      }
      const obj=Object.create(this.empty)
      for(let k in BMap[key]){
        obj[k]=this.chu(BMap[key][k],t)
      }
      BMapGl[key]=obj;
    }
    return {
      AMapGl,BMapGl
    }
  },
  chu(p1,p2){
    return p1/p2;
  },
  exex(p1,p2){
    return p1*p2;
  },
  isBig(p1,p2){
    return p1>p2
  },
  getT1Arr(t0Obj,BObj){
    const AMapGl=this.AMapGl;
    const t1Obj={};
    let glAll=0;
    for(let t0 in t0Obj){
      const link=t0Obj[t0]
      for(let k in AMapGl){
        const arr=k.split('-')
        if(t0===arr[0]&&BObj[arr[1]]){
          const gl=this.exex(link.gl,this.exex(AMapGl[k],BObj[arr[1]]))
          if(gl>0){
            if(!t1Obj[arr[1]]){
              glAll=glAll+gl;
              t1Obj[arr[1]]={
                gl:gl,
                data:link.data+'-'+arr[1]
              }
            }else if(this.isBig(gl,t1Obj[arr[1]].gl)){
              glAll=glAll+gl-t1Obj[arr[1]].gl;
              t1Obj[arr[1]]={
                gl:gl,
                data:link.data+'-'+arr[1]
              }
            }
          }

        }
      }
    }
    for(let k in t1Obj){
      const gl=parseInt(t1Obj[k].gl/glAll*100);
      t1Obj[k].gl=gl
      if(gl===0){
        delete t1Obj[k]
      }
    }
    return t1Obj;
  },
  solve(text){
    const AMapGl=this.AMapGl;
    const BMapGl=this.BMapGl;
    console.log('状态转移概率',AMapGl)
    console.log('特征统计概率',BMapGl)
    //马尔可夫链条
    //获取当前状态可能的下一个状态
    let t0Obj={
      'S':{
        gl:1,data:'S'
      },
      'B':{
        gl:1,data:'B'
      }
    }

    for(let i=1;i<text.length;i++){
      t0Obj=this.getT1Arr(t0Obj,BMapGl[text[i]]||Object.create(this.empty))
    }
    const cache={}
    for(let k in t0Obj){
      const dstr=t0Obj[k].data.replace(/[d-]/g,'')
      const data=[]
      let start,end;
      for(let i=0;i<text.length;i++){
        if(dstr[i]==='B'){
          start=i;
        }else if(dstr[i]==='E'){
          end=i;
          data.push(start,end);
        }
      }
      const key=data.join(',')
      if(typeof cache[key]!=='undefined'){
        cache[key].gl=cache[key].gl+t0Obj[k].gl;
      }else{
        cache[key]={
          gl:t0Obj[k].gl,
          data:data
        };
      }
    }
    const lArr=[]
    for(let k in cache){
      lArr.push(cache[k])
    }
    if(lArr.length>0){
      lArr.sort(function (p1,p2) {
        return p2.gl-p1.gl;
      })
    }
    return lArr;
  }
}

module.exports=mekflink;
//demo
const arrH=['1ec66668876666666']
arrH.forEach(function (text) {
  mekflink.add(text)
})
mekflink.makeGl()
const text='1ec666688766666661ec66668876666666211ec6666887666666621';
const arr=mekflink.solve(text);
const aArr=[]
arr.forEach(function (item) {
  const tArr=[]
  for(let i=0;i<item.data.length;i=i+2){
    const t=text.substring(item.data[i],item.data[i+1]+1)
    tArr.push(arrH.indexOf(t))
  }
  aArr.push(tArr)
})
console.log(aArr)

  

原文地址:https://www.cnblogs.com/caoke/p/13686121.html