纯php分词封装的类

  分享一个用纯 PHP 实现的分词封装类。

<?php
/*
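 * This is not a finished plugin; it is only a low-level wrapper class that can be used wherever word segmentation or synonym replacement is needed.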
 */

/**
 * Character-level trie used for dictionary matching, keyword segmentation,
 * text filtering and synonym replacement.
 *
 * Every node is an array keyed by a single UTF-8 character. The reserved
 * multi-character key 'end' marks that the path from the root to that node
 * is a complete entry: in the word trie its value is an occurrence counter,
 * in the synonym trie it is the replacement text.
 */
class trie
{
    /** @var array word trie */
    protected $dict;
    /** @var string|null path of the word dictionary file */
    protected $dictFile;
    /** @var string common specification symbols (not used inside this class) */
    protected $specSymbol;
    /** @var array synonym trie */
    protected $ty_dict;

    /**
     * @param string|null $dictFile Optional path of the word dictionary file, one
     *                              entry per line. split_kw() overwrites it.
     */
    public function __construct($dictFile = null)
    {
        $this->dict = [];
        $this->ty_dict = [];
        $this->dictFile = $dictFile;
        $this->specSymbol = "*|M|m|φ|Φ|st|ST";
    }

    /**
     * Load the word trie for the current dictionary file.
     *
     * When $cache is true and the global $dc object is available, the trie is
     * read from / written to that cache. Otherwise it is rebuilt from the file.
     *
     * @param bool $cache whether to use the global $dc cache
     *
     * @throws InvalidArgumentException when the dictionary file does not exist
     * @throws RuntimeException         when the dictionary file cannot be opened
     */
    public function loadData($cache = true)
    {
        global $dc;

        // Degrade to "no cache" instead of a fatal error when $dc is not set up.
        $useCache = $cache && is_object($dc);
        $cacheKey = __CLASS__ . "_" . md5((string) $this->dictFile);

        if ($useCache) {
            // Read into a temporary first so a cache miss never leaves a
            // non-array value in $this->dict.
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->dict = $cached;
                return;
            }
        }

        // Start from an empty trie so entries of a previously loaded file
        // (or of another trie) do not leak into this one.
        $this->dict = [];
        $this->loadDataFromFile();

        if ($useCache) {
            $dc->set($cacheKey, $this->dict, null, 100000);
        }
    }

    /**
     * Read the dictionary file line by line and add every entry to the word
     * trie. Entries are added to whatever the trie already contains.
     *
     * @throws InvalidArgumentException when the dictionary file does not exist
     * @throws RuntimeException         when the dictionary file cannot be opened
     */
    public function loadDataFromFile()
    {
        $this->readDictLines($this->dictFile, 'addWords');
    }

    /**
     * Stream a dictionary file and pass every non-blank, trimmed line to the
     * given method of this object.
     *
     * @param string $file   path of the file to read
     * @param string $method name of the method that receives each line
     *
     * @throws InvalidArgumentException when the file does not exist
     * @throws RuntimeException         when the file cannot be opened
     */
    private function readDictLines($file, $method)
    {
        if (!is_string($file) || $file === '' || !file_exists($file)) {
            throw new InvalidArgumentException("字典文件不存在");
        }

        // Warning suppressed on purpose: failure is reported as an exception.
        $handle = @fopen($file, "r");
        if (!is_resource($handle)) {
            throw new RuntimeException("字典文件无法打开");
        }

        $firstLine = true;
        while (($line = fgets($handle)) !== false) {
            if ($firstLine) {
                $firstLine = false;
                // A UTF-8 byte order mark is not part of the first entry.
                if (strncmp($line, "\xEF\xBB\xBF", 3) === 0) {
                    $line = substr($line, 3);
                }
            }
            $line = trim($line);
            // Compare against '' (not empty()) so an entry such as "0" is kept
            // and blank lines never reach the insert methods.
            if ($line === '') {
                continue;
            }
            $this->{$method}($line);
        }

        fclose($handle);
    }

    /**
     * Split text into an array of UTF-8 characters.
     *
     * @param string $str
     *
     * @return string[] empty when the text is empty or not valid UTF-8
     */
    protected function splitStr($str)
    {
        $chars = preg_split("//u", (string) $str, -1, PREG_SPLIT_NO_EMPTY);
        // preg_split() returns false on invalid UTF-8; callers iterate the result.
        return $chars === false ? [] : $chars;
    }

    /**
     * Insert a character path into a trie and set its end marker.
     *
     * @param array       $root  trie to modify
     * @param string[]    $chars characters of the entry
     * @param string|null $value value stored under 'end'; null increments a counter
     */
    private function insertPath(array &$root, array $chars, $value = null)
    {
        $node = &$root;
        foreach ($chars as $char) {
            if (!isset($node[$char])) {
                $node[$char] = [];
            }
            $node = &$node[$char];
        }

        if ($value === null) {
            $node['end'] = isset($node['end']) ? $node['end'] + 1 : 1;
        } else {
            $node['end'] = $value;
        }
        unset($node);
    }

    /**
     * Add one entry to the word trie.
     *
     * @param string $words the entry to add
     */
    protected function addWords($words)
    {
        $chars = $this->splitStr($words);
        if (empty($chars)) {
            // An empty entry would mark the root itself as a complete word.
            return;
        }
        if (!is_array($this->dict)) {
            $this->dict = [];
        }
        $this->insertPath($this->dict, $chars);
    }

    /**
     * Replace every dictionary word found in the text.
     *
     * Matching starts at each character and follows the trie as far as
     * possible; the longest complete word on that path is replaced. Up to
     * $skipDistance non-matching characters may be skipped inside a word;
     * skipped characters themselves are left untouched.
     *
     * @param string $str          original text
     * @param string $replace      replacement for each matched character
     * @param int    $skipDistance number of characters that may be skipped
     *
     * @return string the filtered text
     */
    public function filter($str, $replace = '*', $skipDistance = 0)
    {
        $maxDistance = max($skipDistance, 0) + 1;
        $root = is_array($this->dict) ? $this->dict : [];
        $chars = $this->splitStr($str);
        $length = count($chars);

        for ($i = 0; $i < $length; $i++) {
            if (!isset($root[$chars[$i]])) {
                continue;
            }

            $node = $root[$chars[$i]];
            $matched = [$i];
            // Number of matched positions covered by the longest complete word.
            $endCount = isset($node['end']) ? 1 : 0;
            $dist = 0;

            for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                if (!isset($node[$chars[$j]])) {
                    $dist++;
                    continue;
                }

                $matched[] = $j;
                $node = $node[$chars[$j]];
                // Remember the word end here: the walk may continue into a
                // longer prefix that never completes (e.g. "abc" with the
                // dictionary {"ab", "abcd"}).
                if (isset($node['end'])) {
                    $endCount = count($matched);
                }
            }

            if ($endCount > 0) {
                for ($m = 0; $m < $endCount; $m++) {
                    $chars[$matched[$m]] = $replace;
                }
                // Resume scanning after the last replaced character.
                $i = $matched[$endCount - 1];
            }
        }

        return implode('', $chars);
    }

    /**
     * Check whether the whole input is an entry of the word trie.
     *
     * @param string|string[] $strArr text or array of characters
     *
     * @return bool|mixed the value stored under 'end', or false when not found
     */
    public function isMatch($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        $node = $this->dict;
        foreach ($chars as $char) {
            if (!is_array($node) || !isset($node[$char])) {
                return false;
            }
            $node = $node[$char];
        }
        return (is_array($node) && isset($node['end'])) ? $node['end'] : false;
    }

    /**
     * Check whether a word exists in the currently loaded word trie.
     *
     * @param string|string[] $word
     * @param string          $filename accepted for compatibility; not used
     *
     * @return bool|mixed same result as isMatch()
     */
    public function isType($word,$filename='word'){
        return $this->isMatch($word);
    }


    /**
     * Segment a keyword string and return the words that were recognised.
     *
     * Steps: split on whitespace and half-/full-width commas; keep tokens that
     * are dictionary entries; search the remaining tokens for dictionary words;
     * finally, only when $ty_file is given, append what is left after synonym
     * replacement. Without $ty_file the unmatched remainder is not returned.
     *
     * @param string $kw       keyword string
     * @param string $filename word dictionary file name (without ".txt")
     * @param string $ty_file  synonym dictionary file name (without ".txt")
     *
     * @return string[]
     */
    public function split_kw($kw,$filename='word',$ty_file=''){
        $this->dictFile = DT_ROOT.'/api/dtapicom/trie/'.$filename.'.txt';
        $this->loadData();

        // Step 1: split on whitespace, "," and the full-width ",". The "u"
        // flag makes the full-width comma one character instead of three bytes.
        $data = preg_split('/[\s,,]+/u', (string) $kw, -1, PREG_SPLIT_NO_EMPTY);
        if ($data === false) {
            // The pattern could not be applied (e.g. invalid UTF-8).
            $data = ((string) $kw === '') ? [] : [$kw];
        }

        $word = []; // words recognised so far

        // Step 2: tokens that are dictionary entries as a whole.
        foreach ($data as $k => $v) {
            if ($this->isMatch($v)) {
                $word[] = $v;
                unset($data[$k]);
            }
        }

        // Step 3: look for dictionary words inside the remaining tokens.
        foreach ($data as $k => $v) {
            $found = $this->split_word($v);
            if (empty($found)) {
                continue;
            }
            foreach ($found as $str) {
                $word[] = $str;
                $v = str_replace($str, '', $v);
            }
            if (trim($v) === '') {
                // Nothing is left of this token.
                unset($data[$k]);
            } else {
                $data[$k] = $v;
            }
        }

        // Step 4: synonym replacement for whatever is left.
        if (!empty($data) && (string) $ty_file !== '') {
            // Load once here rather than once per remaining token.
            $this->load_tongyi($ty_file);
            foreach ($data as $v) {
                $word[] = $this->lookupSynonym($v);
            }
        }
        return $word;
    }


    /**
     * Find every word of the word trie inside the input.
     *
     * @param string|string[] $strArr text or array of characters
     *
     * @return string[] words in order of start position, shorter matches first
     */
    public function split_word($strArr){
        $chars = is_array($strArr) ? array_values($strArr) : $this->splitStr($strArr);
        return $this->collectWords($chars, $this->dict);
    }

    /**
     * Collect, for every start position, all entries of $dict that begin there.
     *
     * @param string[] $chars characters to scan, indexed from 0
     * @param mixed    $dict  trie to search
     *
     * @return string[]
     */
    private function collectWords(array $chars, $dict)
    {
        $words = [];
        $len = count($chars);
        for ($start = 0; $start < $len; $start++) {
            $node = $dict;
            $word = '';
            for ($i = $start; $i < $len; $i++) {
                // Stop as soon as the path leaves the trie.
                if (!is_array($node) || !isset($node[$chars[$i]])) {
                    break;
                }
                $node = $node[$chars[$i]];
                $word .= $chars[$i];
                if (isset($node['end'])) {
                    $words[] = $word;
                }
            }
        }
        return $words;
    }


    /**
     * Load the synonym trie from cache or from its "word=replacement" file.
     *
     * @param string $filename synonym file name (without ".txt")
     * @param bool   $cache    whether to use the global $dc cache
     *
     * @throws InvalidArgumentException when the synonym file does not exist
     * @throws RuntimeException         when the synonym file cannot be opened
     */
    public function load_tongyi($filename='tongyi',$cache = true){
        global $dc;
        $file = DT_ROOT.'/api/dtapicom/trie/'.$filename.'.txt';
        $cacheKey = __CLASS__ . "_" . md5($file);
        // Degrade to "no cache" instead of a fatal error when $dc is not set up.
        $useCache = $cache && is_object($dc);

        if ($useCache) {
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->ty_dict = $cached;
                return;
            }
        }

        // Rebuild from scratch so entries of another synonym file do not remain.
        $this->ty_dict = [];
        $this->readDictLines($file, 'addTongyi');

        if ($useCache) {
            $dc->set($cacheKey, $this->ty_dict, null, 100000);
        }
    }

    /**
     * Add one "word=replacement" line to the synonym trie.
     *
     * Lines without "=", or with an empty word or replacement, are ignored.
     *
     * @param string $str one line of the synonym file
     */
    protected function addTongyi($str)
    {
        $parts = explode('=', $str);
        if (count($parts) < 2 || $parts[0] === '' || $parts[1] === '') {
            return;
        }
        $chars = $this->splitStr($parts[0]);
        if (empty($chars)) {
            return;
        }
        if (!is_array($this->ty_dict)) {
            $this->ty_dict = [];
        }
        // The end marker carries the replacement text.
        $this->insertPath($this->ty_dict, $chars, $parts[1]);
    }

    /**
     * Return the synonym of a word, or the input itself when there is none.
     *
     * @param string|string[] $strArr  word or array of characters
     * @param string          $ty_file synonym file name (without ".txt")
     *
     * @return mixed
     */
    public function tyReplace($strArr,$ty_file='tongyi'){
        $this->load_tongyi($ty_file);
        return $this->lookupSynonym($strArr);
    }

    /**
     * Look a word up in the already loaded synonym trie.
     *
     * @param string|string[] $strArr word or array of characters
     *
     * @return mixed the replacement, or $strArr unchanged when none exists
     */
    private function lookupSynonym($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        $node = $this->ty_dict;
        foreach ($chars as $char) {
            if (!is_array($node) || !isset($node[$char])) {
                return $strArr;
            }
            $node = $node[$char];
        }
        // Explicit comparison so a replacement such as "0" is still honoured.
        if (is_array($node) && isset($node['end']) && $node['end'] !== '') {
            return $node['end'];
        }
        return $strArr;
    }

    /**
     * Replace every word of the synonym dictionary that occurs in the text.
     *
     * Candidate words are searched inside runs of ASCII word characters and
     * CJK characters (U+4E00..U+9FA5); all replacements are then applied to
     * the raw text in one pass, longest word first. The word trie is left
     * untouched.
     *
     * NOTE(review): matching runs on the raw text, so words inside markup
     * (tag or attribute names) are replaced as well.
     *
     * @param string $text     text to process
     * @param string $filename synonym file name (without ".txt")
     *
     * @return string
     */
    public function contentReplace($text,$filename='tongyi'){
        $this->load_tongyi($filename);

        if (!preg_match_all('/[\w\x{4e00}-\x{9fa5}]+/u', (string) $text, $runs)) {
            // No candidate run, or the text is not valid UTF-8.
            return $text;
        }

        $map = [];
        foreach ($runs[0] as $run) {
            foreach ($this->collectWords($this->splitStr($run), $this->ty_dict) as $found) {
                if (!isset($map[$found])) {
                    $map[$found] = $this->lookupSynonym($found);
                }
            }
        }

        // strtr() tries the longest key first and never rescans replaced
        // text, so replacements cannot cascade into each other.
        return empty($map) ? $text : strtr($text, $map);
    }


}

  

原文地址:https://www.cnblogs.com/68xi/p/12266291.html