PHP代码实现AC自动机

能高效地匹配字符串,具体原理就不搬了,这边给出PHP的实现代码:

<?php
class AcAutomation
{
    private $root;

    public function __construct($keywords = array())
    {
        $this->root = $this->createNode();
        foreach ($keywords as $keyword) {
            $this->addKeyword($keyword);
        }
        $this->buildFailIndex();
    }

    /**
     * 创建节点
     * @param string $value
     * @return stdClass
     */
    private function createNode($value = "")
    {
        $node = new stdClass();
        $node->value = $value;
        $node->next  = array();
        $node->fail  = NULL;
        $node->len   = 0;         // keyword长度,0表示非末尾,非0表示keyword的长度
        return $node;
    }


    /**
     * 添加关键词
     * @param $keyword
     */
    private function addKeyword($keyword)
    {
        $keyword = trim($keyword);
        if (!$keyword) {
            return;
        }
        $cur = $this->root;
        $matches = unpack('N*',iconv('UTF-8', 'UCS-4', strtolower($keyword)));
        for ($i = 1; isset($matches[$i]); $i++) {
            $v = $matches[$i];
            if (!isset($cur->next[$v])) {
                $node = $this->createNode($v);
                $cur->next[$v] = $node;
            }
            if (!isset($matches[$i+1])) {
                $cur->next[$v]->len = $i;
            }
            $cur = $cur->next[$v];
        }
    }

    /**
     * 构造fail指针
     */
    private function buildFailIndex()
    {
        $queue = array();
        foreach ($this->root->next as $node) {
            $node->fail = $this->root;
            $queue[] = $node;
        }
        while ($queue) {
            $node = array_shift($queue);
            foreach ($node->next as $child_node) {
                $val = $child_node->value;
                $p = $node->fail;
                while ($p != NULL) {
                    if (isset($p->next[$val])) {
                        $child_node->fail = $p->next[$val];
                        break;
                    }
                    $p = $p->fail;
                }
                if ($p === NULL) {
                    $child_node->fail = $this->root;
                }
                $queue[] = $child_node;
            }
        }
    }

    /**
     * 搜索
     * @param $content
     * @return array
     */
    public function search($content)
    {
        $pos_list = array();
        $p = $this->root;

        $matches = unpack('N*',iconv('UTF-8', 'UCS-4', strtolower($content)));
        for ($i = 1; isset($matches[$i]); $i++) {
            $val = $matches[$i];
            while (!isset($p->next[$val]) && $p != $this->root) {
                $p = $p->fail;
            }
            $p = isset($p->next[$val]) ? $p->next[$val] : $this->root;
            $temp = $p;
            while ($temp != $this->root) {
                if ($temp->len) {
                    $pos_list[] = array($i - $temp->len, $i);
                }
                $temp = $temp->fail;
            }
        }
        return $pos_list;
    }
}

Example:

<?php
$content = file_get_contents(dirname(__FILE__).'/content.txt');
$keywords = array('复杂度','个数','返回','代表','性能','数据','GET','安全性','耗时','优化','文档','并发','类型');
$ac = new AcAutomation($keywords);
$res = $ac->search($content);
foreach ($res as $pos) {
    echo mb_substr($content,$pos[0],$pos[1] - $pos[0],'UTF-8'),PHP_EOL;
}
原文地址:https://www.cnblogs.com/RainCry/p/13207393.html