HDOJ 2222 Keywords Search(AC自动机入门)

题意:

确定有多少模式串是目标串的子串。

思路:

AC自动机:Aho-Corasick Algorithm (Aho-Corasick Automaton)

学习博客:http://www.notonlysuccess.com/index.php/aho-corasick-automaton/

学习资料:http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf

关于博客中的 ACAutomaton 的一点说明:

1. Construct 函数应该是最难理解的 : trie[u][i] = trie[fail[u]][i] 

            // For every possible outgoing edge i of the node u just dequeued:
            for (int i = 0; i < CHILD_NUM; ++i)
            {
                // Reference, so the else branch below can rewrite the table entry in place.
                int& v = trie[u][i];
                if (v)
                {
                    // Real child: its failure link is the i-transition of u's failure node.
                    fail[v] = trie[fail[u]][i];
                    que.push_back(v);
                }
                else 
                    // Missing edge: store the i-transition of u's failure node instead,
                    // so later lookups of trie[u][i] need no walk along the fail chain.
                    v = trie[fail[u]][i];
            }

   这其实是代码中用到的一个技巧,是为了方便 fail 指针的计算。比如 fail[u] = x,v 是 u 经 m 边到达的子节点,求 fail[v] = trie[fail[u]][m] -> fail[v] = trie[x][m]

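   如果 x 节点没有 m 边,但 x 的 fail 链上更短的后缀又存在 m 边呢?这样的话 trie[u][m] = trie[fail[u]][m] 这句话(即 else 分支)就会起到关键作用,相当于预存储的作用:x 比 u 更浅,在 BFS 中先于 u 被处理,它缺失的 m 边已经被改写为 trie[fail[x]][m],因此 trie[x][m] 可以直接使用,不必再沿 fail 链逐级回溯。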

2. Work() 函数: 一旦遇到一个匹配点 p 比如: uhersme 中的 he 匹配了,先把 he 的情况加起来,因为 i 是继续往前走的,如果这时 he 中的 e

   也有匹配的话,也要考虑进来,这就是代码中 while 循环的作用,也是ac automaton 的精妙之处:查找 fail 指针,看看 e 是否在 trie 中。

    // Scans `word` and returns the sum of the values of all pattern-end nodes
    // it reaches; every node is counted at most once.
    int Work(char* word)
    {
        int ret = 0, p = 0;
        for (int i = 0; word[i]; ++i)
        {
            int m = hashtab[word[i]];
            int t = p = trie[p][m];
            // Walk the whole fail chain, not just until the first node whose
            // value is 0: a pattern may end further down the chain even when
            // the current node is not a pattern end (patterns "abc" and "b",
            // text "ab"). -1 marks a node as already harvested; everything
            // below a marked node was harvested in the same earlier walk, so
            // stopping there keeps the total work linear. The walk also ends
            // at the root, because fail[0] == 0 and the root gets marked too.
            while (value[t] != -1)
            {
                ret += value[t];
                value[t] = -1;
                t = fail[t];
            }
        }
        return ret;
    }
#include <cstdio>
#include <cstring>
#include <deque>
#include <iostream>
using namespace std;

const int MAX_NODE = 500010;  // capacity of the node tables below (node 0 is the root)
const int CHILD_NUM = 26;     // one outgoing edge per lowercase letter

/**
 * Aho-Corasick automaton over the lowercase alphabet 'a'..'z'.
 *
 * Call order: Initialize() once, then for every pattern set
 * Reset(), Insert() for each pattern, Construct(), Work().
 *
 * All tables are fixed-size member arrays (roughly 50 MB with a 4-byte int),
 * so an instance should live in static storage rather than on the stack.
 * Insert() does not check capacity: the caller must keep the total number of
 * trie nodes below MAX_NODE.
 */
class AcAutomation
{
private:
    // Written into value[] by Work() once a node has been counted, so that
    // each node is harvested at most once between two Reset() calls.
    static const int COUNTED = -1;

    int size;                       // number of trie nodes in use
    int trie[MAX_NODE][CHILD_NUM];  // child links; a full transition table after Construct()
    int value[MAX_NODE];            // sum of the keys of patterns ending here, or COUNTED
    int fail[MAX_NODE];             // failure link of every node
    std::deque<int> que;            // BFS queue used by Construct()
    int hashtab[256];               // character code -> edge index

    // Edge index of character c. The lookup goes through unsigned char so a
    // char with the high bit set can never yield a negative array index.
    int EdgeOf(char c) const
    {
        return hashtab[static_cast<unsigned char>(c)];
    }

public:
    /**
     * One-time setup: fixes the root's failure link and builds the
     * character-to-edge table. Characters other than 'a'..'z' map to
     * edge 0, i.e. they are treated like 'a'.
     */
    void Initialize()
    {
        fail[0] = 0;
        std::memset(hashtab, 0, sizeof(hashtab));
        for (int i = 0; i < 26; ++i)
            hashtab['a' + i] = i;
    }

    /**
     * Empties the automaton so a new pattern set can be inserted.
     * Only the root row is cleared here; Insert() clears every other row
     * at the moment it allocates the node.
     */
    void Reset()
    {
        size = 1;
        std::memset(trie[0], 0, sizeof(trie[0]));
        std::memset(value, 0, sizeof(value));
    }

    /**
     * Adds the NUL-terminated pattern `word` and adds `key` to the value of
     * its end node. Must be called after Reset() and before Construct():
     * Construct() turns missing edges into transitions, after which the
     * "edge is absent" test below is no longer meaningful.
     */
    void Insert(char* word, int key)
    {
        int p = 0;
        for (int i = 0; word[i]; ++i)
        {
            int m = EdgeOf(word[i]);
            if (!trie[p][m])
            {
                // A fresh node may hold stale links from an earlier pattern set.
                std::memset(trie[size], 0, sizeof(trie[0]));
                trie[p][m] = size++;
            }
            p = trie[p][m];
        }
        value[p] += key;
    }

    /**
     * Computes the failure links breadth-first and replaces every missing
     * edge trie[u][i] by the transition trie[fail[u]][i], so Work() can
     * follow exactly one table entry per text character.
     */
    void Construct()
    {
        que.clear();  // was que.empty(), which only queries and discards the result
        fail[0] = 0;

        // Depth-1 nodes always fail to the root.
        for (int i = 0; i < CHILD_NUM; ++i)
        {
            if (trie[0][i])
            {
                fail[trie[0][i]] = 0;
                que.push_back(trie[0][i]);
            }
        }

        while (!que.empty())
        {
            int u = que.front();
            que.pop_front();

            for (int i = 0; i < CHILD_NUM; ++i)
            {
                int& v = trie[u][i];
                if (v)
                {
                    // fail[u] is shallower than u and was processed earlier,
                    // so trie[fail[u]][i] is already a complete transition.
                    fail[v] = trie[fail[u]][i];
                    que.push_back(v);
                }
                else
                    v = trie[fail[u]][i];
            }
        }
    }

    /**
     * Scans the NUL-terminated text `word` and returns the sum of the keys of
     * all inserted patterns that occur in it. Every pattern end is counted
     * once; a second call without Reset() does not count it again.
     */
    int Work(char* word)
    {
        int ret = 0, p = 0;
        for (int i = 0; word[i]; ++i)
        {
            int m = EdgeOf(word[i]);
            int t = p = trie[p][m];
            // Walk the whole fail chain. Stopping at the first node whose
            // value is 0 would miss a pattern that ends further down the
            // chain (patterns "abc" and "b", text "ab"). Everything below an
            // already-marked node was harvested in the same earlier walk, so
            // stopping at COUNTED keeps the total work linear. The root gets
            // marked as well and fail[0] == 0, which ends the walk there.
            while (value[t] != COUNTED)
            {
                ret += value[t];
                value[t] = COUNTED;
                t = fail[t];
            }
        }
        return ret;
    }
};

// The automaton and the text buffer are globals: both are far too large for
// the stack.
AcAutomation Ac;
char word[1000010];

/**
 * Reads the number of test cases; for each case reads n keywords and one
 * text, and prints how many keywords occur in the text.
 * Stops quietly on malformed or truncated input.
 */
int main()
{
    int cases;
    if (scanf("%d", &cases) != 1)
        return 0;
    Ac.Initialize();

    // "> 0" so that a negative count does not loop almost forever.
    while (cases-- > 0)
    {
        int n;
        if (scanf("%d", &n) != 1)
            break;
        Ac.Reset();

        char is[56];
        for (int i = 0; i < n; ++i)
        {
            // The field width keeps an over-long keyword from overflowing is[56].
            if (scanf("%55s", is) != 1)
                break;
            Ac.Insert(is, 1);
        }
        Ac.Construct();

        // Width is sizeof(word) - 1, leaving room for the terminating NUL.
        if (scanf("%1000009s", word) != 1)
            break;
        printf("%d\n", Ac.Work(word));
    }
    return 0;
}
原文地址:https://www.cnblogs.com/kedebug/p/2879207.html