kmp总结及其应用

kmp含义

　　克努斯-莫里斯-普拉特算法，一种字符串查找算法。

　　字符串算法主要是用于主串 S( s1,s2,s3,...,sn ), 模式串T( t1,t2,...,tm ), 之间的匹配问题.　

　　相对于朴素模式匹配的 O(n*m) 而言: 当 Si != Tj 失配时, 主串下标 i 不回溯, 而是将模式串下标 j 回溯到合适的地方, 再继续比较 Tj, Si.

KMP 的时间复杂度在最坏情况下也是 O(N+M) (求 next 数组 O(M), 匹配 O(N)); 朴素匹配在极端情况下才是 O(N*M).

　　假定串 S( i-j+1, i ) 与模式串 T( 1, j ) 匹配时, Si != Tj 不匹配,此时需j最短回溯到 k,

　　则存在 T(1,k-1) = T( j-k+1, j-1 ), 此时 k = next[j], 再令 Si 与 Tk 比较.

　　则我们得出 next[] 的定义:

　　　　next[j] = 0, 当 j = 1

　　　　next[j] = Max{ k | 1 < k < j, 且 T(1,k-1) = T(j-k+1,j-1) }, 当此集合不空时

　　　　next[j] = 1, 其它情况.

 1 int kmp( char *S, char *T ){ // Text S and pattern T; both are indexed from 1.
 2     int la = strlen(S), lb = strlen(T); // NOTE(review): strlen counts from index 0 although the data is treated as 1-based - confirm how callers fill S[0]/T[0]
 3     int i = 1, j = 1;
 4     while( i <= la && j <= lb ){
 5         if( j == 0 || S[i] == T[j] ) i++, j++; // j == 0 means no prefix can be reused: advance both indices
 6         else    j = next[j]; // Mismatch: slide the pattern so that position next[j] is compared with S[i] next
 7     }
 8     if( j > lb ) return i-j; // Whole pattern matched: j == lb+1 here, so this returns i-lb-1
 9     return -1; // No match
10 }

next数组

　　next函数,表示对于模式串而言,其最长的前缀与后缀相同的长度.

　　由定义知道 next[1] = 0;

　　设 next[j] = k, 这表明在模式串中存在下列关系

　　　　T( 1, k-1 ) = T( j-k+1, j-1 )

　　此时 next[ j+1 ]的取值有两种情况:

　　　　1. 当 T[k] == T[j] 时, 此时有 T( 1,k ) = T( j-k+1, j ), 则此时 next[ j+1 ] = next[j] + 1

　　　　2. 当 T[k] != T[j] 时, 此时可把求 next函数值的问题看作是一个模式匹配的问题.整个模式串既是主串又是模式串.

按照前面主串与模式串匹配的思路, 则当 T[k] != T[j] 时, 应将模式串下标 k滑动到 next[k]时, 再与 T[j] 比较,

　　　　最终可能出现两种情况:

　　　　　　　　1. 匹配到: 存在由 k 反复取 next 得到的 k', 使 T[k'] == T[j], 此时 next[ j+1 ] = k' + 1;

　　　　　　　　2. 一直无法匹配则最后会得到, next[ j+1 ] = 1.

 1 void GetNext( char *T, int *nxt ){ // Fills nxt[1..len+1] for the pattern T, which is indexed from 1 here.
 2     int len = strlen(T);
 3     int i = 0, j = 1; // j walks the pattern; i is the fallback candidate compared against T[j]
 4     nxt[1] = 0;
 5     while( j <= len ){
 6         if( i == 0 || T[i] == T[j] )
 7             nxt[ ++j ] = ++i; // Extend: the entry for j+1 is one past the matched position i
 8         else i = nxt[i]; // Mismatch: fall back to a shorter candidate
 9     }
10 }

应用模型

　　1. 模式串是否在主串中出现.

　　　　 poj 3080 Blue Jeans

　　　　枚举其中一个串的主串,然后与其他串进行KMP匹配即可. 此题细节处理使用了STL.string.substr( 起点l, 数量num ).

View Code

#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<string>
#include<algorithm>
using namespace std;

// Input strings, filled by main (one per row).
char str[15][100];
// n: number of strings in the current test case; next: failure table used by kmp.
int n, next[100];
// Best common substring found by solve.
string res;
// Set by solve: true when a common substring was found.
bool flag;

// Builds the KMP failure table for the first `len` characters of T.
//
// The table is 1-based: nxt[p] is the pattern position (1-based) to retry
// after a mismatch at position p, with 0 meaning "advance the text instead".
// Entries nxt[1..len+1] are written, so `nxt` must hold at least len+2 ints.
//
// nxt[1] is set explicitly so that the result does not depend on the buffer
// having been zero-initialised by the caller.
void GetNxt( const string &T, int *nxt, int len ){
    nxt[1] = 0;
    int i = 0, j = 1;
    while( j <= len ){
        if( i == 0 || T[i-1] == T[j-1] )
            nxt[++j] = ++i;      // extend the current border by one character
        else
            i = nxt[i];          // fall back to the next shorter border
    }
}
// Returns true when pattern T occurs somewhere in the C string S.
//
// Indices i (text) and j (pattern) are 1-based; the failure table is rebuilt
// in the global `next` array on every call.
//
// The global is written as ::next because, with `using namespace std;` in
// effect, an unqualified `next` is ambiguous with std::next on C++11 and
// later compilers.
bool kmp(char *S, const string &T){
    int la = strlen(S), lb = T.size();
    int i = 1, j = 1;
    GetNxt( T, ::next, lb );
    while( i <= la && j <= lb ){
        if( j == 0 || S[i-1] == T[j-1] ) i++, j++;
        else j = ::next[j];           // mismatch: reuse the longest matching prefix
        if( j > lb ) return true;     // every pattern character matched
    }
    return false;
}
void solve(){
    flag = false;    
    string st = str[0], tmp;    
    for(int L = 60; L >= 3; L--){
        for(int i = 0; i+L <= 60; i++){    
            tmp = st.substr(i,L);
            bool a = true;    
            for(int k = 1; k < n && a; k++)
                if(  kmp( str[k], tmp ) == false ) a = false;
            if( a == true ){
                if( flag == false ) flag = true, res = tmp;
                if( res > tmp ) res = tmp;
            }    
        }
        if( flag ) return;    
    }
}
// Reads the number of test cases, then for each case the string count and
// the strings themselves, and prints the result computed by solve().
int main(){
    int cases;
    scanf("%d", &cases);
    while( cases-- != 0 ){
        scanf("%d", &n );
        int idx = 0;
        while( idx < n ){
            scanf("%s", str[idx] );
            ++idx;
        }
        solve();
        if( flag ) printf("%s\n", res.c_str() );
        else puts("no significant commonalities");
    }
    return 0;
}

　　　 poj 3450 Corporate Identity

　　　　同上题差不多.但是这题 N达到了4000,串长度为200, 暴力肯定不行,二分枚举长度,然后进行匹配.

View Code

#include<cstdio>
#include<cstring>
#include<cstdlib>
#include<string>
#include<algorithm>
using namespace std;
// Capacity used for the string table and the per-string arrays below.
const int N = 4010;

// Input strings, filled by main (one per row).
char str[N][210];
// n: number of strings; next: failure table used by kmp; Len[i]: strlen(str[i]).
int n, next[N], Len[N];
// res: answer stored by solve. The global st is not referenced by the
// functions visible here (find and solve declare their own local st).
string res, st;
// Set by solve: true when a common substring was found.
bool flag;

// Computes the 1-based KMP failure table of the first `len` characters of T.
// Writes nxt[1..len+1]; nxt[1] is always 0.
void GetNxt(string T,int *nxt, int len){
    nxt[1] = 0;
    int border = 0;
    for( int pos = 1; pos <= len; ){
        const bool extend = ( border == 0 ) || ( T[border-1] == T[pos-1] );
        if( !extend ){
            border = nxt[border];   // retry with the next shorter border
            continue;
        }
        ++pos;
        ++border;
        nxt[pos] = border;
    }
}
// Returns true when the first `lb` characters of T occur within the first
// `la` characters of S. Indices i and j are 1-based; the failure table is
// rebuilt in the global `next` array on each call.
//
// ::next is qualified because `using namespace std;` makes an unqualified
// `next` ambiguous with std::next on C++11 and later. T is taken by const
// reference to avoid copying the pattern on every call.
bool kmp( char *S, const string &T, int la, int lb ){
    int i = 1, j = 1;
    GetNxt( T, ::next, lb );
    while( i <= la && j <= lb ){
        if( j == 0 || S[i-1] == T[j-1] ) i++, j++;
        else    j = ::next[j];        // mismatch: fall back in the pattern only
        if( j > lb ) return true;     // full pattern matched
    }
    return false;
}
bool find( int L ){
    string st = str[0],tmp;    
    for(int i = 0; i+L <= Len[0]; i++){
        tmp = st.substr( i, L );
        bool f = true;    
        for(int k = 1; k < n && f; k++)
            if( kmp( str[k], tmp, Len[k], L ) == false ) f = false; 
        if( f ) return true;
    }    
    return false;
}
// Finds the longest substring of str[0] shared by all input strings and,
// among those of maximal length, the lexicographically smallest one.
// Sets `flag` to whether such a substring exists and stores it in `res`.
//
// The length is binary-searched over the closed range [1, Len[0]]: if a
// common substring of length L exists, so does one of every shorter length.
// The previous half-open search over [0, Len[0]) never tried Len[0] itself
// and could discard a valid length-1 answer after probing length 0.
void solve(){
    flag = false;
    int lo = 1, hi = Len[0], maxlen = -1;
    while( lo <= hi ){
        const int mid = lo + ((hi-lo)>>1);
        if( find(mid) ) maxlen = mid, lo = mid+1;
        else hi = mid-1;
    }
    if( maxlen == -1 ) return;

    // Second pass: pick the smallest candidate of the maximal length.
    const string st = str[0];
    for(int i = 0; i+maxlen <= Len[0]; i++){
        const string tmp = st.substr( i, maxlen );
        bool f = true;
        for(int k = 1; k < n && f; k++)
            if( !kmp( str[k], tmp, Len[k], maxlen ) ) f = false;
        if( f && ( !flag || tmp < res ) ){
            flag = true;
            res = tmp;
        }
    }
}
int main(){
    while( scanf("%d", &n), n ){
        for(int i = 0; i < n; i++){
            scanf("%s", str[i] ); Len[i] = strlen(str[i]);
        }
        solve();
        if( flag ) printf("%s\n", res.c_str() );
        else puts("IDENTITY LOST");
    }    
    return 0;
}

　　　　poj 1226 Substrings 　　　　

　　　　本质还是一样求模式串在主串中是否出现. 拿一个串从大到小暴力分解子串. 与其他原串与inverse串匹配.

View Code

#include<cstdio>
#include<cstring>
#include<cstdlib>
#include<string>
#include<algorithm>
using namespace std;

// Row width of the input string table.
const int N = 110;

// Input strings, filled by main.
char str[120][N];
// bap[i] holds str[i] reversed (built in main for i >= 1).
string bap[120];
// n: number of strings; minlen: shortest input length; m is not used by the
// code visible here.
int n, m, minlen;
// Len[i]: strlen(str[i]); next: failure table shared by GetNxt and kmp.
int Len[120], next[120];

// Fills the global failure table next[1..len+1] for the first `len`
// characters of T (1-based positions; next[1] is 0).
//
// The table is addressed as ::next because `using namespace std;` makes an
// unqualified `next` ambiguous with std::next on C++11 and later.
void GetNxt(const char *T, int len){
    int i = 1, j = 0; ::next[1] = 0;
    while( i <= len ){
        if( j == 0 || T[i-1]==T[j-1] )
            ::next[++i] = ++j;    // extend the current border
        else j = ::next[j];       // fall back to a shorter border
    }
}
// Returns true when the first `lb` characters of T occur within the first
// `la` characters of S. The failure table is rebuilt through GetNxt on each
// call.
//
// The table is addressed as ::next because `using namespace std;` makes an
// unqualified `next` ambiguous with std::next on C++11 and later.
bool kmp(const char *S,int la,const char *T,int lb){
    int i = 1, j = 1; GetNxt(T,lb);
    while( i<=la && j<=lb ){
        if( j == 0 || S[i-1] == T[j-1] ) i++,j++;
        else    j = ::next[j];        // mismatch: only the pattern index moves back
        if( j > lb ) return true;     // full pattern matched
    }
    return false;
}
int solve(){
    string st = str[0];
    for(int L = minlen; L >= 1; L-- ){
        for(int i = 0; i+L <= Len[0]; i++){
            bool find = true;
            string tmp = st.substr( i, L );
            for(int j = 1; j < n && find; j++){
                if( !kmp(str[j],Len[j],tmp.c_str(),L) && !kmp(bap[j].c_str(),Len[j],tmp.c_str(),L) )
                    find = false;
            }
            if(find) return L;    
        } 
    }
    return 0;
}
// Reads the test cases; for each one stores the strings, their lengths and
// reversed copies, tracks the shortest length, and prints solve()'s result.
int main(){
    int cases;
    scanf("%d", &cases);
    while( cases-- != 0 ){
        scanf("%d", &n);
        scanf("%s", str[0] );
        Len[0] = strlen(str[0]);
        minlen = Len[0];
        for(int idx = 1; idx < n; ++idx){
            scanf("%s", str[idx] );
            Len[idx] = strlen(str[idx]);
            if( Len[idx] < minlen ) minlen = Len[idx];
            string mirrored( str[idx] );
            reverse( mirrored.begin(), mirrored.end() );
            bap[idx] = mirrored;
        }
        printf("%d\n", solve() );
    }
    return 0;
}

　　　　poj 2541 Binary Witch

　　　　这一题还是暴力过去的.不过据说有 dp(i,j)的状态压缩, 字符逆序处理,然后KMP.string.substr挺管用..

View Code

#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<algorithm>
#include<string>
using namespace std;
// Capacity of the text buffer and of the failure table.
const int N = (int)1e6+1100;

// Working text: the input string, extended in place by main.
char str[N];
// n: current length of str; m: number of characters to append.
int n, m;
// Failure table used by kmp.
int next[N];

// Computes the 1-based KMP failure table for the first `len` characters of
// T, writing nxt[1..len+1] with nxt[1] == 0.
void GetNxt(string T,int *nxt, int len){
    nxt[1] = 0;
    int pos = 1;
    int cand = 0;
    while( pos <= len ){
        if( cand != 0 && T[pos-1] != T[cand-1] ){
            cand = nxt[cand];       // mismatch: try the next shorter border
        } else {
            pos += 1;
            cand += 1;
            nxt[pos] = cand;
        }
    }
}
// Searches the first `la` characters of S for the first `lb` characters of T.
// Returns the 0-based offset in S of the first occurrence, or -1 if there is
// none. The failure table is rebuilt in the global `next` array on each call.
//
// Both strings are taken by const reference: the original by-value
// parameters copied the whole text on every call. ::next is qualified
// because `using namespace std;` makes an unqualified `next` ambiguous with
// std::next on C++11 and later.
int kmp(const string &S, int la, const string &T, int lb){
    int i = 1, j = 1;
    GetNxt( T, ::next, lb );
    while( i <= la && j <= lb ){
        if( j == 0 || S[i-1] == T[j-1] )
            i++, j++;
        else j = ::next[j];
        // Here j == lb+1 and the match ends at 1-based i-1, so it starts at
        // 0-based offset i-j.
        if( j > lb ) return i-j;
    }
    return -1;
}
// Reads (n, m, string) cases until EOF. For each case it appends m characters
// to str, one at a time, and prints only the appended characters.
int main(){
    while( scanf("%d%d", &n,&m) != EOF){
        scanf("%s", str);
        int start = n;    // original length: index of the first appended character
        for(int i = 0; i < m; i++){
            // Work on a reversed copy so that the current suffix of str becomes a prefix of s.
            string s = str;                
            reverse( s.begin(), s.end() );    
            bool find = false;    
            // Try suffix lengths from min(13,n) down to 1 and stop at the first one that is found.
            for(int L = min(13,n); L >= 1 && !find; L-- ){
                int la = n-1, lb = L;    
                // t1: first L characters of s; s1: s without its first character.
                string s1 = s.substr(1,la), t1 = s.substr(0,lb);
            //    printf("s1 = %s, t1 = %s\n", s1.c_str(), t1.c_str() );    
                int d = kmp( s1, la, t1, lb );
                // A match at offset d of s1 covers s[d+1..d+L], so s[d] is the
                // character right before it in s; that character is appended.
                if( d != -1 ) find = true, str[n++] = s[d];    
            }    
            // No suffix length matched: append '0'.
            if( find == false ) str[n++] = '0';
            str[n] = '\0';    
            //printf("str = %s\n", str);    
        }    
        // Print the appended characters; no newline is written after them.
        for(int i = start; i < n; i++) printf("%c",str[i]);    
    }    
    return 0;
}

　　2. 模式串在主串中的出现次数.

　　　　poj 3461 Oulipo

　　　　因为next函数值意义为最长的前缀与后缀相同长度. 当模式串Tj与主串Si 在 (i,j)匹配完成,此时下一个可能出现的匹配的起始位置为 (i+1,lenS) , 若我们使主串下标i回溯时,则会使时间复杂度达到O(N*M), 因为是要找与模式串相同的. 则我们只需要令j = next[j], 此时 T( 1, nxt[j]-1 ) = S( i-nxt[j]+1, i-1 ) , 表示其最长的前缀和后缀,此时i就无需回溯,然后继续匹配.统计次数即可.

　　　　核心点是主串下标不回溯, 并利用 next函数意义(最长的相同前缀和后缀)

View Code

#include<cstdio>
#include<cstdlib>
#include<cstring>

// Capacity of the text buffer s1.
const int N = (int)1e6+10;

// s1: text to search in; s2: pattern (main reads s2 first, then s1).
char s1[N], s2[10010];
// Failure table for the pattern, filled by GetNxt from kmp.
int next[10010];

// Computes the 1-based KMP failure table for the first `len` characters of
// T. Writes nxt[1..len+1]; nxt[1] is 0.
void GetNxt( char *T, int *nxt, int len ){
    nxt[1] = 0;
    for( int k = 0, pos = 1; pos <= len; ){
        if( k == 0 || T[k-1] == T[pos-1] ){
            nxt[pos+1] = k+1;   // border for pos+1 is one longer than the matched one
            ++pos;
            ++k;
        }
        else k = nxt[k];        // fall back to a shorter border
    }
}
// Counts the occurrences (overlaps included) of pattern T in text S.
// After each full match the pattern index falls back through the failure
// table, so the text index never moves backwards.
int kmp( char *S, char *T ){
    const int la = strlen(S);
    const int lb = strlen(T);
    GetNxt( T, next, lb );
    int cnt = 0;
    int i = 1, j = 1;
    while( i <= la && j <= lb ){
        if( j != 0 && S[i-1] != T[j-1] ){
            j = next[j];
        } else {
            ++i;
            ++j;
        }
        if( j > lb ){
            ++cnt;
            j = next[j];   // continue from the longest border of the whole pattern
        }
    }
    return cnt;
}

// Reads the number of cases; each case is a pattern followed by a text, and
// the output is the number of times the pattern occurs in the text.
int main(){
    int cases;
    scanf("%d", &cases);
    for( ; cases != 0; --cases ){
        scanf("%s", s2);
        scanf("%s", s1);
        const int hits = kmp( s1, s2 );
        printf("%d\n", hits );
    }
    return 0;
}

　　　　poj 3167 Cow Patterns 有点难度.

　　　　这题是求一模式串与主串的相对大小匹配,所有位置.

　　　　如果给我们的是绝对大小,那么我们就能用 poj 3461的解法,每次匹配到了再令j = next[j] 即可,得出所有匹配位置.

而对于相对大小,我们需要使用到一个结论:

　　　　两个偏序序列, 对于其每一位, 其前面比起小的数量,和与其相等的数量, 都相等, 则两个偏序序列相同. (小,和等于都一样,则大于也一样- -..)

利用这个结论,我们就可以判定快速判定两个偏序序列是否相同. 从宏观的角度上看, 还是一样对模式串求个next函数,然后再对模式串与主串kmp匹配.

　　　　这里比较特殊的地方, 就在于, 两个值的比较, 根据定义, (1,k) = (i-k+1,i) 时, next[ i+1 ] = k+1 . 模式串中的总是用的前缀,而主串中一直用的后缀.

那么我们就可以预处理出模式串的 m1(小于数量), m2(等于数量), 对于主串则使用树状数组来维护, 当失配时,则为维护树状数组.具体如下.

　　　　若当前模式串 T(1,j) 与主串S( i-j+1, i ) 比较时, Tj != Si, 此时失配, 需要令 j = next[j] 再进行匹配. 模式串我们预处理了前缀.可以O(1)得出.无需处理.

而,对于主串而言, 前面的树状数组中存放的元素是, ( i-j+1, i ), 当令 j = next[j], 再与 Si比较时, 此时树状数组中应该存放序列 S( i-next[j]+1, i ) , 那么我们就

需要手动的删除掉 S( i-j+1, i-next[j] ) 这一段. 对于模式串自身求next函数,操作一样.

View Code

//poj 3167 kmp + binary index tree
//yefeng1627
#include<cstdio>
#include<cstring>
#include<cstdlib>
// Capacities of the text array (N) and the pattern array (K).
const int N = (int)1e5+10;
const int K = (int)3e4+10;

// a: text values (1-based); b: pattern values (1-based); c: Fenwick tree
// indexed by value, used by add() and sum().
int a[N], b[K], c[30];
// nxt: failure table of the pattern. m1[i] / m2[i]: sum(b[i]-1) / sum(b[i])
// taken by main right after inserting b[1..i] into the tree.
int nxt[K], m1[K], m2[K];
// n: text length; k: pattern length; S: third value read by main (not used
// elsewhere in the code visible here).
int n, k, S;
// cnt: number of matches found; res[0..cnt-1]: their start positions.
int cnt, res[N];

// Fenwick-tree point update: adds v at index x in c, visiting indices below 30.
void add(int x,int v){
    for( ; x < 30; x += x & -x )
        c[x] += v;
}
// Fenwick-tree prefix query: returns the total stored at indices 1..x of c
// (0 when x < 1).
int sum(int x){
    int total = 0;
    for( ; x >= 1; x -= x & -x )
        total += c[x];
    return total;
}
// Builds nxt[1..k+1], the failure table of the pattern b[1..k] under rank
// comparison: b[i] extends a border of length j when its counts in the
// current window equal m1[j] / m2[j], the counts main recorded for b[j]
// within b[1..j]. The Fenwick tree c holds the window values and is updated
// whenever the window grows or shrinks.
void GetNxt(){
    memset( c, 0, sizeof(c));
    int i = 1, j = 0; nxt[1] = 0;
    while( i <= k ){
    //    printf("i:%d,j:%d b-1=%d, b=%d\n", i,j, sum(b[i]-1),sum(b[i]) );    
        // sum(b[i]-1): tree values below b[i]; sum(b[i]): tree values up to and including b[i].
        if( j == 0 || (sum(b[i]-1)==m1[j]&&sum(b[i])==m2[j]) )
        {    nxt[++i] = ++j; if(i<=k) add(b[i],1); } // advance, then insert the new b[i] into the tree
        else{
            // Before falling back to nxt[j], remove b[i-j+1 .. i-nxt[j]] from the tree.
            for(int x = i-j+1; x <= i-nxt[j]; x++) add(b[x],-1);
            j = nxt[j];
        }        
    }
    //printf("k = %d, i = %d\n", k, i );    
    //for(i = 1; i <= k+1; i++)
    //    printf("%d ", nxt[i] ); puts("");
}
// Finds every position where the text a[1..n] matches the pattern b[1..k]
// under the same rank comparison used by GetNxt. Start positions are stored
// in res[0..cnt-1]. The Fenwick tree c holds the text values of the window
// being compared.
void kmp(){
    cnt = 0; GetNxt();
    int i = 1, j = 1;
    memset(c,0,sizeof(c));    
    add(a[1],1);    // the initial window is a[1] alone
    while( i<=n && j<=k ){
        // Compare the counts of a[i] in the window with those recorded for b[j].
        if( j == 0 || (sum(a[i]-1)==m1[j]&&sum(a[i])==m2[j]) ){
            ++i,++j; if(i<=n) add(a[i],1);    // advance, then insert the new a[i]
        }    
        else{
            // Mismatch: remove a[i-j+1 .. i-nxt[j]] from the tree, then fall back.
            for(int x = i-j+1; x <= i-nxt[j]; x++) add(a[x],-1);
            j = nxt[j];
        }
        if( j > k ){
    //        printf("i = %d, k = %d\n", i, k);    
            // Full match: record i-k, then shrink the window in the same way to keep searching.
            res[cnt++] = i-k;
            for(int x = i-j+1; x <= i-nxt[j]; x++) add(a[x],-1);
            j = nxt[j];
        }    
    }
}
// Reads (n, k, S) cases until EOF, then the text and the pattern. While
// reading the pattern it records m1/m2 for each position, runs kmp(), and
// prints the match count followed by one match position per line.
int main(){
    while( scanf("%d%d%d", &n,&k,&S) != EOF ){
        for(int t = 1; t <= n; ++t)
            scanf("%d", &a[t] );

        memset(c,0,sizeof(c));
        for(int p = 1; p <= k; ++p){
            scanf("%d", &b[p] );
            add( b[p], 1 );
            // Counts taken after inserting b[1..p] into the tree.
            m1[p] = sum(b[p]-1);
            m2[p] = sum(b[p]);
        }

        kmp();
        printf("%d\n", cnt );
        int shown = 0;
        while( shown < cnt ){
            printf("%d\n", res[shown] );
            ++shown;
        }
    }
    return 0;
}

　　 3. 求循环节长度 / 最小覆盖子串长度图形介绍 http://blog.csdn.net/fjsd155/article/details/6866991

　　　　poj 2406 Power Strings

　　　　kmp的nxt函数过程,会将模式串一个周期一个周期的构造, 对于 (i+1) - nxt[ i+1 ] (因为我们是通过 Ti与Tj 得到nxt[i+1]的),

　　　　即是其周期长度, 当目前总长度 i % { (i+1)-nxt[i+1] } = 0, 时, 则意味着最后一个周期构造完成, 否则 i % { (i+1)-nxt[i+1] }表示目前最后一个周期串已构造出了多少个.

　　　　poj 2185 Milking Grid 有点难度,且题意不是很好懂.

　　　　这题所指的最小覆盖长度,其实就是最小循环周期长度.当然并非是完整循环,换句话说是单元串a,重复k次可以覆盖str, 其中strlen(a*k) >= strlen(str),

　　　　并且我们知道 N-next(N)是最小覆盖长度, 之后的 j = next( next(N) )逐渐增大, 解决此题的思路是:

　　　　首先处理宽度width, 寻找所有行都有的最小覆盖宽度 w`, 极端情况是 c. 因为每个串都能覆盖本身.

　　　　之后再将 r 个长度为 c 的串(1,c) 截断成 r 个长度为 width 的串(1,width), 然后对这 r 个串各求一个 HASH 值, 得到一个数组 key[r].

　　　　然后对这个数组求一个next函数, 高度 high 即为 r - next(r),

View Code

#include<cstdio>
#include<cstring>
#include<cstdlib>

// Capacity of the input buffer and of the failure table.
const int N = (int)1e6+10;
// Input string read by main.
char s[N];
// 1-based failure table of s, filled in main.
int nxt[N];

// For each input string, up to EOF or a line starting with '.', prints how
// many times its smallest repeating unit fits: len / period when the period
// divides len, otherwise 1.
int main(){
    for(;;){
        if( scanf("%s", s) == EOF ) break;
        if( s[0] == '.' ) break;
        const int len = strlen(s);

        // Failure table (1-based), written up to nxt[len+1].
        nxt[1] = 0;
        int border = 0, pos = 1;
        while( pos <= len ){
            if( border != 0 && s[border-1] != s[pos-1] ){
                border = nxt[border];
            } else {
                ++pos;
                ++border;
                nxt[pos] = border;
            }
        }

        const int period = len + 1 - nxt[len+1];
        if( len % period == 0 ) printf("%d\n", len/period );
        else puts("1");
    }
    return 0;
}

　　　　poj 1961 Period

　　　　对于模式串本身求next函数值时,其实其是一个一个周期在构造串, N-next(N)表示串的循环周期, 而N%(N-next(N))即为最后一个周期已构造串的数量.

View Code

#include<cstdio>
#include<cstring>
#include<cstdlib>
#include<string.h>
#include<algorithm>
using namespace std;

// Capacity of the input buffer and of the failure table.
const int N = (int)1e6+10;
// n: length announced in the input (0 ends the input); nxt: 1-based failure table of s.
int n, nxt[N];
// Input string read by main.
char s[N];

// For every test case, prints each prefix length p (p >= 2) that is a whole
// number (>= 2) of repetitions of some shorter string, with the repetition
// count, followed by a blank line.
//
// The loop checks scanf's return value, so truncated input without the
// terminating 0 ends the program instead of looping forever on a stale n.
// The string read is bounded to the size of s.
int main(){
    int Case = 1;
    while( scanf("%d",&n) == 1 && n ){
        if( scanf("%1000009s", s) != 1 ) break;
        const int len = strlen(s);

        // 1-based failure table, written up to nxt[len+1].
        int i = 1, j = 0; nxt[1] = 0;
        while( i <= len ){
            if( j == 0 || s[i-1] == s[j-1] )
                nxt[++i] = ++j;
            else j = nxt[j];
        }

        printf("Test case #%d\n", Case++);
        for(int p = 2; p <= len; p++){
            // nxt[p+1]-1 is the longest border of the prefix of length p, so
            // p+1-nxt[p+1] is its smallest period. nxt[p+1] > 1 requires a
            // non-empty border, i.e. at least two repetitions.
            const int period = p+1-nxt[p+1];
            if( p%period == 0 && nxt[p+1] > 1 )
                printf("%d %d\n", p, p/period );
        }
        puts("");
    }
    return 0;
}

　　 4. 求串的前缀最大长度, 且其前缀与后缀相同. (最大前缀与后缀)

　　　　poj 2752 Seek the Name

　　　　对于串T(1,i), 我们考虑 next函数定义

　　　　next[ i ] = Max{ k | 1 < k < i && T( 1,k-1 ) = T( i-k+1, i-1 ) 且集合不为空, } , 则可以知道,

字串(1,next[ i-1 ]) 即为串 T(1,i-1) 的最大前缀与后缀. 此时再考虑串 [1,next[i-1] ]的最大前缀与后缀,

如此反复,直到 i = 0 结束. 因为定义 k < i, 其实其本身 (1,i)也是其最大前缀和后缀. 逆序输出即可.

　　　　再重复说明下, kmp的next函数值是通过比较 Ti 与 Tj , 若 Ti = Tj ,则 next[ j+1 ] = i+1, 所以,

我们要获取i位置的最后匹配位置,则需要用next[ i+1 ], 因为其包含了 Ti = T[ next[i+1] - 1 ].

View Code

#include<cstdio>
#include<cstring>
#include<cstdlib>
// Capacity of the input buffer and of the work arrays.
const int N = 400010;
// Input string read by main.
char s[N];
// res: lengths collected for output; nxt: 1-based failure table of s.
int res[N], nxt[N];

// For each input string, prints in increasing order every length L such that
// the prefix of length L equals the suffix of length L, ending with the full
// length. The values are separated by single spaces on one line.
int main(){
    while( scanf("%s", s) != EOF ){
        const int len = strlen(s);

        // 1-based failure table, written up to nxt[len+1].
        nxt[1] = 0;
        for( int border = 0, pos = 1; pos <= len; ){
            if( border != 0 && s[border-1] != s[pos-1] ){
                border = nxt[border];
                continue;
            }
            ++pos;
            ++border;
            nxt[pos] = border;
        }

        // Collect lengths from longest to shortest by following the table.
        int cnt = 0;
        res[cnt++] = len;
        for( int x = nxt[len+1]; x > 1; x = nxt[x] )
            res[cnt++] = x-1;

        // Emit them shortest first; only the last value has no trailing space.
        for( int idx = cnt-1; idx >= 0; --idx ){
            if( idx == 0 ) printf( "%d", res[idx] );
            else printf( "%d ", res[idx] );
        }
        puts("");
    }
    return 0;
}