POJ 3693 Maximum repetition substring(后缀数组+RMQ)

Maximum repetition substring

The repetition number of a string is defined as the maximum number R such that the string can be partitioned into R same consecutive substrings. For example, the repetition number of "ababab" is 3 and "ababa" is 1.

Given a string containing lowercase letters, you are to find a substring of it with maximum repetition number.

Input

The input consists of multiple test cases. Each test case contains exactly one line, which
gives a non-empty string consisting of lowercase letters. The length of the string will not be greater than 100,000.

The last test case is followed by a line containing a '#'.

Output

For each test case, print a line containing the test case number (beginning with 1) followed by the substring of maximum repetition number. If there are multiple substrings of maximum repetition number, print the lexicographically smallest one.

Sample Input
ccabababc
daabbccaa
#
Sample Output
Case 1: ababab
Case 2: aa

分析:题意为求重复次数最多的连续子串。
可以枚举长度L,找长度为L的字符串最多连续出现的次数
如果某个长度为L的子串连续重复出现至少两次,那么这段重复区间一定会覆盖str[0],str[L],str[2*L]...中某两个相邻的位置str[i*L]和str[(i+1)*L].
那么找str[i*L],str[(i+1)*L]的最大公共前缀,就可以找到它们向后匹配的重复连续字符串的长度h。
这个时候还需要找 它是否能够向前匹配到字符。
设k=L-h%L; 那么它需要向前匹配一轮需要的字符为k个
这样就可以转化为判断str[i*L-k]和str[(i+1)*L-k]的最大公共前缀能否到达h
匹配的字符数/L+1为当前长度为L的字符串出现次数
找最大公共前缀的时候,需要用RMQ进行height的预处理操作,
找到最大的出现次数后,用数组记录能达到最大出现次数的长度L(方便输出字典序优先的字符串)
最后需要输出字典序最小的字符串,那么因为后缀排名越靠前,字典序较小.
所以按后缀排名进行遍历,找到字典序最小的字符串.
代码如下:
#include <cstdio>
#include <iostream>
#include <algorithm>
#include <cstring>
#include <cmath>
typedef long long ll;
using namespace std;
// Capacity of every global array; the problem limits the input to 100,000 characters.
const int MAXN=100010;
// Scratch buffers for da(): wa/wb back its x/y key arrays, wv holds the first keys in
// second-key order, and Ws is the counting-sort bucket array.
int wa[MAXN],wb[MAXN],wv[MAXN],Ws[MAXN];
// The string of the test case currently being processed.
char str[MAXN];
// Period lengths recorded by main() each time the current maximum repetition count is reached.
int st[MAXN];
// Sparse table for range-minimum queries: minsum[i][j] is the minimum of the 2^j entries
// starting at index i. Level 0 is loaded from height[] in main(); RMQ_In() fills the rest.
int minsum[MAXN][20];
// Builds levels 1..19 of the sparse table `minsum` over indices 1..num.
// Level 0 (minsum[i][0]) must already be filled by the caller.
// Runs in O(num log num): each level doubles the window covered by the previous one.
void RMQ_In(int num)
{
    for (int level = 1; level < 20; ++level)
    {
        const int window = 1 << level;   // number of entries covered at this level
        const int half = window >> 1;    // window size of the level below
        // Only start positions whose whole window lies inside [1, num] are filled.
        for (int left = 1; left + window - 1 <= num; ++left)
        {
            const int lower = minsum[left][level - 1];
            const int upper = minsum[left + half][level - 1];
            minsum[left][level] = std::min(lower, upper);
        }
    }
}
// Returns the minimum of the level-0 entries minsum[src..des][0] (inclusive) using the
// sparse table built by RMQ_In().
//
// Precondition: 1 <= src <= des and the table has been built for at least `des` entries.
//
// The table level is the largest k with 2^k <= des-src+1. It is computed with integer
// arithmetic rather than log(): the floating-point quotient log(x)/log(2) is
// rounding-sensitive, is undefined for an empty range (log of a non-positive value
// cast to int), and costs two transcendental calls on every query.
int  RMQ_Query(int src,int des)
{
        const int span=des-src+1;
        int k=0;
        while((1<<(k+1))<=span) ++k;
        // Two (possibly overlapping) windows of length 2^k cover [src, des] exactly.
        return std::min(minsum[src][k],minsum[des-(1<<k)+1][k]);
}

// Returns 1 when positions a and b carry the same key pair in r, i.e. r[a]==r[b] and
// r[a+l]==r[b+l]; returns 0 otherwise. The second comparison is only evaluated when
// the first one matches.
int cmp(int *r,int a,int b,int l)
{
    if (r[a] != r[b])
        return 0;
    return (r[a + l] == r[b + l]) ? 1 : 0;
}
// Builds the suffix array of r[0..n-1] into sa[0..n-1] by prefix doubling, using a
// counting sort at every round.
//   r  : input characters; the caller passes len+1 characters so the terminating
//        '\0' takes part as the smallest symbol.
//   sa : output, sa[k] is the start position of the k-th smallest suffix.
//   n  : number of characters to sort (len+1).
//   m  : exclusive upper bound on the values in r (bucket count for the first pass).
// Uses the global scratch arrays wa, wb, wv and Ws.
void da(const char r[],int sa[],int n,int m)  // n is len+1; m only needs to be slightly larger than the largest value in r
{
      int i,j,p,*x=wa,*y=wb,*t;
      // Round 0: counting sort of the single characters; x[i] is the key of position i.
      for(i=0; i<m; i++) Ws[i]=0;
      for(i=0; i<n; i++) Ws[x[i]=r[i]]++;
      for(i=1; i<m; i++) Ws[i]+=Ws[i-1];
      for(i=n-1; i>=0; i--) sa[--Ws[x[i]]]=i;
      // Each round doubles the compared prefix length j; p is the number of distinct
      // keys produced by the round and becomes the bucket count m of the next one.
      // The loop stops once all n suffixes have distinct keys.
      for(j=1,p=1; p<n; j*=2,m=p)
      {
            // y = positions ordered by their second key (the key at position+j).
            // The last j positions have no second half, so they come first.
            for(p=0,i=n-j; i<n; i++) y[p++]=i;
            for(i=0; i<n; i++) if(sa[i]>=j) y[p++]=sa[i]-j;
            // Counting sort of y by first key; iterating backwards keeps the
            // second-key order among equal first keys.
            for(i=0; i<n; i++) wv[i]=x[y[i]];
            for(i=0; i<m; i++) Ws[i]=0;
            for(i=0; i<n; i++) Ws[wv[i]]++;
            for(i=1; i<m; i++) Ws[i]+=Ws[i-1];
            for(i=n-1; i>=0; i--) sa[--Ws[wv[i]]]=y[i];
            // Swap x and y so y holds the old keys, then write the new keys into x:
            // neighbours in sa with the same (first, second) key pair share a key.
            for(t=x,x=y,y=t,p=1,x[sa[0]]=0,i=1; i<n; i++)
                  x[sa[i]]=cmp(y,sa[i-1],sa[i],j)?p-1:p++;
      }
      return;
}
// sa maps a suffix rank to the suffix's start position in the string; Rank maps a start
// position back to its rank, so the two are inverse mappings. height is filled in by
// calheight(). In this template the ranks of real suffixes start at 1: calheight()
// assigns Rank from sa[1] upward.
int sa[MAXN],Rank[MAXN],height[MAXN];

// Fills the global Rank and height arrays from a finished suffix array.
//   r  : the text, terminated by '\0'.
//   sa : suffix array of the text; ranks 1..n are the n real suffixes.
//   n  : text length.
// After the call, Rank[pos] is the rank of the suffix starting at pos, and
// height[rank] is the length of the common prefix of the suffixes ranked
// rank-1 and rank.
void calheight(const char *r,int *sa,int n)
{
    for (int rk = 1; rk <= n; ++rk)
        Rank[sa[rk]] = rk;

    int matched = 0;  // common-prefix length carried over from the previous position
    for (int pos = 0; pos < n; ++pos)
    {
        // Moving one position to the right can shorten the common prefix by at most one.
        if (matched)
            --matched;
        const int neighbour = sa[Rank[pos] - 1];  // start of the suffix ranked just before
        while (r[pos + matched] == r[neighbour + matched])
            ++matched;
        height[Rank[pos]] = matched;
    }
}
// NOTE(review): ct is not referenced anywhere in the visible code.
int ct[MAXN];
int main()
{
  int t,len,maxx,r,a,b,c,h,q1,q2,tem,h2,ans,y,times,cnt=0,anslen,s,start,k2,Case=0;
   while(scanf("%s",str)!=EOF){
        Case++;
        maxx=0;
      if(str[0]=='#')break;
      len=strlen(str);
      da(str,sa,len+1,130);
      calheight(str,sa,len);
      for(int i=2;i<len;i++)
          minsum[i][0]=height[i];
          RMQ_In(len);
      for(int k=1;k<len;k++)
      {
          for(int j=0;j<len;j+=k)
          {
              if(j+k>=len)
                break;
           a=min(Rank[j],Rank[j+k]);
           b=max(Rank[j],Rank[j+k]);
           h=RMQ_Query(a+1,b);
           times=h/k+1;
           y=h%k;
            if(j-(k-h%k)>=0)
            {
              q1=j-(k-h%k);
              q2=j+k-(k-h%k);
              int a=min(Rank[q1],Rank[q2]);
              int b=max(Rank[q1],Rank[q2]);
              h2=RMQ_Query(a+1,b);
              if(h2>=h)
              times++;
            }
             if(times>maxx){
                maxx=times;
                cnt=0;
             }
             if(times==maxx)
                st[cnt++]=k;

             }
           // maxx=max(ans,maxx);
        }
          anslen=-1;
          for(int i=1;i<=len&&anslen==-1;i++)
            for(int j=0;j<cnt;j++)
            {
                int a=min(i,Rank[sa[i]+st[j]]);
                int b=max(i,Rank[sa[i]+st[j]]);
               s=RMQ_Query(a+1,b);
               if(s>=(maxx-1)*st[j])
                {
                 start=sa[i];
                // cout<<"start="<<start<<endl;
                 anslen=maxx*st[j];
                 break;
               }
            }
     printf("Case %d: ",Case);
      for(int i=start;i<start+anslen;i++)
       printf("%c",str[i]);

        printf("
");
      }

return 0;
}


原文地址:https://www.cnblogs.com/a249189046/p/7419410.html