二次离线莫队

正常的莫队如果其单次插入或者删除的时间开销是$f(n)$,那么我们的莫队的时间开销就是$O(f(n)*nsqrt n)$或者$O((f(n)*n)^{1.5})$次方，当$f(n)$过大，比如$log^2(n)$，那么时间开销就无法接受。二次离线莫队可以把时间开销降到$O(nf(n)+n^{1.5})$

设 $x$ 对区间 $[l,r]$ 的贡献为 $f(x,[l,r])$

我们考虑区间端点变化对答案的影响：以$[l..r]$ 变成 $[l..(r+k)]$为例

增量是 $sum_{x=r+1}^{r+k} f(x,[l,x-1])=sum_{x=r+1}^{r+k} f(x,[1,x-1])-f(x,[1,l-1]) $ （这一步要求询问有可加性，或者经过处理后有可加性）

前半部分自然可以O(1)处理，后面部分本质上是O(N^{1.5})次的对$f(x,[1,y])$的询问，只要能对给定x进行一些预处理后可以$O(1)$返回$f(x,[1,y])$，就得到一个时间开销是$O(N^{1.5}+Nf(N))$，空间开销$O(N^{1.5})$的优秀做法

但空间开销$O(N^{1.5})$在有些时候也是不可接受的，我们试图进行空间优化。我们发现每次询问时$y$形成区间，每次存储用区间存就能降到$O(N)$了

我们来看2道例题：

洛谷P4887

处理前缀询问的时候，有异或运算的交换律，即$a xor b=c Leftrightarrow a xor c=b$

//#pragma GCC optimize(3,"Ofast","inline")
//#pragma GCC optimize("-Ofast","-funroll-all-loops")
#include<bits/stdc++.h>
//#define getchar nc
#define sight(c) ('0'<=c&&c<='9')
#define swap(a,b) a^=b,b^=a,a^=b
#define LL long long
#define debug(a) cout<<#a<<" is "<<a<<"
"
#define dput(a) puts("a")
#define eho(x) for(int i=head[x];i;i=net[i])
#define fi first
#define se second
#define N  100007
inline char nc(){
static char buf[1000000],*p1=buf,*p2=buf;
return p1==p2&&(p2=(p1=buf)+fread(buf,1,1000000,stdin),p1==p2)?EOF:*p1++;
}
template <class T>
inline void read(T &x){// unsigned
    static char c;
    for (c=getchar();!sight(c);c=getchar());
    for (x=0;sight(c);c=getchar())x=x*10+c-48;
}
template <class T> void write(T x){if (x<10) {putchar('0'+x); return;} write(x/10); putchar('0'+x%10);}
template <class T> inline void writeln(T x){ if (x<0) putchar('-'),x*=-1; write(x); putchar('
'); }
template <class T> inline void writel(T x){ if (x<0) putchar('-'),x*=-1; write(x); putchar(' '); }
using namespace std;
struct Node{
    int x,y,id; LL ans; 
}Q[N];
int a[N],blo[N],n,m,k,SIZ;
int A[N],cnt[N],pre[N],now[N];
vector<int> buf;
vector<array<int,3>> V[N];
LL ans[N]; 
signed main() {
    #ifdef LOCAL
    //    freopen("test.in", "r", stdin);
    //    freopen("test.out", "w", stdout);
    #endif
    read(n); read(m); read(k);
    for (int i=1;i<=n;i++) read(a[i]);
    SIZ=sqrt(n);
    for (int i=1;i<=n;i++) blo[i]=(i-1)/SIZ+1;
    for (int i=1;i<=m;i++) {
        read(Q[i].x); read(Q[i].y); 
        Q[i].id=i;
    }
    sort(Q+1,Q+m+1,[&](const Node& A,const Node& B){return blo[A.x]==blo[B.x]?A.y<B.y:A.x<B.x;});
    for (int i=1;i<16384;i++) cnt[i]=cnt[i-(i&-i)]+1;
    for (int i=0;i<16384;i++) if (cnt[i]==k) buf.push_back(i);
    for (int i=1;i<=n;i++) {
        pre[i]=A[a[i]];  //F(x,1,x-1) 
        for (auto x:buf) A[x^a[i]]++;
        now[i]=A[a[i]];
    }
    for (int i=1,L=1,R=0;i<=m;i++) {
        int l=Q[i].x,r=Q[i].y;
        if (L>l) V[R].push_back({l,L-1,i});
        while (L>l) --L,Q[i].ans-=now[L];//need -f(x,x,x)
        if (R<r) V[L-1].push_back({R+1,r,-i});
        while (R<r) ++R,Q[i].ans+=pre[R];
        if (L<l) V[R].push_back({L,l-1,-i});//need +f(x,x,x)
        while (L<l) Q[i].ans+=now[L],L++;
        if (R>r) V[L-1].push_back({r+1,R,i});
        while (R>r) Q[i].ans-=pre[R],R--; 
    }
    memset(A,0,sizeof A);
    for (int i=1,id,l,r,op;i<=n;i++) {
        for(auto x:buf) ++A[a[i]^x];
        for (auto x:V[i]) {
            l=x[0]; r=x[1]; op=x[2]<0?-1:1; id=x[2]*op;
            for (int j=l;j<=r;j++) {
                Q[id].ans+=op*A[a[j]];
            }
        }
    } 
    for (int i=1;i<=m;i++) Q[i].ans+=Q[i-1].ans,ans[Q[i].id]=Q[i].ans;
    for (int i=1;i<=m;i++) writeln(ans[i]);
    return 0;
}

洛谷P5501

我们考虑$a[x]$对$[L,R]$ 的贡献$f（x,[L,R]）$，有$f(x,[L,R])=cnt_{x,[L,R]}+a[x]*（1+num_{x,[L,R]}）$，$cnt_{x,[L,R]}$表示在$[L,R]$中比$a[x]$大的数和，$num_{x,[L,R]}$表示比$a[x]$小的数的和，我们发现$f(x,[L,R])$不满足可加性，令$g(x,[L,R])=cnt_{x,[L,R]}+a[x]*num_{x,[L,R]}$就满足可加性了。

有几个要注意的点，维护前缀和不能用树状数组，树状数组不能$O(1)$查询，而是要用分块。这题卡常数，结构体中调用函数比较慢，在最后统计后半部分的贡献时直接调用Tree的que会导致常数大到T飞，要全部写开来，还有就是要用莫队的奇偶优化，能快一半。

//#pragma GCC optimize(3,"Ofast","inline")
#pragma GCC optimize("-Ofast","-funroll-all-loops")
#include<bits/stdc++.h>
#define getchar nc
#define sight(c) ('0'<=c&&c<='9')
#define swap(a,b) a^=b,b^=a,a^=b
#define LL long long
#define debug(a) cout<<#a<<" is "<<a<<"
"
#define dput(a) puts("a")
#define eho(x) for(int i=head[x];i;i=net[i])
#define fi first
#define se second
#define N 500007
#define M 100007
inline char nc(){
static char buf[1000000],*p1=buf,*p2=buf;
return p1==p2&&(p2=(p1=buf)+fread(buf,1,1000000,stdin),p1==p2)?EOF:*p1++;
}
template <class T>
inline void read(T &x){// unsigned
    static char c;
    for (c=getchar();!sight(c);c=getchar());
    for (x=0;sight(c);c=getchar())x=x*10+c-48;
}
template <class T> void write(T x){if (x<10) {putchar('0'+x); return;} write(x/10); putchar('0'+x%10);}
template <class T> inline void writeln(T x){ if (x<0) putchar('-'),x*=-1; write(x); putchar('
'); }
template <class T> inline void writel(T x){ if (x<0) putchar('-'),x*=-1; write(x); putchar(' '); }
using namespace std;
struct Node{
    int x,y,id; LL ans; 
}Q[N];
int a[N],blo[N],n,m,k,SIZ;
int A[N],cnt[N];
vector<int> buf;
vector<array<int,3>> V[N];
LL ans[N],pre[N],now[N],sum[N]; 
struct Tree{
    LL q[M],g[M];
    int BLO;
    #define L(x) (x&-x) 
    LL que(LL x){
        return g[x/BLO]+q[x];
    } 
    void add(LL x,LL dla) {
        int s1=x/BLO,s2=(M-1)/BLO;
        for (int i=x;i<s1*BLO+BLO&&i<M;i++) q[i]+=dla;
        for (int i=s1+1;i<=s2;i++) g[i]+=dla;
    }
    void clear() {
        memset(q,0,sizeof q);
        memset(g,0,sizeof g);
        BLO=sqrt(M);
//        for (int i=0;i<M;i++) id[i]=i/BLO;
    }
}Q1,Q2;
signed main() {
//    #ifdef LOCAL
    freopen("test.in", "r", stdin);
    freopen("test.out", "w", stdout);
//    #endif
    read(n); read(m);
    Q1.clear(); Q2.clear();
    for (int i=1;i<=n;i++) read(a[i]),sum[i]=sum[i-1]+a[i];
    SIZ=sqrt(n);
    for (int i=1;i<=n;i++) blo[i]=(i-1)/SIZ+1;
    for (int i=1;i<=m;i++) {
        read(Q[i].x); read(Q[i].y); 
        Q[i].id=i;
    }
    sort(Q+1,Q+m+1,[&](const Node& A,const Node& B){return blo[A.x]==blo[B.x]?A.y<B.y:A.x<B.x;});for (int i=1;i<=n;i++) {
        pre[i]=1ll*a[i]*(Q1.que(a[i]-1))+Q2.que(M-1)-Q2.que(a[i]);
        Q1.add(a[i],1); Q2.add(a[i],a[i]);
        now[i]=1ll*a[i]*(Q1.que(a[i]-1))+Q2.que(M-1)-Q2.que(a[i]);
    } for (int i=1,L=1,R=0;i<=m;i++) {
        int l=Q[i].x,r=Q[i].y;
        if (L>l) V[R].push_back({l,L-1,i});
        while (L>l) --L,Q[i].ans-=now[L];//need -f(x,x,x)
        if (R<r) V[L-1].push_back({R+1,r,-i});
        while (R<r) ++R,Q[i].ans+=pre[R];
        if (L<l) V[R].push_back({L,l-1,-i});//need +f(x,x,x)
        while (L<l) Q[i].ans+=now[L],L++;
        if (R>r) V[L-1].push_back({r+1,R,i});
        while (R>r) Q[i].ans-=pre[R],R--; 
    }
    Q1.clear(); Q2.clear();
    int BLO=Q1.BLO;
    for (int i=1,id,l,r,op;i<=n;i++) {
        Q1.add(a[i],1); Q2.add(a[i],a[i]);
        for (auto x:V[i]) {
            l=x[0]; r=x[1]; op=x[2]<0?-1:1; id=x[2]*op;
            for (int j=l;j<=r;j++) {
                Q[id].ans+=1ll*op*(1ll*a[j]*(Q1.q[a[j]-1]+Q1.g[(a[j]-1)/BLO])+(Q2.q[M-1]+Q2.g[(M-1)/BLO])-(Q2.q[a[j]]+Q2.g[a[j]/BLO]));
            }
        }
    } 
    for (int i=1;i<=m;i++) Q[i].ans+=Q[i-1].ans,ans[Q[i].id]=Q[i].ans+sum[Q[i].y]-sum[Q[i].x-1];
    for (int i=1;i<=m;i++) writeln(ans[i]);
    return 0;
} 
/*
4 1
1 2 2 3
2 3
*/
/*
4 1
1 2 2 3
2 3
*/