个人感觉 \(O(n\sqrt{\log n})\) 求逆序对个数是 non-practical 的。
只实现了值域为 \([0,2^{\sqrt{w}})\) 的子问题，这个算法跑得比树状数组还慢。
#include<bits/stdc++.h>
// rp(i,a,b): inclusive counting loop; declares `int i` and runs it from a up to b.
// Both bounds are parenthesized so that an expression argument (for example a
// ternary) keeps its meaning after substitution. b is re-evaluated every iteration.
#define rp(i,a,b) for(int i=(a);i<=(b);++i)
using namespace std;
using ui = unsigned int;
using ul = unsigned long long;
// State of the pseudo-random generator below; every nxt() call overwrites it.
ul seed=10245;
// Advances `seed` through five xor-shift steps (<<17, >>5, <<13, >>8, <<25)
// and returns the new state as the next 64-bit pseudo-random value.
inline ul nxt(){
    ul s=seed;
    s^=s<<17;
    s^=s>>5;
    s^=s<<13;
    s^=s>>8;
    s^=s<<25;
    seed=s;
    return s;
}
// Number of generated input values (the literal 1e8 is converted to int).
const int N=1e8;
// a[1..N]: input values; main() fills them with numbers in 0..255.
// b: Fenwick-tree node counters used by add() and sum(); add() only updates
// indices below 256.
int a[N+9],b[257];
// Fenwick-tree point update: records one occurrence at 1-based position x by
// incrementing every node on the path x, x+lowbit(x), ... while the index stays
// below 256 (positions >= 256 are silently dropped).
// x must be positive: lowbit(0) is 0, so the index would never advance.
void add(int x){
    int node=x;
    while(node<256){
        ++b[node];
        node+=node&-node;
    }
}
// Fenwick-tree prefix query: adds up the node counters on the path
// x, x-lowbit(x), ... down to 0, i.e. the number of occurrences recorded by
// add() at positions 1..x. sum(0) is 0.
int sum(int x){
    int total=0;
    while(x!=0){
        total+=b[x];
        x^=x&-x;   // strip the lowest set bit
    }
    return total;
}
// Benchmark driver for the Fenwick-tree version.
// Fills a[1..N] with pseudo-random values in 0..255, then for every element adds
// the number of earlier elements holding a strictly smaller value (sum(a[i])
// covers tree positions 1..a[i], i.e. values 0..a[i]-1). Prints that total and
// the processor time spent in the counting loop.
int main(){
    rp(i,1,N){
        a[i]=nxt()&255;
    }
    // Keep the start tick as clock_t; narrowing it to int can truncate it.
    const clock_t st=clock();
    ul ans=0;
    rp(i,1,N){
        ans+=sum(a[i]);
        add(a[i]+1);   // value v is stored at tree position v+1 (positions are 1-based)
    }
    printf("%llu\n",ans);
    // clock() counts in units of 1/CLOCKS_PER_SEC seconds, so convert to
    // milliseconds explicitly and hand printf an int to match "%d".
    const int elapsed_ms=static_cast<int>(double(clock()-st)*1000.0/CLOCKS_PER_SEC);
    printf("time used: %dms\n",elapsed_ms);
    return 0;
}
/*
time used: 968ms
*/
#include<bits/stdc++.h>
// rp(i,a,b): inclusive counting loop; declares `int i` and runs it from a up to b.
// Both bounds are parenthesized so that an expression argument (for example a
// ternary) keeps its meaning after substitution. b is re-evaluated every iteration.
#define rp(i,a,b) for(int i=(a);i<=(b);++i)
using namespace std;
using ui = unsigned int;
using ul = unsigned long long;
// State of the pseudo-random generator below; every nxt() call overwrites it.
ul seed=10245;
// Advances `seed` through five xor-shift steps (<<17, >>5, <<13, >>8, <<25)
// and returns the new state as the next 64-bit pseudo-random value.
ul nxt(){
    ul s=seed;
    s^=s<<17;
    s^=s>>5;
    s^=s<<13;
    s^=s>>8;
    s^=s<<25;
    seed=s;
    return s;
}
// L: number of bits per value; solve() also packs L values into each 64-bit word.
// A: mask selecting the low L bits (255).
const int L=8,A=(1<<L)-1;
// F[m][0]: number of set bits in byte m.
// F[m][1]: number of position pairs k<j in byte m with bit k clear and bit j set.
int F[A+2][2];
// E: the even bit positions 0,2,4,6 of the low byte.
const ul E=1|1<<2|1<<4|1<<6;
// K: bit 0 of each of the 8 bytes; multiplying a byte value by K copies that
// byte into every byte of the word.
const ul K=
1ull|1ull<<L|1ull<<2*L|1ull<<3*L|
1ull<<4*L|1ull<<5*L|1ull<<6*L|1ull<<7*L;
// Lookup tables, all filled by init() and indexed by a key byte (Pre by a count):
// S[m][0..4]: column masks used by sh_1 for "stay", ">>1", ">>2", "<<1", "<<2".
// T[m][0]:    packed shift amounts for sh_2; T[m][1..3]: column masks for sh_2.
// Sp[m]:      bit (L-1-j) of m placed at bit 0 of byte j.
// Pre[i]:     the low i bits of every byte set.
ul S[A+2][5],T[A+2][4],Sp[A+2],Pre[L+1];
// First shuffle stage. Bit position k of every byte of x is treated as one
// "column"; the low byte holds the key bit of each column.
// For each column pair (2t, 2t+1) whose key bits are (1,0), the two columns are
// exchanged in all eight bytes; every other pair is returned unchanged.
ul sh_0(ul x){
    // Even positions of the key byte where the even key bit is 1 and its odd
    // neighbour is 0 - exactly the pairs that have to be exchanged.
    const ul swapKey=x&(~x>>1)&E;
    // Copy that selection into every byte.
    const ul sel=swapKey*K;
    const ul evenBits=sel&x;              // selected even-column bits
    const ul oddBits=((sel<<1)&x)>>1;     // selected odd-column bits, moved onto the even slot
    const ul differ=evenBits^oddBits;     // set where the two bits of a selected pair differ
    // Multiplying by 3 marks both slots of a differing pair; xor flips them,
    // which exchanges the two bits.
    return x^(3*differ);
}
// Second shuffle stage: applies sh_0, then moves columns by at most two positions
// according to the S masks selected by the resulting key byte. With the masks
// built by init(), this merges the two sorted column pairs of each nibble so that
// 0-key columns end up below 1-key columns.
ul sh_1(ul x){
    const ul w=sh_0(x);
    const ul*mask=S[w&255];
    const ul stay =w&mask[0];
    const ul down1=(w&mask[1])>>1;
    const ul down2=(w&mask[2])>>2;
    const ul up1  =(w&mask[3])<<1;
    const ul up2  =(w&mask[4])<<2;
    return stay|up2|down2|up1|down1;
}
// Final shuffle stage: applies sh_1, then merges the two nibbles using table T.
// Columns selected by T[key][1] (1-key columns of the low nibble) move up by the
// number of 0-key columns in the high nibble; columns selected by T[key][2]
// (0-key columns of the high nibble) move down by the number of 1-key columns in
// the low nibble; columns in T[key][3] stay where they are.
ul sh_2(ul x){
    const ul w=sh_1(x);
    const ul*t=T[w&255];
    const int lowOnes=t[0]&7;            // set key bits in the low nibble
    const int highZeros=4-(t[0]>>3);     // clear key bits in the high nibble
    return (w&t[3])|((w&t[1])<<highZeros)|((w&t[2])>>lowOnes);
}
// Fills the lookup tables F, S, T, Sp and Pre.
// F, S and Sp are built with ++ / |= / *= on top of their zero-initialised
// state, so calling this more than once would corrupt them.
void init(){
// Column masks for one 2-bit column pair, indexed by a count 0..2:
// TR[c] selects the upper c columns of the pair, TL[c] the lower c columns.
int TR[3]={0,2,3};
int TL[3]={0,1,3};
// S: masks for sh_1. The masks are laid out for key bytes produced by sh_0,
// where a pair with one set key bit has it in the upper column.
rp(i,0,A){
ui j=i;
// (v+1)>>1 is the number of set bits of a 2-bit value v.
// x, z: set key bits in pair 0 (bits 0-1) and pair 2 (bits 4-5).
// y, w: clear key bits in pair 1 (bits 2-3) and pair 3 (bits 6-7).
int x=(j&3)+1>>1;
int y=(~j>>2&3)+1>>1;
int z=(j>>4&3)+1>>1;
int w=(~j>>6&3)+1>>1;
// Set-key columns of pair 0 move up by y: slot 3 is "<<1", slot 4 is "<<2".
if(y){
S[i][y+2]|=TR[x];
}
// Same for pair 2, with the mask placed at bits 4-5.
if(w){
S[i][w+2]|=16*TR[z];
}
// Clear-key columns of pair 1 move down by x: slot 1 is ">>1", slot 2 is ">>2".
if(x){
S[i][x]|=4*TL[y];
}
// Same for pair 3, with the mask placed at bits 6-7.
if(z){
S[i][z]|=64*TL[w];
}
// Slot 0: every column that is not moved by one of the four shifts.
S[i][0]=255^S[i][1]^S[i][2]^S[i][3]^S[i][4];
// Copy each byte-wide mask into all eight bytes of the word.
rp(j,0,4){
S[i][j]*=K;
}
}
// T: data for sh_2.
rp(i,0,A){
// Low 3 bits: set bits in the low nibble; remaining bits: set bits in the high nibble.
T[i][0]=
__builtin_popcount(i&15)+
__builtin_popcount(i>>4)*8;
// Set-key columns of the low nibble, in every byte.
T[i][1]=K*(i&15);
// Clear-key columns of the high nibble, in every byte.
T[i][2]=K*(~i&240);
// All remaining columns.
T[i][3]=~0ull^T[i][1]^T[i][2];
}
rp(i,0,A){
// F[i][0]: popcount of i. F[i][1]: pairs k<j with bit j set and bit k clear.
rp(j,0,L-1)if(i>>j&1){
++F[i][0];
rp(k,0,j-1)if(~i>>k&1){
++F[i][1];
}
}
// Sp[i]: spread the bits of i over the bytes, most significant bit first
// (bit L-1-j of i goes to bit 0 of byte j).
rp(j,0,7){
Sp[i]|=ul(i>>(L-1-j)&1)<<(j*L);
}
}
// Pre[i]: the low i bits of every byte (Pre[L] is all ones).
rp(i,0,L){
Pre[i]=K*((1<<i)-1);
}
}
// Number of generated input values (the literal 1e8 is converted to int).
const int N=1e8;
// Capacity, in 64-bit words, of each work buffer below.
// The packed input occupies ceil(N/8) words of b (8 values per word), and the
// top-level pass of solve() appends its 1-key list directly behind it. The former
// bound N/64*10+9 (about 1.25 * N/8) is therefore overrun as soon as more than a
// quarter of the values have their top bit set. With roughly balanced key bits,
// as for the pseudo-random input generated in main(), the lists appended by the
// deeper levels shrink geometrically, so b stays below 2*ceil(N/8) words and c
// below ceil(N/8). The extra 2^20 words absorb partial-word rounding and
// statistical imbalance.
// NOTE(review): heavily skewed inputs can need up to (L+1)*ceil(N/8) words in b;
// this size does not cover that case.
const int M=(N+7)/8*2+(1<<20);
// a[1..N]: input values, one byte each.
unsigned char a[N+9];
// b, c: packed work buffers handed to solve(); ans: running pair count.
ul b[M],c[M],ans;
// Appends c columns to the packed list a.
//   a   - output list; a[n] is the word currently being filled
//   n   - index of that word, advanced when it overflows
//   p   - number of columns already used in a[n] (0..L), updated
//   c   - number of columns to append
//   msk - the columns, held in the low c bits of every byte
// If the columns do not fit, the part that fits is OR-ed into a[n] and the rest
// is stored into a[n+1] by plain assignment, so the new word needs no clearing.
void Add(ul*a,int&n,int&p,int c,ul msk){
    const int room=L-p;
    if(c<=room){
        a[n]|=msk<<p;
        p+=c;
        return;
    }
    const int spill=c-room;                 // columns that go to the next word
    const ul headMask=K*((1<<room)-1);      // low `room` bits of every byte
    const ul tailMask=K*((1<<spill)-1);     // low `spill` bits of every byte
    a[n]|=(msk&headMask)<<p;
    ++n;
    a[n]=(msk>>room)&tailMask;
    p=spill;
}
// One radix level over a packed list.
//   a - input list a[1..n]; every word carries 8 values as bit columns, byte 0
//       holding the current key bit of each value and the higher bytes the rest
//   b - free space receiving the 0-key values as b[1..]
//   n - number of input words
//   l - number of key bits still to process (recursion stops after l==1)
// For each word the global `ans` grows by the number of (earlier, later) pairs
// whose key bits are (0,1): 0-key values seen in previous words times the 1-key
// values of this word, plus the pairs inside the word (F[key][1]).
// The columns are then split by key bit with the key byte dropped; 1-key values
// are written right behind the input, at (a+n)[1..]. Each sub-list is processed
// recursively, using the free space behind the other sub-list as its output.
void solve(ul*a,ul*b,int n,int l){
    ul zerosSeen=0;
    ul*const onesOut=a+n;
    int zeroWord=0,zeroPos=L;
    int oneWord=0,onePos=L;
    for(int i=1;i<=n;++i){
        const ul word=a[i];
        const int key=word&A;
        const int ones=F[key][0];
        const int zeros=L-ones;
        ans+=zerosSeen*ones+F[key][1];
        zerosSeen+=zeros;
        // After sh_2 the 0-key columns occupy the low `zeros` bit positions of
        // every byte and the 1-key columns the positions above them.
        const ul rest=sh_2(word)>>L;
        Add(b,zeroWord,zeroPos,zeros,rest&Pre[zeros]);
        Add(onesOut,oneWord,onePos,ones,(rest>>zeros)&Pre[ones]);
    }
    if(l<=1){
        return;
    }
    solve(b,onesOut+oneWord,zeroWord,l-1);
    solve(onesOut,b+zeroWord,oneWord,l-1);
}
void solve(){
int m=0,p=L;
rp(i,1,N){
if(p==L){
++m,p=0;
}
b[m]|=Sp[a[i]]<<p++;
}
solve(b,c,m,L);
cout<<ans<<endl;
}
// Benchmark driver for the packed version: fills a[1..N] with pseudo-random
// values in 0..A, then times table construction (init) plus the counting pass
// (solve, which also prints the result) and reports the processor time used.
int main(){
    rp(i,1,N){
        a[i]=nxt()&A;
    }
    // Keep the start tick as clock_t; narrowing it to int can truncate it.
    const clock_t st=clock();
    init();
    solve();
    // clock() counts in units of 1/CLOCKS_PER_SEC seconds, so convert to
    // milliseconds explicitly and hand printf an int to match "%d".
    const int elapsed_ms=static_cast<int>(double(clock()-st)*1000.0/CLOCKS_PER_SEC);
    printf("time used: %dms\n",elapsed_ms);
    return 0;
}
/*
time used: 1814ms
*/