浅谈卡常

I/O优化

默认情况下（未关闭流同步时），scanf 比 cin 快，getchar 比 scanf 快，fread 比 getchar 快。

其对应的输出方式也一样。

因此有了基于后两者的优化：

getchar/putchar 版：

// Reads one decimal integer from stdin via getchar() into x.
//
// Every character before the first digit is skipped. The number is negative
// only if the character immediately before its first digit is '-'.
// If end of input is reached before any digit is seen, x is left at 0 and the
// function returns instead of spinning forever.
template<typename Int>
void read(Int &x){
    x=0;
    // getchar() returns int so that EOF stays distinguishable from every byte.
    int c=getchar();
    bool neg=false;
    while(c!=EOF&&(c<'0'||c>'9')){
        neg=(c=='-');
        c=getchar();
    }
    // Accumulate directly with the final sign, so the most negative value of
    // Int can be parsed without overflowing on the way.
    for(;c>='0'&&c<='9';c=getchar()){
        const int d=c-'0';
        x=neg?x*10-d:x*10+d;
    }
}
// Writes the decimal representation of x to stdout via putchar().
//
// x is taken by value, so the caller's variable is left untouched and
// temporaries/literals can be passed as well.
template<typename Int>
void write(Int x){
    char tmp[40];int cnt=0;   // 40 digits is enough even for 128-bit integers
    const bool neg=x<0;
    // Peel digits off least-significant first. Taking the absolute value of
    // each remainder (instead of negating x) keeps the most negative value
    // of Int from overflowing.
    do{
        const int d=static_cast<int>(x%10);
        tmp[cnt++]=static_cast<char>('0'+(d<0?-d:d));
        x/=10;
    }while(x);
    if(neg)putchar('-');
    while(cnt--)putchar(tmp[cnt]);
}

fread/fwrite 封装版：

// Minimal buffered reader/writer built on fread/fwrite.
//
// Input is pulled from `inf` in chunks of MAXBF bytes; output is collected in
// a MAXBF-byte buffer and written to `ouf` when the buffer fills up and when
// the object is destroyed. The object owns both buffers and is therefore
// non-copyable.
class fast_iostream {
private:
    static constexpr int MAXBF = 1 << 20;   // size of each buffer in bytes
    FILE *inf, *ouf;
    char *inbuf, *inst, *ined, *oubuf, *oust, *oued;
    static bool _isdigit(int c) { return c >= '0' && c <= '9'; }
    static bool _isspace(int c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
    // Writes everything currently buffered to the output stream.
    inline void _flush() { fwrite(oubuf, 1, oued - oust, ouf); }
    // Returns the next input byte as a non-negative int, or EOF when the
    // stream is exhausted (int keeps EOF distinct from the byte 0xFF).
    inline int _getchar() {
        if(inst == ined) inst = inbuf, ined = inbuf + fread(inbuf, 1, MAXBF, inf);
        return inst == ined ? EOF : static_cast<unsigned char>(*inst++);
    }
    // Appends one byte to the output buffer, flushing first if it is full.
    inline void _putchar(char c) {
        if(oued == oust + MAXBF) _flush(), oued = oubuf;
        *oued++ = c;
    }
public:
    // Initialisers are listed in member declaration order, which is the order
    // in which they actually run.
    fast_iostream(FILE *_inf = stdin, FILE *_ouf = stdout)
    :inf(_inf), ouf(_ouf),
     inbuf(new char[MAXBF]), inst(inbuf), ined(inbuf),
     oubuf(new char[MAXBF]), oust(oubuf), oued(oubuf) {}
    // The buffers are raw owning pointers; copying would free them twice.
    fast_iostream(const fast_iostream&) = delete;
    fast_iostream& operator=(const fast_iostream&) = delete;
    // Flushes pending output, then releases the buffers (allocated with new[]).
    ~fast_iostream() { _flush(); delete[] inbuf; delete[] oubuf; }
    // Reads a decimal integer, skipping everything up to the first digit or
    // '-'. If the input ends before a number starts, n is left unchanged.
    template<typename Int>
    fast_iostream& operator>>(Int& n) {
        int c = _getchar();
        while (c != EOF && c != '-' && !_isdigit(c)) c = _getchar();
        if (c == EOF) return *this;
        const bool neg = (c == '-');
        n = neg ? 0 : c - '0';
        // Accumulate with the final sign so the minimum value does not overflow.
        for (c = _getchar(); _isdigit(c); c = _getchar()) {
            const int d = c - '0';
            n = neg ? n * 10 - d : n * 10 + d;
        }
        return *this;
    }
    // Reads one whitespace-delimited token into s and NUL-terminates it.
    // Space, '\n', '\r' and '\t' count as whitespace. At end of input s
    // becomes the empty string. The caller must provide enough room.
    fast_iostream& operator>>(char *s) {
        int c = _getchar();
        while (_isspace(c)) c = _getchar();
        for (; c != EOF && !_isspace(c); c = _getchar()) *s++ = static_cast<char>(c);
        *s = 0;
        return *this;
    }
    // Writes an integer in decimal.
    template <typename Int>
    fast_iostream& operator<<(Int n) {
        if (n < 0) _putchar('-');
        char S[40]; int t = 0;   // 40 digits covers 128-bit integers
        // Take |n % 10| per digit instead of negating n, which would overflow
        // for the most negative value.
        do {
            const int d = static_cast<int>(n % 10);
            S[t++] = static_cast<char>('0' + (d < 0 ? -d : d));
            n /= 10;
        } while(n);
        while (t--) _putchar(S[t]);
        return *this;
    }
    // Writes a single character.
    fast_iostream& operator<<(char c) { _putchar(c); return *this; }
    // Writes a NUL-terminated string.
    fast_iostream& operator<<(const char *s) {
        for(; *s; ++s) _putchar(*s);
        return *this;
    }
} fio;

在 LOJ#7 Input Test 中，各种输入方式结果如下：

解除流同步和绑定的 cin 比 scanf 快，但是不能和 <cstdio> 库中的输入输出函数混用了。

循环展开

顾名思义，即通过把循环内容展开，尽可能触发 CPU 的并行机制，从而提高运行速度，但是展开次数过多反而会变慢。

注意循环展开时，各路累加不要写入同一个变量，否则相邻语句之间存在数据依赖，无法并行执行。

测试代码：

#include <iostream>
#include <chrono>
// Micro-benchmark for manual loop unrolling.
//
// Reads `count` from stdin, then sums the integers 0..count-1 five times:
// without unrolling and with 2, 4, 8 and 16 independent accumulators. After
// each variant it prints the sum and the elapsed time in seconds.
//
// Each unrolled loop only runs while a full group of K indices fits below
// `count`; a short scalar loop then adds the leftover indices. Without it,
// a `count` that is not a multiple of K would add indices >= count and print
// a wrong sum.
int main(){
    // steady_clock is monotonic, so intervals are immune to wall-clock changes.
    using timer=std::chrono::steady_clock;
    long long sum=0,count;
    if(!(std::cin>>count))return 1;
    auto start=timer::now();
    // The index is long long like `count`, so it cannot overflow before
    // reaching the bound.
    for(long long i=0;i<count;i++)sum+=i;
    auto end=timer::now();
    std::chrono::duration<double>dura=end-start;
    // std::endl is kept on purpose: each result shows up as soon as it is ready.
    std::cout<<sum<<"\nno unrolling: spent "<<dura.count()<<"s"<<std::endl;

    {long long sum1=0,sum2=0,i=0;
    start=timer::now();
    for(;i+2<=count;i+=2){
        sum1+=i;sum2+=i+1;
    }
    for(;i<count;i++)sum1+=i;   // leftover indices
    sum=sum1+sum2;
    end=timer::now();
    dura=end-start;
    std::cout<<sum<<"\nunrolling 2 times: spent "<<dura.count()<<"s"<<std::endl;}

    {long long sum1=0,sum2=0,sum3=0,sum4=0,i=0;
    start=timer::now();
    for(;i+4<=count;i+=4){
        sum1+=i;sum2+=i+1;sum3+=i+2;sum4+=i+3;
    }
    for(;i<count;i++)sum1+=i;   // leftover indices
    sum=sum1+sum2+sum3+sum4;
    end=timer::now();
    dura=end-start;
    std::cout<<sum<<"\nunrolling 4 times: spent "<<dura.count()<<"s"<<std::endl;}

    {long long sum1=0,sum2=0,sum3=0,sum4=0,sum5=0,sum6=0,sum7=0,sum8=0,i=0;
    start=timer::now();
    for(;i+8<=count;i+=8){
        sum1+=i;sum2+=i+1;sum3+=i+2;sum4+=i+3;
        sum5+=i+4;sum6+=i+5;sum7+=i+6;sum8+=i+7;
    }
    for(;i<count;i++)sum1+=i;   // leftover indices
    sum=sum1+sum2+sum3+sum4+sum5+sum6+sum7+sum8;
    end=timer::now();
    dura=end-start;
    std::cout<<sum<<"\nunrolling 8 times: spent "<<dura.count()<<"s"<<std::endl;}

    {long long sum1=0,sum2=0,sum3=0,sum4=0,sum5=0,sum6=0,sum7=0,sum8=0
    ,sum9=0,sum10=0,sum11=0,sum12=0,sum13=0,sum14=0,sum15=0,sum16=0,i=0;
    start=timer::now();
    for(;i+16<=count;i+=16){
        sum1+=i;sum2+=i+1;sum3+=i+2;sum4+=i+3;
        sum5+=i+4;sum6+=i+5;sum7+=i+6;sum8+=i+7;
        sum9+=i+8;sum10+=i+9;sum11+=i+10;sum12+=i+11;
        sum13+=i+12;sum14+=i+13;sum15+=i+14;sum16+=i+15;
    }
    for(;i<count;i++)sum1+=i;   // leftover indices
    sum=sum1+sum2+sum3+sum4+sum5+sum6+sum7+sum8+sum9+sum10+sum11+sum12+sum13+sum14+sum15+sum16;
    end=timer::now();
    dura=end-start;
    std::cout<<sum<<"\nunrolling 16 times: spent "<<dura.count()<<"s"<<std::endl;}

    return 0;
}

环境信息：

操作系统：Windows 10 x64
CPU：Intel(R) Core(TM) i5-1035G1
内存：16 GB
编译器：GCC 8.1.0
编译选项：-m64 -O2 -Wall -Wextra -Wl,--stack=0x10000000 -std=c++14

运行结果：

1000000000
499999999500000000
no unrolling: spent 0.41024s
499999999500000000
unrolling 2 times: spent 0.242889s
499999999500000000
unrolling 4 times: spent 0.169141s
499999999500000000
unrolling 8 times: spent 0.123151s
499999999500000000
unrolling 16 times: spent 0.15792s

~~8 核 CPU，展开 8 次最快，这很合理~~

以上两者结合可以用 \(O(n\log n)\) 的时间复杂度卡过 P3811【模板】乘法逆元。

关于运算

一般情况下，位运算最快，加减法次之，乘法再次，除法和取模最慢。

所以少用取模，比如 exgcd，如果返回的是负数再模，正数不用模；再比如矩阵乘法，统一加完一起模要比边加边模快（要是炸变量值域了当我没说）。

能不取模就不取模，加上 I/O 优化，也可以不用循环展开卡过 P3811【模板】乘法逆元。

~~一道数论模板竟成为卡常测试题，这究竟是人性的扭曲还是道德的沦丧。~~

此外，对于非负整数，n/2 和 n>>1 在底层实现时是完全等效的（对负数而言，n/2 向零取整而 n>>1 向下取整，结果可能不同），n<<1 与 n*2 同理。编译器一般会在运算方面做优化。（上面那个循环展开的测试代码开不开 O2 速度差了将近 10 倍，我觉得可能就是在这方面优化了。）

指令集

不会

其他

注意内存访问是否连续的问题，这个对效率影响很大
register 自 C++11 起弃用，C++17 起移除，没用了。
inline 可能还有点用。
能用 int 别用 long long，占位多，常数大。
三目运算符确实比 if else 快。
那种需要很多节点的可持久化数据结构，不要用指针，容易被卡空间。（我被卡过两次）
数据组数较大时，能不 memset 就不要 memset，memset 时间复杂度为 \(O(size)\)，\(size\) 一定时，相当于一个巨大无比的常数。（\(O(1e5)=O(1)\) ，合理）
如果你想要生成随机数，请使用 mt19937，不仅生成的伪随机数列质量高，在 O2 下也比 rand() 快很多。

posted @ 2023-04-24 20:37 untitled0 阅读(116) 评论(0) 收藏举报

刷新页面返回顶部

untitled0

不如帅拦过河卒

浅谈卡常

I/O优化

循环展开

关于运算

指令集

其他

公告