汇编怪谈——优化整数除法导致的灾难
某日,某万恶的同志交给我一个问题,害得我今日trap in deep loop了3小时,囧- -
问题曰:
在PKU的某题中,我需要同时计算出两个整数相除运算的商和除数,为了提高效率,避免编译器除两次,因为两个结果是同时计算出来的,所以只要后一次使用mov就ok,但是asm版本的程序出现了比较郁闷的事情……未"优化"的程序AC,"优化"的程序TLE,what's wrong
(众所周知,x86的div是tens of clock cycles,而mov只是5~6 cycles,这样做理论上确实可以提高效率,不过,那是理论)
好,开始分析,为了使程序尽可能保持一致,本人对程序稍加改动,使其成为了pure C的版本,另一个版本就是所谓优化的asm版
--------------------------------
//porg_purec.c
#include <stdio.h>
//Define the problem scale
#define MAXN 5000004
#define MAXP 350000
#define MAXR 2236
char mk[MAXN]; //Prime bits
int prime[MAXP], pnum; //Primes
int c[32]; //Array for generating answer
void init()
{
int i, j;
for(i=0;i<32;i++)
c[i]= (i+2)*(i+2)*(i+1)*(i+1)>>2; //Sum of i^3
for(i=2;i<MAXN;i++)if(!mk[i]){
prime[pnum++]=i;
if(i<MAXR)
for(j=i*i;j<MAXN;j+=i)mk[j]=1;
}
}
int main()
{
int i, j, ncase, n, res;
init();
scanf("%d", &ncase);
while(ncase--){
scanf("%d", &n);
res=1;
for(i=0;i<pnum;i++)
if(!mk[n]){
if(n>1)res *=c[1];
break;
}
else{
j=0;
while(!(n%prime[i])){
n/=prime[i];
j++;
}
if(j>0)res *=c[j];
}
printf("%d\n", res);
}
return 0;
}
--------------------------------
//prog_asm.c
#include <stdio.h>
//Define the problem scale
#define MAXN 5000004
#define MAXP 350000
#define MAXR 2236
char mk[MAXN]; //Prime bits
int prime[MAXP], pnum; //Primes
int c[32]; //Array for generating answer
void init()
{
int i, j;
for(i=0;i<32;i++)
c[i]= (i+2)*(i+2)*(i+1)*(i+1)>>2;
for(i=2;i<MAXN;i++)if(!mk[i]){
prime[pnum++]=i;
if(i<MAXR)
for(j=i*i;j<MAXN;j+=i)mk[j]=1;
}
}
int main()
{
int i, j, ncase, n, res;
init();
scanf("%d", &ncase);
while(ncase--){
scanf("%d", &n);
res=1;
for(i=0;i<pnum;i++)
if(!mk[n]){
if(n>1)res *=c[1];
break;
}
else{
j=0;
while(!(n%prime[i])){
asm("movl %eax, -24(%ebp)");
j++;
}
if(j>0)res *=c[j];
}
printf("%d\n", res);
}
return 0;
}
--------------------------------
.file "prog_purec.c"
.text
.globl init
.type init, @function
init:
pushl %ebp
movl %esp, %ebp
subl $16, %esp
movl $0, -8(%ebp)
jmp .L2
.L3:
movl -8(%ebp), %ecx
movl -8(%ebp), %edx
addl $2, %edx
movl -8(%ebp), %eax
addl $2, %eax
imull %eax, %edx
movl -8(%ebp), %eax
incl %eax
imull %eax, %edx
movl -8(%ebp), %eax
incl %eax
imull %edx, %eax
sarl $2, %eax
movl %eax, c(,%ecx,4)
incl -8(%ebp)
.L2:
cmpl $31, -8(%ebp)
jle .L3
movl $2, -8(%ebp)
jmp .L5
.L6:
movl -8(%ebp), %eax
movzbl mk(%eax), %eax
testb %al, %al
jne .L7
movl pnum, %eax
movl -8(%ebp), %edx
movl %edx, prime(,%eax,4)
incl %eax
movl %eax, pnum
cmpl $2235, -8(%ebp)
jg .L7
movl -8(%ebp), %eax
imull -8(%ebp), %eax
movl %eax, -4(%ebp)
jmp .L10
.L11:
movl -4(%ebp), %eax
movb $1, mk(%eax)
movl -8(%ebp), %eax
addl %eax, -4(%ebp)
.L10:
cmpl $5000003, -4(%ebp)
jle .L11
.L7:
incl -8(%ebp)
.L5:
cmpl $5000003, -8(%ebp)
jle .L6
leave
ret
.size init, .-init
.section .rodata
.LC0:
.string "%d"
.LC1:
.string "%d\n"
.text
.globl main
.type main, @function
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
pushl %ecx
subl $52, %esp
call init
leal -20(%ebp), %eax
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call scanf
jmp .L15
.L16:
leal -24(%ebp), %eax
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call scanf
movl $1, -8(%ebp)
movl $0, -16(%ebp)
jmp .L17
.L18:
movl -24(%ebp), %eax
movzbl mk(%eax), %eax
testb %al, %al
jne .L19
movl -24(%ebp), %eax
cmpl $1, %eax
jle .L23
movl c+4, %eax
movl -8(%ebp), %edx
imull %edx, %eax
movl %eax, -8(%ebp)
jmp .L23
.L19:
movl $0, -12(%ebp)
jmp .L24
.L25:
movl -24(%ebp), %edx
movl -16(%ebp), %eax
movl prime(,%eax,4), %eax
movl %eax, -40(%ebp)
movl %edx, %eax
cltd
idivl -40(%ebp)
movl %eax, -40(%ebp)
movl -40(%ebp), %eax
movl %eax, -24(%ebp)
incl -12(%ebp)
.L24:
movl -24(%ebp), %edx
movl -16(%ebp), %eax
movl prime(,%eax,4), %eax
movl %eax, -40(%ebp)
movl %edx, %eax
cltd
idivl -40(%ebp)
movl %edx, %eax
testl %eax, %eax
je .L25
cmpl $0, -12(%ebp)
jle .L27
movl -12(%ebp), %eax
movl c(,%eax,4), %edx
movl -8(%ebp), %eax
imull %edx, %eax
movl %eax, -8(%ebp)
.L27:
incl -16(%ebp)
.L17:
movl pnum, %eax
cmpl %eax, -16(%ebp)
jl .L18
.L23:
movl -8(%ebp), %eax
movl %eax, 4(%esp)
movl $.LC1, (%esp)
call printf
.L15:
movl -20(%ebp), %eax
decl %eax
movl %eax, -20(%ebp)
movl -20(%ebp), %eax
cmpl $-1, %eax
jne .L16
movl $0, %eax
addl $52, %esp
popl %ecx
popl %ebp
leal -4(%ecx), %esp
ret
.size main, .-main
.comm mk,5000004,32
.comm prime,1400000,32
.comm pnum,4,4
.comm c,128,32
.ident "GCC: (GNU) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)"
.section .note.GNU-stack,"",@progbits
--------------------------------
.file "prog_asm.c"
.text
.globl init
.type init, @function
init:
pushl %ebp
movl %esp, %ebp
subl $16, %esp
movl $0, -8(%ebp)
jmp .L2
.L3:
movl -8(%ebp), %ecx
movl -8(%ebp), %edx
addl $2, %edx
movl -8(%ebp), %eax
addl $2, %eax
imull %eax, %edx
movl -8(%ebp), %eax
incl %eax
imull %eax, %edx
movl -8(%ebp), %eax
incl %eax
imull %edx, %eax
sarl $2, %eax
movl %eax, c(,%ecx,4)
incl -8(%ebp)
.L2:
cmpl $31, -8(%ebp)
jle .L3
movl $2, -8(%ebp)
jmp .L5
.L6:
movl -8(%ebp), %eax
movzbl mk(%eax), %eax
testb %al, %al
jne .L7
movl pnum, %eax
movl -8(%ebp), %edx
movl %edx, prime(,%eax,4)
incl %eax
movl %eax, pnum
cmpl $2235, -8(%ebp)
jg .L7
movl -8(%ebp), %eax
imull -8(%ebp), %eax
movl %eax, -4(%ebp)
jmp .L10
.L11:
movl -4(%ebp), %eax
movb $1, mk(%eax)
movl -8(%ebp), %eax
addl %eax, -4(%ebp)
.L10:
cmpl $5000003, -4(%ebp)
jle .L11
.L7:
incl -8(%ebp)
.L5:
cmpl $5000003, -8(%ebp)
jle .L6
leave
ret
.size init, .-init
.section .rodata
.LC0:
.string "%d"
.LC1:
.string "%d\n"
.text
.globl main
.type main, @function
main:
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
pushl %ecx
subl $52, %esp
call init
leal -20(%ebp), %eax
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call scanf
jmp .L15
.L16:
leal -24(%ebp), %eax
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call scanf
movl $1, -8(%ebp)
movl $0, -16(%ebp)
jmp .L17
.L18:
movl -24(%ebp), %eax
movzbl mk(%eax), %eax
testb %al, %al
jne .L19
movl -24(%ebp), %eax
cmpl $1, %eax
jle .L23
movl c+4, %eax
movl -8(%ebp), %edx
imull %edx, %eax
movl %eax, -8(%ebp)
jmp .L23
.L19:
movl $0, -12(%ebp)
jmp .L24
.L25:
#APP
movl %eax, -24(%ebp)
#NO_APP
incl -12(%ebp)
.L24:
movl -24(%ebp), %edx
movl -16(%ebp), %eax
movl prime(,%eax,4), %eax
movl %eax, -40(%ebp)
movl %edx, %eax
cltd
idivl -40(%ebp)
movl %edx, %eax
testl %eax, %eax
je .L25
cmpl $0, -12(%ebp)
jle .L27
movl -12(%ebp), %eax
movl c(,%eax,4), %edx
movl -8(%ebp), %eax
imull %edx, %eax
movl %eax, -8(%ebp)
.L27:
incl -16(%ebp)
.L17:
movl pnum, %eax
cmpl %eax, -16(%ebp)
jl .L18
.L23:
movl -8(%ebp), %eax
movl %eax, 4(%esp)
movl $.LC1, (%esp)
call printf
.L15:
movl -20(%ebp), %eax
decl %eax
movl %eax, -20(%ebp)
movl -20(%ebp), %eax
cmpl $-1, %eax
jne .L16
movl $0, %eax
addl $52, %esp
popl %ecx
popl %ebp
leal -4(%ecx), %esp
ret
.size main, .-main
.comm mk,5000004,32
.comm prime,1400000,32
.comm pnum,4,4
.comm c,128,32
.ident "GCC: (GNU) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)"
.section .note.GNU-stack,"",@progbits
--------------------------------
请注意看本人标出的红色部分,此即优化前后的差异。乍一看,此处将10余行代码优化为2行,而且还略去了一次整数除法,貌似是很大的进步……
那么,为什么出问题了呢?
请注意看prog_asm.c汇编结果中的蓝色字体部分,这是那个关键的mov语句前的动作。
同志们注意到了么……
对了,编译器为了测试余数是否为0,做了操作"movl %edx, %eax"……
Oh no,原来以为提高效率的部分只是移得了余数,不是商,囧……
那么,如何正确处理这个优化呢……
经过缜密的思考,只要避免上述该死的mov动作,就可以使商被正确保留,由于本人水平有限,就干脆汇编重写了while循坏,省得编译器不理解我的意思^^
--------------------------------
#include <stdio.h>
//Define the problem scale
#define MAXN 5000004
#define MAXP 350000
#define MAXR 2236
char mk[MAXN]; //Prime bits
int prime[MAXP], pnum; //Primes
int c[32]; //Array for generating answer
void init()
{
int i, j;
for(i=0;i<32;i++)
c[i]= (i+2)*(i+2)*(i+1)*(i+1)>>2;
for(i=2;i<MAXN;i++)if(!mk[i]){
prime[pnum++]=i;
if(i<MAXR)
for(j=i*i;j<MAXN;j+=i)mk[j]=1;
}
}
int main()
{
int i, j, ncase, n, res;
init();
scanf("%d", &ncase);
while(ncase--){
scanf("%d", &n);
res=1;
for(i=0;i<pnum;i++)
if(!mk[n]){
if(n>1)res *=c[1];
break;
}
else{
j=0;
asm(
"jmp JDG\n"
"WL:\n"
"movl %eax, -24(%ebp)\n"
"incl -12(%ebp)\n"
"JDG:\n"
"movl -24(%ebp), %edx\n"
"movl -16(%ebp), %eax\n"
"movl prime(,%eax,4), %eax\n"
"movl %eax, -40(%ebp)\n"
"movl %edx, %eax\n"
"cltd\n"
"idivl -40(%ebp)\n"
"testl %edx, %edx\n"
"je WL\n"
);
if(j>0)res *=c[j];
}
printf("%d\n", res);
}
return 0;
}
--------------------------------
很兴奋地写完,在gcc下编译通过,尝试提交……
囧……竟然CE……
仔细查看了PKU OJ的FAQ,原来它用"mingw-gcc"代替"gcc",囧…… :(
算了……我不熟悉gcc以外的编译器怎么处理内联汇编,不知道怎么去改,这个留给某始作俑者好了^^
--------------------------------
教训:
1)对HLL编写的程序进行汇编优化有个前提:你要熟悉编译器,否则它做的事情你无法预料;
2)对HLL编写的程序进行汇编优化所提升的性能不会超过使用时间/空间复杂度更好的算法,故主要精力还是应该放在算法上,而不是所谓优化;
3)另外,虽然这次的事情不能表明,但是编译器的优化作用不容小觑,有时真的没必要去做比编译器优化工作做得差的人工优化;
4)mingw != gcc,这点必须批判,我有点怒了
--------------------------------
后记:
最后花了点时间,还是搞定了mingw,代码如下:
#include <stdio.h>
//Define the problem scale
#define MAXN 5000004
#define MAXP 350000
#define MAXR 2236
char mk[MAXN]; //Prime bits
int prime[MAXP], pnum; //Primes
int c[32]; //Array for generating answer
void init()
{
int i, j;
for(i=0;i<32;i++)
c[i]= (i+2)*(i+2)*(i+1)*(i+1)>>2;
for(i=2;i<MAXN;i++)if(!mk[i]){
prime[pnum++]=i;
if(i<MAXR)
for(j=i*i;j<MAXN;j+=i)mk[j]=1;
}
}
int main()
{
int i, j, ncase, n, res;
init();
scanf("%d", &ncase);
while(ncase--){
scanf("%d", &n);
res=1;
for(i=0;i<pnum;i++)
if(!mk[n]){
if(n>1)res *=c[1];
break;
}
else{
j=0;
__asm__(
"UL1:\n"
"movl -4(%ebp), %edx\n"
"movl %edx, -28(%ebp)\n"
"movl -16(%ebp), %eax\n"
"movl -28(%ebp), %ecx\n"
"cltd\n"
"idivl _prime(,%ecx,4)\n"
"movl %edx, -28(%ebp)\n"
"cmpl $0, -28(%ebp)\n"
"jne UL2\n"
"movl %eax, -16(%ebp)\n"
"leal -8(%ebp), %eax\n"
"incl (%eax)\n"
"jmp UL1\n"
"UL2:"
);
if(j>0)res *=c[j];
}
printf("%d\n", res);
}
return 0;
}
--------------------------------
提交,TLE……
无奈之下,尝试使用intel格式的汇编,硬着头皮写完后未经编译(我懒得装VC了)提交,AC……
囧大了……
提升成绩 985ms -> 955 ms
差距不大……
之前所说的几个教训貌似应验了,纠结的问题已搞定,退出这个东西,唉,总计花了约5小时工作时间调这些,不值啊:(
附上AC的代码……
--------------------------------
#Source Code
Problem: 3604 User: 8606
Memory: 6472K Time: 954MS
Language: C++ Result: Accepted
include <stdio.h>
//Define the problem scale
#define MAXN 5000004
#define MAXP 350000
#define MAXR 2236
char mk[MAXN]; //Prime bits
int prime[MAXP], pnum; //Primes
int c[32]; //Array for generating answer
void init()
{
int i, j;
for (i=0;i<32;i++)
c[i]= (i+2)*(i+2)*(i+1)*(i+1)>>2;
for (i=2;i<MAXN;i++)if (!mk[i]) {
prime[pnum++]=i;
if (i<MAXR)
for (j=i*i;j<MAXN;j+=i)mk[j]=1;
}
}
int main()
{
int i, j, ncase, n, res;
init();
scanf("%d", &ncase);
while (ncase--) {
scanf("%d", &n);
res=1;
for (i=0;i<pnum;i++)
if (!mk[n]) {
if (n>1)res *= c[1];
break;
} else {
j=0;
__asm {
UL1:
mov ecx, i
mov eax, n
cdq
idiv prime[ecx*4]
test edx, edx
jne UL2
mov n, eax
inc j
jmp UL1
UL2:
}
if (j>0)res *= c[j];
}
printf("%d\n", res);
}
return 0;
}
--------------------------------
重申一遍吧,教训……
1)对HLL编写的程序进行汇编优化有个前提:你要熟悉编译器,否则它做的事情你无法预料;
2)对HLL编写的程序进行汇编优化所提升的性能不会超过使用时间/空间复杂度更好的算法,故主要精力还是应该放在算法上,而不是所谓优化;
3)另外,虽然这次的事情不能表明,但是编译器的优化作用不容小觑,有时真的没必要去做比编译器优化工作做得差的人工优化;
4)mingw != gcc,这点必须批判,它甚至不如VC……