SwapBuffers的效率问题
最近看了ms实现的opengl 1.1 source,写的非常不错,但也发现了不少问题,而且这个问题在后续版本中并没有改掉(难道ms 为保 d3d 故意的?太恶劣了。。。)
这里就SwapBuffers API的问题简单说一下,希望大家看到他的缺陷,并且避开它。
这是ms opengl 1.1 中的source:
/*
 * Loads the DLL named by szDll and looks up the export szAPI in it.
 *
 * The module handle (NULL when the load failed) is always stored in *phDll,
 * so the caller decides when to release it. Returns the result of
 * GetProcAddress, or NULL when the DLL could not be loaded.
 */
__inline FARPROC GetAPI(char *szDll, char *szAPI, HMODULE *phDll)
{
    HMODULE hModule = LoadLibraryA(szDll);

    *phDll = hModule;
    return hModule ? GetProcAddress(hModule, szAPI) : NULL;
}
/***********************************************************************/
/*
 * Forwards the swap request to the "wglSwapBuffers" export of the OpenGL DLL.
 *
 * Every call goes through GetAPI (load + lookup) and afterwards releases the
 * module handle again. Returns FALSE when the entry point cannot be
 * resolved, otherwise whatever the resolved function returns.
 */
BOOL WINAPI SwapBuffers(HDC hdc)
{
    HMODULE hOpenGL;
    BOOL fSwapped = FALSE;
    PFN5 pfnSwap = (PFN5)GetAPI(szOpenGL, "wglSwapBuffers", &hOpenGL);

    if (pfnSwap != NULL)
        fSwapped = pfnSwap(hdc);

    /* GetAPI always writes the handle, even when the lookup itself failed. */
    if (hOpenGL != NULL)
        FreeLibrary(hOpenGL);

    return fSwapped;
}
可以看到SwapBuffers API的问题,很低效!为了证实opengl 的后续版本也存在这个问题,我在windows xp sp2 下用ida 5.2 打开gdi32.dll 找到了SwapBuffers 的反汇编代码如下:
.text:77F2599E ; *************** S U B R O U T I N E ***************************************
.text:77F2599E
.text:77F2599E ; Attributes: bp-based frame
.text:77F2599E
.text:77F2599E ; BOOL __stdcall SwapBuffers(HDC)
.text:77F2599E ; On every call: asks GetAPI for "wglSwapBuffers" in "OPENGL32", calls the
.text:77F2599E ; returned pointer with the caller's HDC, then passes the module handle to
.text:77F2599E ; FreeLibrary. Returns 0 when GetAPI returned 0.
.text:77F2599E public __stdcall SwapBuffers(x)
.text:77F2599E __stdcall SwapBuffers(x) proc near
.text:77F2599E
.text:77F2599E hLibModule = dword ptr -4 ; local slot whose address is handed to GetAPI
.text:77F2599E arg_0 = dword ptr 8 ; the single stack argument (the HDC)
.text:77F2599E
.text:77F2599E mov edi, edi ; two-byte no-op
.text:77F259A0 push ebp
.text:77F259A1 mov ebp, esp ; establish the bp-based frame
.text:77F259A3 push ecx ; reserves the 4-byte slot at [ebp-4] (hLibModule)
.text:77F259A4 push esi ; esi is used below to hold the return value
.text:77F259A5 lea eax, [ebp+hLibModule]
.text:77F259A8 push eax ; int -- address of hLibModule
.text:77F259A9 push offset s_Wglswapbuffer ; "wglSwapBuffers"
.text:77F259AE push offset s_Opengl32 ; "OPENGL32"
.text:77F259B3 call GetAPI(x,x,x) ; result in eax, tested for zero below
.text:77F259B3
.text:77F259B8 xor esi, esi ; default return value = 0
.text:77F259BA test eax, eax
.text:77F259BC jz short loc_77F259C5 ; nothing resolved: skip the indirect call
.text:77F259BC
.text:77F259BE push [ebp+arg_0] ; pass the HDC through
.text:77F259C1 call eax ; call the pointer GetAPI returned
.text:77F259C3 mov esi, eax ; keep its result as the return value
.text:77F259C3
.text:77F259C5
.text:77F259C5 loc_77F259C5: ; CODE XREF: SwapBuffers(x)+1Ej
.text:77F259C5 cmp [ebp+hLibModule], 0
.text:77F259C9 jz short loc_77F259D4 ; no module handle: nothing to free
.text:77F259C9
.text:77F259CB push [ebp+hLibModule] ; hLibModule
.text:77F259CE call ds:FreeLibrary(x) ; release the handle again on every call
.text:77F259CE
.text:77F259D4
.text:77F259D4 loc_77F259D4: ; CODE XREF: SwapBuffers(x)+2Bj
.text:77F259D4 mov eax, esi ; return value
.text:77F259D6 pop esi
.text:77F259D7 leave
.text:77F259D8 retn 4 ; pops the 4-byte argument on return
.text:77F259D8
.text:77F259D8 __stdcall SwapBuffers(x) endp
可以看到SwapBuffers确实每次调用都在GetAPI,而GetAPI先LoadLibrary,再GetProcAddress;虽然SwapBuffers每次LoadLibrary/FreeLibrary也许只是改变引用计数,但是我想GetProcAddress总是有开销的,即便有Cache机制也会有开销的。追根溯源,我又找到了GetProcAddress的source code,发现GetProcAddress是调用LdrGetProcedureAddress,继续追查,最终落到LdrpGetProcedureAddress:
NTSTATUS
LdrpGetProcedureAddress (
    IN PVOID DllHandle,
    IN PANSI_STRING ProcedureName OPTIONAL,
    IN ULONG ProcedureNumber OPTIONAL,
    OUT PVOID *ProcedureAddress,
    IN BOOLEAN RunInitRoutines
    )
/*++
Routine Description:
    This function locates the address of the specified procedure in the
    specified DLL and returns its address. The lookup goes through the
    DLL's export directory, by name when ProcedureName is supplied and by
    ordinal otherwise, and is performed while holding the loader lock
    (unless LdrpInLdrInit is set).
Arguments:
    DllHandle - Supplies a handle to the DLL that the address is being
        looked up in.
    ProcedureName - Supplies that address of a string that contains the
        name of the procedure to lookup in the DLL. If this argument is
        not specified, then the ProcedureNumber is used.
    ProcedureNumber - Supplies the procedure number to lookup. If
        ProcedureName is specified, then this argument is ignored.
        Otherwise, it specifies the procedure ordinal number to locate
        in the DLL.
    ProcedureAddress - Returns the address of the procedure found in
        the DLL. Written only when the final status is a success code.
    RunInitRoutines - When TRUE, and the last entry of the
        initialization-order module list has not been processed yet,
        LdrpRunInitializeRoutines is called before returning.
Return Value:
    STATUS_INVALID_PARAMETER - neither a name nor a non-zero ordinal was
        supplied, or the temporary name buffer could not be allocated.
    STATUS_DLL_NOT_FOUND - DllHandle does not identify a loaded DLL.
    STATUS_PROCEDURE_NOT_FOUND - the DLL has no export directory.
    Otherwise the status of LdrpSnapThunk, replaced by the status of
    LdrpRunInitializeRoutines (or the exception code it raised) when the
    init routines were run.
--*/
{
    NTSTATUS st;
    UCHAR FunctionNameBuffer[64];
    PUCHAR src, dst;
    ULONG cb, ExportSize;
    PLDR_DATA_TABLE_ENTRY LdrDataTableEntry;
    IMAGE_THUNK_DATA Thunk;
    PIMAGE_IMPORT_BY_NAME FunctionName;
    PIMAGE_EXPORT_DIRECTORY ExportDirectory;
    PLIST_ENTRY Next;

    if (ShowSnaps) {
        DbgPrint("LDR: LdrGetProcedureAddress by ");
    }

    FunctionName = NULL;
    if ( ARGUMENT_PRESENT(ProcedureName) ) {
        if (ShowSnaps) {
            DbgPrint("NAME - %s\n", ProcedureName->Buffer);
        }

        //
        // BUGBUG need STRING to PSZ
        //
        // The name is copied into an IMAGE_IMPORT_BY_NAME record: a USHORT
        // hint, the name bytes, and a terminating NUL, i.e.
        // Length + 1 + sizeof(USHORT) bytes. The stack buffer is used only
        // when the whole record fits; a name of sizeof(buffer)-2 bytes
        // would otherwise put the terminator one byte past its end.
        //
        if (ProcedureName->Length+1+sizeof(USHORT) > sizeof( FunctionNameBuffer ) ) {
            FunctionName = RtlAllocateHeap(RtlProcessHeap(), MAKE_TAG( TEMP_TAG ),ProcedureName->Length+1+sizeof(USHORT));
            if ( !FunctionName ) {
                // NOTE(review): allocation failure is reported as an invalid
                // parameter; kept because callers may rely on this code.
                return STATUS_INVALID_PARAMETER;
            }
        } else {
            FunctionName = (PIMAGE_IMPORT_BY_NAME) FunctionNameBuffer;
        }

        FunctionName->Hint = 0;
        cb = ProcedureName->Length;
        src = ProcedureName->Buffer;
        dst = FunctionName->Name;
        while (cb--) {
            *dst++ = *src++;
        }
        *dst = '\0';

        Thunk.u1.AddressOfData = FunctionName;
    } else {
        if (ShowSnaps) {
            DbgPrint("ORDINAL - %lx\n", ProcedureNumber);
        }

        // Ordinal 0 is rejected; otherwise mark the thunk as an ordinal lookup.
        if (ProcedureNumber) {
            Thunk.u1.Ordinal = ProcedureNumber | IMAGE_ORDINAL_FLAG;
        } else {
            return STATUS_INVALID_PARAMETER;
        }
    }

    if ( LdrpInLdrInit == FALSE ) {
        RtlEnterCriticalSection((PRTL_CRITICAL_SECTION)NtCurrentPeb()->LoaderLock);
    }

    //
    // Everything below runs with the loader lock held (when taken above).
    // The early returns inside the try rely on the finally block to free
    // the heap name buffer and release the lock.
    //
    try {
        if (!LdrpCheckForLoadedDllHandle(DllHandle, &LdrDataTableEntry)) {
            st = STATUS_DLL_NOT_FOUND;
            return st;
        }

        ExportDirectory = (PIMAGE_EXPORT_DIRECTORY)RtlImageDirectoryEntryToData(
                              LdrDataTableEntry->DllBase,
                              TRUE,
                              IMAGE_DIRECTORY_ENTRY_EXPORT,
                              &ExportSize
                              );
        if (!ExportDirectory) {
            return STATUS_PROCEDURE_NOT_FOUND;
        }

        // Thunk is both input (name or ordinal) and output (resolved address).
        st = LdrpSnapThunk(LdrDataTableEntry->DllBase,
                           0,
                           &Thunk,
                           &Thunk,
                           ExportDirectory,
                           ExportSize,
                           FALSE,
                           NULL
                           );

        if ( RunInitRoutines ) {
            //
            // Look at last entry in init order list. If entry processed
            // flag is not set, then a forwarded dll was loaded during the
            // getprocaddr call and we need to run init routines
            //
            Next = NtCurrentPeb()->Ldr->InInitializationOrderModuleList.Blink;
            LdrDataTableEntry = CONTAINING_RECORD(Next, LDR_DATA_TABLE_ENTRY, InInitializationOrderLinks);
            if ( !(LdrDataTableEntry->Flags & LDRP_ENTRY_PROCESSED) ) {
                try {
                    st = LdrpRunInitializeRoutines(NULL);
                }
                except( EXCEPTION_EXECUTE_HANDLER ) {
                    st = GetExceptionCode();
                }
            }
        }

        if ( NT_SUCCESS(st) ) {
            *ProcedureAddress = Thunk.u1.Function;
        }
    } finally {
        // Free the name only when it came from the heap, not the stack buffer.
        if ( FunctionName && (FunctionName != (PIMAGE_IMPORT_BY_NAME) FunctionNameBuffer) ) {
            RtlFreeHeap(RtlProcessHeap(),0,FunctionName);
        }
        if ( LdrpInLdrInit == FALSE ) {
            RtlLeaveCriticalSection((PRTL_CRITICAL_SECTION)NtCurrentPeb()->LoaderLock);
        }
    }

    return st;
}
LdrpGetProcedureAddress 其实是去查目标DLL的导出表(Export Directory,而不是IAT),找到函数地址,并返回。
我想,关于SwapBuffers这个被频繁调用的API的性能问题,用性能分析工具(比如Intel VTune或者AMD CodeAnalyst)应该就可以看得出来。下面使用VS2005自带的Performance Tools做了一次性能分析,测试代码如下:
for( int i = 0;i<100;i++ )
{
::glClear( GL_COLOR_BUFFER_BIT );
::SwapBuffers( wnd->m_hdc );
}
for( int i = 0;i<100;i++ )
{
::glClear( GL_COLOR_BUFFER_BIT );
wglSwapBuffers( wnd->m_hdc );
}
测试结果分三次:
第一次:
Time( msecs ) %
SwapBuffers 1379.503985 21.323
wglSwapBuffers 1333.716537 20.615
第二次:
Time( msecs ) %
SwapBuffers 1374.064678 29.757
wglSwapBuffers 1333.665637 28.882
第三次:
Time( msecs ) %
SwapBuffers 1382.822897 21.076
wglSwapBuffers 1334.037943 20.332
测试环境:
CPU:Intel Core2 Duo E4500 2.20 GHz
显卡:NVIDIA GEFORCE 8600 GT
OS:Windows XP SP2
上面的数据就是最好的说明:100帧合计多出约40~50毫秒,平均每帧多开销约0.4~0.5毫秒。想一下,FPS达到30帧的时候,每帧可用的时间片只有33毫秒,却被它白白浪费了约0.5毫秒。
要避免这个问题,最好还是自己去获得wglSwapBuffers这个函数,才是最高效的,因为wglSwapBuffers是厂商驱动提供的。在软模式opengl下,它也会回调os 的 GdiSwapBuffers
另外其他API的问题也是这样的 ChoosePixelFormat,DescribePixelFormat,GetPixelFormat,SetPixelFormat,它们都是先得到wglXXX版本的函数地址,比如wglChoosePixelFormat,wglDescribePixelFormat,wglGetPixelFormat,wglSetPixelFormat然后再调用。
我曾经遇到过这样的问题:在真正的opengl调用之前调用DescribePixelFormat,会非常慢。原因是它每次调用都要真正地LoadLibrary、FreeLibrary,从输出窗口可以看到,执行的时候会不断输出加载、卸载opengl32.dll的信息。但是当opengl api被调用过之后,LoadLibrary、FreeLibrary就只是改变引用计数了。一种解决的方法是,在调用ChoosePixelFormat,DescribePixelFormat,GetPixelFormat,SetPixelFormat这些函数之前,先调用 LoadLibrary("OpenGL32.dll");这样就不会频繁地加载卸载dll;更好的方法还是自己获取wgl版本的相应函数。
如果你要列举所有像素格式,如果不这么做,会发现慢的要死,每个调用都要LoadLibrary/FreeLibrary,大概需要1分钟时间才能列举完所有像素格式,不是一般的恐怖。。。
这里就SwapBuffers API的问题简单说一下,希望大家看到他的缺陷,并且避开它。
这是ms opengl 1.1 中的source:
/*
 * Loads the DLL named by szDll and looks up the export szAPI in it.
 *
 * The module handle (NULL when the load failed) is always stored in *phDll,
 * so the caller decides when to release it. Returns the result of
 * GetProcAddress, or NULL when the DLL could not be loaded.
 */
__inline FARPROC GetAPI(char *szDll, char *szAPI, HMODULE *phDll)
{
    HMODULE hModule = LoadLibraryA(szDll);

    *phDll = hModule;
    return hModule ? GetProcAddress(hModule, szAPI) : NULL;
}
/***********************************************************************/
/*
 * Forwards the swap request to the "wglSwapBuffers" export of the OpenGL DLL.
 *
 * Every call goes through GetAPI (load + lookup) and afterwards releases the
 * module handle again. Returns FALSE when the entry point cannot be
 * resolved, otherwise whatever the resolved function returns.
 */
BOOL WINAPI SwapBuffers(HDC hdc)
{
    HMODULE hOpenGL;
    BOOL fSwapped = FALSE;
    PFN5 pfnSwap = (PFN5)GetAPI(szOpenGL, "wglSwapBuffers", &hOpenGL);

    if (pfnSwap != NULL)
        fSwapped = pfnSwap(hdc);

    /* GetAPI always writes the handle, even when the lookup itself failed. */
    if (hOpenGL != NULL)
        FreeLibrary(hOpenGL);

    return fSwapped;
}
可以看到SwapBuffers API的问题,很低效!为了证实opengl 的后续版本也存在这个问题,我在windows xp sp2 下用ida 5.2 打开gdi32.dll 找到了SwapBuffers 的反汇编代码如下:
.text:77F2599E ; *************** S U B R O U T I N E ***************************************
.text:77F2599E
.text:77F2599E ; Attributes: bp-based frame
.text:77F2599E
.text:77F2599E ; BOOL __stdcall SwapBuffers(HDC)
.text:77F2599E ; On every call: asks GetAPI for "wglSwapBuffers" in "OPENGL32", calls the
.text:77F2599E ; returned pointer with the caller's HDC, then passes the module handle to
.text:77F2599E ; FreeLibrary. Returns 0 when GetAPI returned 0.
.text:77F2599E public __stdcall SwapBuffers(x)
.text:77F2599E __stdcall SwapBuffers(x) proc near
.text:77F2599E
.text:77F2599E hLibModule = dword ptr -4 ; local slot whose address is handed to GetAPI
.text:77F2599E arg_0 = dword ptr 8 ; the single stack argument (the HDC)
.text:77F2599E
.text:77F2599E mov edi, edi ; two-byte no-op
.text:77F259A0 push ebp
.text:77F259A1 mov ebp, esp ; establish the bp-based frame
.text:77F259A3 push ecx ; reserves the 4-byte slot at [ebp-4] (hLibModule)
.text:77F259A4 push esi ; esi is used below to hold the return value
.text:77F259A5 lea eax, [ebp+hLibModule]
.text:77F259A8 push eax ; int -- address of hLibModule
.text:77F259A9 push offset s_Wglswapbuffer ; "wglSwapBuffers"
.text:77F259AE push offset s_Opengl32 ; "OPENGL32"
.text:77F259B3 call GetAPI(x,x,x) ; result in eax, tested for zero below
.text:77F259B3
.text:77F259B8 xor esi, esi ; default return value = 0
.text:77F259BA test eax, eax
.text:77F259BC jz short loc_77F259C5 ; nothing resolved: skip the indirect call
.text:77F259BC
.text:77F259BE push [ebp+arg_0] ; pass the HDC through
.text:77F259C1 call eax ; call the pointer GetAPI returned
.text:77F259C3 mov esi, eax ; keep its result as the return value
.text:77F259C3
.text:77F259C5
.text:77F259C5 loc_77F259C5: ; CODE XREF: SwapBuffers(x)+1Ej
.text:77F259C5 cmp [ebp+hLibModule], 0
.text:77F259C9 jz short loc_77F259D4 ; no module handle: nothing to free
.text:77F259C9
.text:77F259CB push [ebp+hLibModule] ; hLibModule
.text:77F259CE call ds:FreeLibrary(x) ; release the handle again on every call
.text:77F259CE
.text:77F259D4
.text:77F259D4 loc_77F259D4: ; CODE XREF: SwapBuffers(x)+2Bj
.text:77F259D4 mov eax, esi ; return value
.text:77F259D6 pop esi
.text:77F259D7 leave
.text:77F259D8 retn 4 ; pops the 4-byte argument on return
.text:77F259D8
.text:77F259D8 __stdcall SwapBuffers(x) endp
可以看到SwapBuffers确实每次调用都在GetAPI,而GetAPI先LoadLibrary,再GetProcAddress;虽然SwapBuffers每次LoadLibrary/FreeLibrary也许只是改变引用计数,但是我想GetProcAddress总是有开销的,即便有Cache机制也会有开销的。追根溯源,我又找到了GetProcAddress的source code,发现GetProcAddress是调用LdrGetProcedureAddress,继续追查,最终落到LdrpGetProcedureAddress:
NTSTATUS
LdrpGetProcedureAddress (
    IN PVOID DllHandle,
    IN PANSI_STRING ProcedureName OPTIONAL,
    IN ULONG ProcedureNumber OPTIONAL,
    OUT PVOID *ProcedureAddress,
    IN BOOLEAN RunInitRoutines
    )
/*++
Routine Description:
    This function locates the address of the specified procedure in the
    specified DLL and returns its address. The lookup goes through the
    DLL's export directory, by name when ProcedureName is supplied and by
    ordinal otherwise, and is performed while holding the loader lock
    (unless LdrpInLdrInit is set).
Arguments:
    DllHandle - Supplies a handle to the DLL that the address is being
        looked up in.
    ProcedureName - Supplies that address of a string that contains the
        name of the procedure to lookup in the DLL. If this argument is
        not specified, then the ProcedureNumber is used.
    ProcedureNumber - Supplies the procedure number to lookup. If
        ProcedureName is specified, then this argument is ignored.
        Otherwise, it specifies the procedure ordinal number to locate
        in the DLL.
    ProcedureAddress - Returns the address of the procedure found in
        the DLL. Written only when the final status is a success code.
    RunInitRoutines - When TRUE, and the last entry of the
        initialization-order module list has not been processed yet,
        LdrpRunInitializeRoutines is called before returning.
Return Value:
    STATUS_INVALID_PARAMETER - neither a name nor a non-zero ordinal was
        supplied, or the temporary name buffer could not be allocated.
    STATUS_DLL_NOT_FOUND - DllHandle does not identify a loaded DLL.
    STATUS_PROCEDURE_NOT_FOUND - the DLL has no export directory.
    Otherwise the status of LdrpSnapThunk, replaced by the status of
    LdrpRunInitializeRoutines (or the exception code it raised) when the
    init routines were run.
--*/
{
    NTSTATUS st;
    UCHAR FunctionNameBuffer[64];
    PUCHAR src, dst;
    ULONG cb, ExportSize;
    PLDR_DATA_TABLE_ENTRY LdrDataTableEntry;
    IMAGE_THUNK_DATA Thunk;
    PIMAGE_IMPORT_BY_NAME FunctionName;
    PIMAGE_EXPORT_DIRECTORY ExportDirectory;
    PLIST_ENTRY Next;

    if (ShowSnaps) {
        DbgPrint("LDR: LdrGetProcedureAddress by ");
    }

    FunctionName = NULL;
    if ( ARGUMENT_PRESENT(ProcedureName) ) {
        if (ShowSnaps) {
            DbgPrint("NAME - %s\n", ProcedureName->Buffer);
        }

        //
        // BUGBUG need STRING to PSZ
        //
        // The name is copied into an IMAGE_IMPORT_BY_NAME record: a USHORT
        // hint, the name bytes, and a terminating NUL, i.e.
        // Length + 1 + sizeof(USHORT) bytes. The stack buffer is used only
        // when the whole record fits; a name of sizeof(buffer)-2 bytes
        // would otherwise put the terminator one byte past its end.
        //
        if (ProcedureName->Length+1+sizeof(USHORT) > sizeof( FunctionNameBuffer ) ) {
            FunctionName = RtlAllocateHeap(RtlProcessHeap(), MAKE_TAG( TEMP_TAG ),ProcedureName->Length+1+sizeof(USHORT));
            if ( !FunctionName ) {
                // NOTE(review): allocation failure is reported as an invalid
                // parameter; kept because callers may rely on this code.
                return STATUS_INVALID_PARAMETER;
            }
        } else {
            FunctionName = (PIMAGE_IMPORT_BY_NAME) FunctionNameBuffer;
        }

        FunctionName->Hint = 0;
        cb = ProcedureName->Length;
        src = ProcedureName->Buffer;
        dst = FunctionName->Name;
        while (cb--) {
            *dst++ = *src++;
        }
        *dst = '\0';

        Thunk.u1.AddressOfData = FunctionName;
    } else {
        if (ShowSnaps) {
            DbgPrint("ORDINAL - %lx\n", ProcedureNumber);
        }

        // Ordinal 0 is rejected; otherwise mark the thunk as an ordinal lookup.
        if (ProcedureNumber) {
            Thunk.u1.Ordinal = ProcedureNumber | IMAGE_ORDINAL_FLAG;
        } else {
            return STATUS_INVALID_PARAMETER;
        }
    }

    if ( LdrpInLdrInit == FALSE ) {
        RtlEnterCriticalSection((PRTL_CRITICAL_SECTION)NtCurrentPeb()->LoaderLock);
    }

    //
    // Everything below runs with the loader lock held (when taken above).
    // The early returns inside the try rely on the finally block to free
    // the heap name buffer and release the lock.
    //
    try {
        if (!LdrpCheckForLoadedDllHandle(DllHandle, &LdrDataTableEntry)) {
            st = STATUS_DLL_NOT_FOUND;
            return st;
        }

        ExportDirectory = (PIMAGE_EXPORT_DIRECTORY)RtlImageDirectoryEntryToData(
                              LdrDataTableEntry->DllBase,
                              TRUE,
                              IMAGE_DIRECTORY_ENTRY_EXPORT,
                              &ExportSize
                              );
        if (!ExportDirectory) {
            return STATUS_PROCEDURE_NOT_FOUND;
        }

        // Thunk is both input (name or ordinal) and output (resolved address).
        st = LdrpSnapThunk(LdrDataTableEntry->DllBase,
                           0,
                           &Thunk,
                           &Thunk,
                           ExportDirectory,
                           ExportSize,
                           FALSE,
                           NULL
                           );

        if ( RunInitRoutines ) {
            //
            // Look at last entry in init order list. If entry processed
            // flag is not set, then a forwarded dll was loaded during the
            // getprocaddr call and we need to run init routines
            //
            Next = NtCurrentPeb()->Ldr->InInitializationOrderModuleList.Blink;
            LdrDataTableEntry = CONTAINING_RECORD(Next, LDR_DATA_TABLE_ENTRY, InInitializationOrderLinks);
            if ( !(LdrDataTableEntry->Flags & LDRP_ENTRY_PROCESSED) ) {
                try {
                    st = LdrpRunInitializeRoutines(NULL);
                }
                except( EXCEPTION_EXECUTE_HANDLER ) {
                    st = GetExceptionCode();
                }
            }
        }

        if ( NT_SUCCESS(st) ) {
            *ProcedureAddress = Thunk.u1.Function;
        }
    } finally {
        // Free the name only when it came from the heap, not the stack buffer.
        if ( FunctionName && (FunctionName != (PIMAGE_IMPORT_BY_NAME) FunctionNameBuffer) ) {
            RtlFreeHeap(RtlProcessHeap(),0,FunctionName);
        }
        if ( LdrpInLdrInit == FALSE ) {
            RtlLeaveCriticalSection((PRTL_CRITICAL_SECTION)NtCurrentPeb()->LoaderLock);
        }
    }

    return st;
}
LdrpGetProcedureAddress 其实是去查目标DLL的导出表(Export Directory,而不是IAT),找到函数地址,并返回。
我想,关于SwapBuffers这个被频繁调用的API的性能问题,用性能分析工具(比如Intel VTune或者AMD CodeAnalyst)应该就可以看得出来。下面使用VS2005自带的Performance Tools做了一次性能分析,测试代码如下:
for( int i = 0;i<100;i++ )
{
::glClear( GL_COLOR_BUFFER_BIT );
::SwapBuffers( wnd->m_hdc );
}
for( int i = 0;i<100;i++ )
{
::glClear( GL_COLOR_BUFFER_BIT );
wglSwapBuffers( wnd->m_hdc );
}
测试结果分三次:
第一次:
Time( msecs ) %
SwapBuffers 1379.503985 21.323
wglSwapBuffers 1333.716537 20.615
第二次:
Time( msecs ) %
SwapBuffers 1374.064678 29.757
wglSwapBuffers 1333.665637 28.882
第三次:
Time( msecs ) %
SwapBuffers 1382.822897 21.076
wglSwapBuffers 1334.037943 20.332
测试环境:
CPU:Intel Core2 Duo E4500 2.20 GHz
显卡:NVIDIA GEFORCE 8600 GT
OS:Windows XP SP2
上面的数据就是最好的说明:100帧合计多出约40~50毫秒,平均每帧多开销约0.4~0.5毫秒。想一下,FPS达到30帧的时候,每帧可用的时间片只有33毫秒,却被它白白浪费了约0.5毫秒。
要避免这个问题,最好还是自己去获得wglSwapBuffers这个函数,才是最高效的,因为wglSwapBuffers是厂商驱动提供的。在软模式opengl下,它也会回调os 的 GdiSwapBuffers
另外其他API的问题也是这样的 ChoosePixelFormat,DescribePixelFormat,GetPixelFormat,SetPixelFormat,它们都是先得到wglXXX版本的函数地址,比如wglChoosePixelFormat,wglDescribePixelFormat,wglGetPixelFormat,wglSetPixelFormat然后再调用。
我曾经遇到过这样的问题:在真正的opengl调用之前调用DescribePixelFormat,会非常慢。原因是它每次调用都要真正地LoadLibrary、FreeLibrary,从输出窗口可以看到,执行的时候会不断输出加载、卸载opengl32.dll的信息。但是当opengl api被调用过之后,LoadLibrary、FreeLibrary就只是改变引用计数了。一种解决的方法是,在调用ChoosePixelFormat,DescribePixelFormat,GetPixelFormat,SetPixelFormat这些函数之前,先调用 LoadLibrary("OpenGL32.dll");这样就不会频繁地加载卸载dll;更好的方法还是自己获取wgl版本的相应函数。
如果你要列举所有像素格式,如果不这么做,会发现慢的要死,每个调用都要LoadLibrary/FreeLibrary,大概需要1分钟时间才能列举完所有像素格式,不是一般的恐怖。。。