分页读取 GB 级别超大文件试验
© Conmajia 2012
May 15th, 2012
我们在编程过程中
通常定义大小在 2GB 以上的文件为超大文件
内存映射
内存映射的方法可以使用下面的 Windows API 实现
/* Win32 API prototype: maps a view of a file mapping object into the address
 * space of the calling process and returns the base address of the view
 * (NULL on failure).
 * NOTE: current Windows SDK headers declare the last parameter as SIZE_T
 * (pointer-sized), not DWORD; this matters for 64-bit P/Invoke signatures. */
LPVOID MapViewOfFile(HANDLE hFileMappingObject, /* handle returned by CreateFileMapping/OpenFileMapping */
DWORD dwDesiredAccess,                          /* FILE_MAP_READ, FILE_MAP_WRITE, ... */
DWORD dwFileOffsetHigh,                         /* high 32 bits of the file offset where the view starts */
DWORD dwFileOffsetLow,                          /* low 32 bits of the offset; the combined offset must be a multiple of the allocation granularity */
DWORD dwNumberOfBytesToMap);                    /* number of bytes to map; 0 maps from the offset to the end of the mapping */
虽然使用方便
data:image/s3,"s3://crabby-images/6da44/6da44a3c422e49abcf1dae786223d28e774e2de6" alt=""
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;

namespace BlueVision.SaYuan.FileMapping
{
    /// <summary>
    /// Reads a very large file block by block through a read-only Win32 file mapping.
    /// Typical use: construct, call <see cref="GetNextblock"/> until it returns 0,
    /// read from the current block with <see cref="ReadBytes"/> / Read, then
    /// <see cref="Close"/> (or dispose) the instance.
    /// </summary>
    public class ShareMemory : IDisposable
    {
        [DllImport( "user32.dll", CharSet = CharSet.Auto )]
        public static extern IntPtr SendMessage( IntPtr hWnd, int Msg, int wParam, IntPtr lParam );

        [DllImport( "Kernel32.dll", CharSet = CharSet.Auto, SetLastError = true )]
        public static extern IntPtr CreateFileMapping( IntPtr hFile, IntPtr lpAttributes, uint flProtect, uint dwMaxSizeHi, uint dwMaxSizeLow, string lpName );

        [DllImport( "Kernel32.dll", CharSet = CharSet.Auto, SetLastError = true )]
        public static extern IntPtr OpenFileMapping( int dwDesiredAccess, [MarshalAs( UnmanagedType.Bool )] bool bInheritHandle, string lpName );

        // Kept for existing callers. The last parameter is SIZE_T in the Win32 API,
        // so internal code uses the pointer-sized private overload below.
        [DllImport( "Kernel32.dll", CharSet = CharSet.Auto, SetLastError = true )]
        public static extern IntPtr MapViewOfFile( IntPtr hFileMapping, uint dwDesiredAccess, uint dwFileOffsetHigh, uint dwFileOffsetLow, uint dwNumberOfBytesToMap );

        // Pointer-size-correct overload (dwNumberOfBytesToMap is SIZE_T).
        [DllImport( "Kernel32.dll", SetLastError = true )]
        private static extern IntPtr MapViewOfFile( IntPtr hFileMapping, uint dwDesiredAccess, uint dwFileOffsetHigh, uint dwFileOffsetLow, UIntPtr dwNumberOfBytesToMap );

        [DllImport( "Kernel32.dll", CharSet = CharSet.Auto, SetLastError = true )]
        public static extern bool UnmapViewOfFile( IntPtr pvBaseAddress );

        [DllImport( "Kernel32.dll", CharSet = CharSet.Auto, SetLastError = true )]
        public static extern bool CloseHandle( IntPtr handle );

        // Kept for existing callers. Managed code should prefer
        // Marshal.GetLastWin32Error(): the runtime may call other Win32 APIs between
        // the failing call and this one and overwrite the thread's last-error value.
        [DllImport( "kernel32", EntryPoint = "GetLastError" )]
        public static extern int GetLastError();

        [DllImport( "kernel32.dll" )]
        static extern void GetSystemInfo( out SYSTEM_INFO lpSystemInfo );

        /// <summary>
        /// Managed layout of the Win32 SYSTEM_INFO structure filled by GetSystemInfo.
        /// </summary>
        [StructLayout( LayoutKind.Sequential )]
        public struct SYSTEM_INFO
        {
            public ushort processorArchitecture;
            ushort reserved;
            public uint pageSize;
            public IntPtr minimumApplicationAddress;
            public IntPtr maximumApplicationAddress;
            public IntPtr activeProcessorMask;
            public uint numberOfProcessors;
            public uint processorType;
            public uint allocationGranularity;
            public ushort processorLevel;
            public ushort processorRevision;
        }

        /// <summary>
        /// Gets the system allocation granularity reported by GetSystemInfo.
        /// </summary>
        /// <returns>The allocation granularity in bytes.</returns>
        public static uint GetPartitionsize()
        {
            SYSTEM_INFO sysInfo;
            GetSystemInfo( out sysInfo );
            return sysInfo.allocationGranularity;
        }

        const int ERROR_ALREADY_EXISTS = 183;

        const int FILE_MAP_COPY = 0x0001;
        const int FILE_MAP_WRITE = 0x0002;
        const int FILE_MAP_READ = 0x0004;
        const int FILE_MAP_ALL_ACCESS = 0x0002 | 0x0004;

        const int PAGE_READONLY = 0x02;
        const int PAGE_READWRITE = 0x04;
        const int PAGE_WRITECOPY = 0x08;
        const int PAGE_EXECUTE = 0x10;
        const int PAGE_EXECUTE_READ = 0x20;
        const int PAGE_EXECUTE_READWRITE = 0x40;

        const int SEC_COMMIT = 0x8000000;
        const int SEC_IMAGE = 0x1000000;
        const int SEC_NOCACHE = 0x10000000;
        const int SEC_RESERVE = 0x4000000;

        // Handle of the underlying file (owned by m_file).
        IntPtr m_fHandle = IntPtr.Zero;
        // Handle of the file mapping object.
        IntPtr m_hSharedMemoryFile = IntPtr.Zero;
        // Base address of the currently mapped view, or IntPtr.Zero when no view is mapped.
        IntPtr m_pwData = IntPtr.Zero;
        bool m_bInit = false;
        // Size of the current block; the last block of a file may be shorter.
        uint m_MemSize = 0x1400000; // 20 MB
        // File offset at which the NEXT block starts.
        long m_offsetBegin = 0;
        long m_FileSize = 0;
        FileReader m_file = new FileReader();

        /// <summary>
        /// Opens <paramref name="filename"/> and prepares it for block-wise reading.
        /// </summary>
        /// <param name="filename">Path of an existing file.</param>
        /// <param name="memSize">
        /// Block size in bytes. Every block starts at a multiple of this value and
        /// MapViewOfFile requires view offsets to be a multiple of the system
        /// allocation granularity (see <see cref="GetPartitionsize"/>), so the
        /// block size should be a multiple of that granularity.
        /// </param>
        public ShareMemory( string filename, uint memSize )
        {
            // A zero block size could never make progress through the file.
            if ( memSize == 0 ) throw new ArgumentException( "memSize 必须大于 0" );

            this.m_MemSize = memSize;
            Init( filename );
        }

        /// <summary>
        /// Opens <paramref name="filename"/> with the default 20 MB block size.
        /// </summary>
        /// <param name="filename">Path of an existing file.</param>
        public ShareMemory( string filename )
        {
            this.m_MemSize = 0x1400000;
            Init( filename );
        }

        ~ShareMemory()
        {
            // Only raw handles are touched here; m_file holds nothing but an IntPtr.
            ReleaseResources();
        }

        /// <summary>
        /// Opens the file and creates a read-only file mapping that covers the whole file.
        /// No view is mapped yet; call <see cref="GetNextblock"/> to map the first block.
        /// </summary>
        /// <param name="strName">Path of the file to map.</param>
        /// <exception cref="System.IO.FileNotFoundException">The file does not exist.</exception>
        /// <exception cref="System.IO.IOException">The file could not be opened or mapped.</exception>
        protected void Init( string strName )
        {
            if ( !System.IO.File.Exists( strName ) ) throw new System.IO.FileNotFoundException( "未找到文件", strName );

            // Drop anything left over from a previous Init so handles are not leaked.
            ReleaseResources();
            m_offsetBegin = 0;

            m_FileSize = new System.IO.FileInfo( strName ).Length;
            m_fHandle = m_file.Open( strName );

            // Pass the size as two 32-bit halves so files of 4 GB and more are fully
            // covered. The mapping is left unnamed: a fixed name would make a second
            // instance attach to the first instance's mapping.
            UInt64 size = ( UInt64 )m_FileSize;
            m_hSharedMemoryFile = CreateFileMapping( m_fHandle, IntPtr.Zero, ( uint )PAGE_READONLY, GetHighWord( size ), GetLowWord( size ), null );
            if ( m_hSharedMemoryFile == IntPtr.Zero )
            {
                int error = Marshal.GetLastWin32Error();
                ReleaseResources();
                throw new System.IO.IOException( "CreateFileMapping 失败 LastError=" + error.ToString() );
            }

            m_bInit = true;
        }

        /// <summary>
        /// Returns the high 32 bits of a 64-bit value.
        /// </summary>
        private static uint GetHighWord( UInt64 intValue )
        {
            return ( uint )( intValue >> 32 );
        }

        /// <summary>
        /// Returns the low 32 bits of a 64-bit value.
        /// </summary>
        private static uint GetLowWord( UInt64 intValue )
        {
            return ( uint )( intValue & 0xFFFFFFFF );
        }

        /// <summary>
        /// Maps the next block of the file, replacing the previously mapped block.
        /// </summary>
        /// <returns>
        /// The size in bytes of the newly mapped block, or 0 when the end of the file
        /// has been reached (the previously mapped block then stays mapped).
        /// </returns>
        /// <exception cref="InvalidOperationException">The instance is not initialized or was closed.</exception>
        /// <exception cref="System.IO.IOException">MapViewOfFile failed.</exception>
        public uint GetNextblock()
        {
            if ( !this.m_bInit ) throw new InvalidOperationException( " 文件未初始化。" );

            uint blockSize = GetMemberSize();
            if ( blockSize == 0 ) return 0;

            // Release the previous view first; otherwise every call leaks a view
            // and the process eventually runs out of address space.
            UnmapCurrentView();

            // The last block may be shorter than the configured block size.
            m_MemSize = blockSize;

            UInt64 offset = ( UInt64 )m_offsetBegin;
            m_pwData = MapViewOfFile( m_hSharedMemoryFile, FILE_MAP_READ, GetHighWord( offset ), GetLowWord( offset ), new UIntPtr( blockSize ) );
            if ( m_pwData == IntPtr.Zero )
            {
                int error = Marshal.GetLastWin32Error();
                m_bInit = false;
                throw new System.IO.IOException( "映射文件块失败" + error.ToString() );
            }

            m_offsetBegin = m_offsetBegin + blockSize;
            return blockSize;
        }

        /// <summary>
        /// Computes the size of the next block to map.
        /// </summary>
        /// <returns>0 at end of file, the remaining byte count for the last block, otherwise the block size.</returns>
        private uint GetMemberSize()
        {
            if ( m_offsetBegin >= m_FileSize )
            {
                return 0;
            }

            long remaining = m_FileSize - m_offsetBegin;
            return remaining <= m_MemSize ? ( uint )remaining : m_MemSize;
        }

        /// <summary>
        /// Unmaps the current view and closes the mapping and file handles.
        /// Safe to call more than once.
        /// </summary>
        public void Close()
        {
            ReleaseResources();
        }

        /// <summary>
        /// Releases all native resources; equivalent to <see cref="Close"/>.
        /// </summary>
        public void Dispose()
        {
            ReleaseResources();
            GC.SuppressFinalize( this );
        }

        // Unmaps the current view, if any.
        private void UnmapCurrentView()
        {
            if ( m_pwData != IntPtr.Zero )
            {
                UnmapViewOfFile( m_pwData );
                m_pwData = IntPtr.Zero;
            }
        }

        // Releases every native resource exactly once. It keys off the handles rather
        // than m_bInit, so resources are freed even after a failed Init/GetNextblock.
        private void ReleaseResources()
        {
            UnmapCurrentView();

            if ( m_hSharedMemoryFile != IntPtr.Zero )
            {
                CloseHandle( m_hSharedMemoryFile );
                m_hSharedMemoryFile = IntPtr.Zero;
            }

            m_file.Close();
            m_fHandle = IntPtr.Zero;
            m_bInit = false;
        }

        /// <summary>
        /// Copies bytes from the start of the current block into <paramref name="bytData"/>,
        /// optionally unmapping the block afterwards.
        /// </summary>
        /// <param name="bytData">Destination array.</param>
        /// <param name="lngAddr">Index in <paramref name="bytData"/> at which copying starts.</param>
        /// <param name="lngSize">Number of bytes to copy.</param>
        /// <param name="Unmap">True to unmap the current block once the copy is done.</param>
        public void Read( ref byte[] bytData, int lngAddr, int lngSize, bool Unmap )
        {
            Read( ref bytData, lngAddr, lngSize );

            if ( Unmap )
            {
                UnmapCurrentView();
            }
        }

        /// <summary>
        /// Copies bytes from the start of the current block into <paramref name="bytData"/>.
        /// </summary>
        /// <param name="bytData">Destination array.</param>
        /// <param name="lngAddr">
        /// Index in <paramref name="bytData"/> at which copying starts. Note that the
        /// source always starts at offset 0 of the current block; use
        /// <see cref="ReadBytes"/> to read from an offset inside the block.
        /// </param>
        /// <param name="lngSize">Number of bytes to copy.</param>
        /// <exception cref="ArgumentException">lngAddr + lngSize exceeds the current block size.</exception>
        /// <exception cref="InvalidOperationException">The instance is not initialized or no block is mapped.</exception>
        public void Read( ref byte[] bytData, int lngAddr, int lngSize )
        {
            // Widen before adding so a large pair of ints cannot wrap around.
            if ( ( long )lngAddr + lngSize > m_MemSize )
                throw new ArgumentException( "Read 操作超出数据区 " );
            if ( !m_bInit )
                throw new InvalidOperationException( "文件未初始化" );
            if ( m_pwData == IntPtr.Zero )
                throw new InvalidOperationException( "当前没有已映射的文件块" );

            Marshal.Copy( m_pwData, bytData, lngAddr, lngSize );
        }

        /// <summary>
        /// Copies bytes from an offset inside the current block into <paramref name="byteData"/>.
        /// </summary>
        /// <param name="lngAddr">Byte offset inside the current block.</param>
        /// <param name="byteData">Destination array.</param>
        /// <param name="StartIndex">Index in <paramref name="byteData"/> at which copying starts.</param>
        /// <param name="intSize">Number of bytes requested; clamped to the end of the current block.</param>
        /// <returns>The number of bytes actually copied.</returns>
        /// <exception cref="ArgumentException">lngAddr lies outside the current block.</exception>
        /// <exception cref="InvalidOperationException">The instance is not initialized or no block is mapped.</exception>
        public uint ReadBytes( int lngAddr, ref byte[] byteData, int StartIndex, uint intSize )
        {
            if ( lngAddr < 0 || lngAddr >= m_MemSize )
                throw new ArgumentException( "起始数据超过缓冲区长度" );

            if ( ( long )lngAddr + intSize > m_MemSize )
                intSize = m_MemSize - ( uint )lngAddr;

            if ( !m_bInit )
                throw new InvalidOperationException( "文件未初始化" );
            if ( m_pwData == IntPtr.Zero )
                throw new InvalidOperationException( "当前没有已映射的文件块" );

            IntPtr source = new IntPtr( m_pwData.ToInt64() + lngAddr ); // address inside the view
            Marshal.Copy( source, byteData, StartIndex, ( int )intSize );

            return intSize;
        }
    }

    /// <summary>
    /// Thin wrapper around a Win32 file handle opened for reading.
    /// </summary>
    internal class FileReader
    {
        const uint GENERIC_READ = 0x80000000;
        const uint OPEN_EXISTING = 3;

        // CreateFile reports failure with INVALID_HANDLE_VALUE (-1), not NULL.
        static readonly IntPtr InvalidHandleValue = new IntPtr( -1 );

        System.IntPtr handle = InvalidHandleValue;

        // Kept for existing callers. SecurityAttributes and hTemplateFile are
        // pointer-sized in the Win32 API, so internal code uses the private
        // overload below, which is also correct in 64-bit processes.
        [DllImport( "kernel32", SetLastError = true )]
        public static extern System.IntPtr CreateFile(
            string FileName,            // file name
            uint DesiredAccess,         // access mode
            uint ShareMode,             // share mode
            uint SecurityAttributes,    // Security Attributes
            uint CreationDisposition,   // how to create
            uint FlagsAndAttributes,    // file attributes
            int hTemplateFile           // handle to template file
            );

        // Pointer-size-correct, Unicode overload (resolves to CreateFileW), so paths
        // outside the ANSI code page open correctly.
        [DllImport( "kernel32", CharSet = CharSet.Unicode, SetLastError = true )]
        static extern System.IntPtr CreateFile(
            string FileName,
            uint DesiredAccess,
            uint ShareMode,
            System.IntPtr SecurityAttributes,
            uint CreationDisposition,
            uint FlagsAndAttributes,
            System.IntPtr hTemplateFile
            );

        [System.Runtime.InteropServices.DllImport( "kernel32", SetLastError = true )]
        static extern bool CloseHandle
        (
            System.IntPtr hObject // handle to object
        );

        /// <summary>
        /// Opens an existing file for reading (no sharing) and returns its handle.
        /// Any file previously opened by this instance is closed first.
        /// </summary>
        /// <param name="FileName">Path of the file to open.</param>
        /// <returns>The native file handle.</returns>
        /// <exception cref="System.IO.IOException">The file could not be opened.</exception>
        public IntPtr Open( string FileName )
        {
            Close();

            System.IntPtr opened = CreateFile
            (
                FileName,
                GENERIC_READ,
                0,
                System.IntPtr.Zero,
                OPEN_EXISTING,
                0,
                System.IntPtr.Zero
            );

            if ( opened == InvalidHandleValue || opened == System.IntPtr.Zero )
            {
                throw new System.IO.IOException( "打开文件失败" );
            }

            handle = opened;
            return handle;
        }

        /// <summary>
        /// Closes the file handle if one is open. Safe to call more than once.
        /// </summary>
        /// <returns>True if an open handle was closed successfully; otherwise false.</returns>
        public bool Close()
        {
            if ( handle == InvalidHandleValue || handle == System.IntPtr.Zero )
            {
                return false;
            }

            bool closed = CloseHandle( handle );
            handle = InvalidHandleValue;
            return closed;
        }
    }
}
分页读取法
另外一种高效读取文件的方法就是分页法
在开始研究分页法前
参考上图
页号 | 页码 | 内容 | 至头部偏移量 | 长度 |
0 | 1 | 012 | 00 01 02 | 3 |
1 | 2 | 345 | 03 04 05 | 3 |
2 | 3 | 678 | 06 07 08 | 3 |
3 | 4 | 901 | 09 0a 0b | 3 |
4 | 5 | 234 | 0c 0d 0e | 3 |
5 | 6 | 56 | 0f 10 | 2 |
可以看到
当我们要读取
参考图
// Absolute offsets of the first and last byte of the page (both inclusive).
offsetStart = pageNumber * pageSize;

if (offsetStart + pageSize < fileSize)
{
    // Full page: the last byte is one before the start of the next page.
    offsetEnd = offsetStart + pageSize - 1;
}
else
{
    // Last (possibly short) page: ends at the last byte of the file.
    offsetEnd = fileSize - 1;
}
我们常用的
1 // 将该流的当前位置设置为给定值。 2 public override long Seek ( 3 long offset, 4 SeekOrigin origin 5 ) 6 7 // 从流中读取字节块并将该数据写入给定缓冲区中。 8 public override int Read ( 9 [InAttribute] [OutAttribute] byte[] array, 10 int offset, 11 int count 12 )
利用这两个方法
data:image/s3,"s3://crabby-images/6da44/6da44a3c422e49abcf1dae786223d28e774e2de6" alt=""
// Reads the page with the given zero-based page number.
// Returns null when the stream is unusable or pageNumber is out of range;
// otherwise returns the bytes of that page (the last page may be shorter
// than pageSize).
byte[] getPage(Int64 pageNumber)
{
    if (fileStream == null || !fileStream.CanSeek || !fileStream.CanRead)
        return null;

    if (pageNumber < 0 || pageNumber >= pageCount)
        return null;

    // absolute offset of read range (both ends inclusive)
    Int64 offsetStart = (Int64)pageNumber * (Int64)pageSize;
    Int64 offsetEnd = 0;

    if (pageNumber < pageCount - 1)
        // not last page
        offsetEnd = offsetStart + pageSize - 1;
    else
        // last page
        offsetEnd = fileSize - 1;

    Int32 length = (Int32)(offsetEnd - offsetStart + 1);
    byte[] page = new byte[length];

    fileStream.Seek(offsetStart, SeekOrigin.Begin);

    // Stream.Read may return fewer bytes than requested, so keep reading
    // until the page is full or the stream ends.
    Int32 total = 0;
    while (total < length)
    {
        Int32 read = fileStream.Read(page, total, length - total);
        if (read <= 0)
            break;
        total += read;
    }

    // The file ended early (e.g. it shrank after fileSize was taken):
    // return only the bytes that were really read instead of zero padding.
    if (total < length)
        Array.Resize(ref page, total);

    return page;
}
由于每次读取的数据长度
CPU
内存
硬盘
为尽量保证测试质量
下面是为了测试分页法而制作的超大文件读取器界面截图
本次测试选择了
# | 文件名 | 文件内容 | 大小 |
1 | AlishaHead.png | Poser Pro 6 | 11,611 |
2 | ubuntu-11.10-desktop-i386.iso | Ubuntu11.10 | 711,980 |
3 | Windows8-ConsumerPreview-64bit-ChineseSimplified.iso | Windows8 | 3,567,486 |
通过进行多次读取
对用例#1
从图中可以看到
对用例#2
对用例#3
引发
尽管如此
分页法使用简单
通过扩展该方法
分页法
© Conmajia 2012
if(jQuery('#no-reward').text() == 'true') jQuery('.bottom-reward').addClass('hidden');
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· [.NET]调用本地 Deepseek 模型
· 一个费力不讨好的项目,让我损失了近一半的绩效!
· .NET Core 托管堆内存泄露/CPU异常的常见思路
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂
· 在鹅厂做java开发是什么体验