看到空明流转分享了他的SALVIA 0.5.2优化谈,我也来说说Hybird3D中和光栅化相关的一些优化技术。
Hybird3D的设计目标是打造一款准实时的软件高质量渲染器,采用了光栅化和光线跟踪混合算法,光栅化用于渲染eye ray,光线跟踪则用于阴影、反射、折射、全局光等次级光线的计算。由于渲染器是以准实时(一帧渲染时间在几十毫秒到几秒之间)为设计目标的,因此性能优化是非常重要的,但同时为了能够实现高质量的软件渲染,在渲染器的架构和支持的特性上也不会因为性能而缩水和妥协。
光栅化算法
Hybird3D的光栅化算法主要参考的是Michael Abrash写的《Rasterization on Larrabee》这篇paper,这是目前我所知道的同时能够支持多线程和SIMD的最佳算法。该算法保证了每个tile的光栅化可以由各个线程独立计算,没有任何数据冲突,从而实现完全的无锁化计算。Larrabee有512bit的SIMD宽度,也就是一次可以计算16个float,而在一般的支持SSE的CPU上我们只能使用128bit的SIMD指令,一次计算4个float,虽然宽度小了,但不影响算法的应用。
Hybird3D也是采用的16x16的tile,光栅化的主要任务是计算某个像素上对应的三角形图元和三角形重心坐标(另外为了后续的mipmap纹理插值我们还需要保存重心坐标的差分值),有了图元和重心坐标就可以送到后端的LightShader和PixelShader中做进一步的计算了。一个像素上不一定只存在一个图元,半透明物体以及anti-aliasing等都会使得一个像素上存在多个图元,可以将这些图元保存为一个单向链表,同时为每个图元设置一个alpha值作为混合权重就可以了。
内存优化技巧
内存的优化主要包括内存的分配和cache的合理利用。在光栅化过程中会产生很多的临时对象,对象的数量不可预估,但是每种对象的生命周期可以很容易地知道,所以我们可以采取多次分配、一次释放的策略来实现非常高效的对象分配。根据不同的生命周期需要多个内存分配器,同时为了防止多线程冲突,每个线程需要独立的内存分配器。对cache的利用则需要合理地设计我们的渲染流水线,并以SoA(Structure of Arrays,即把struct的数组拆成按字段存放的多个数组)的方式合理地组织数据结构,让数据访问尽可能的集中。SoA不但可以让数据访问变得集中,而且对SIMD指令非常友好,不过SoA的编程难度很高,会让代码变得非常难写和难读。
SIMD编程技巧
SIMD的编程一直是件脏活和累活,15年前我就开始使用MMX指令来加速应用,那个时候只能内嵌汇编,而且指令与普通的运算指令差别很大,写完之后过段时间自己也看不懂了,代码的维护是一个非常让人头疼的问题。后来出了intrinsics指令,可以在C代码中用函数的形式来编写SIMD指令,免去了手工写汇编的痛苦,但是intrinsics指令同普通的C运算符差别还是很大,代码的可读性依然不佳,好在SSE指令集还是比较规整的,大部分运算指令可以用C++运算符重载来包装intrinsics指令,下面给出我的包装函数供大家参考。
1 inline __m128 operator + (__m128 v1, __m128 v2) 2 { 3 return _mm_add_ps(v1, v2); 4 } 5 6 inline __m128 operator - (__m128 v1, __m128 v2) 7 { 8 return _mm_sub_ps(v1, v2); 9 } 10 11 inline __m128 operator * (__m128 v1, __m128 v2) 12 { 13 return _mm_mul_ps(v1, v2); 14 } 15 16 inline __m128 operator / (__m128 v1, __m128 v2) 17 { 18 return _mm_div_ps(v1, v2); 19 } 20 21 inline __m128 operator == (__m128 v1, __m128 v2) 22 { 23 return _mm_cmpeq_ps(v1, v2); 24 } 25 26 inline __m128 operator != (__m128 v1, __m128 v2) 27 { 28 return _mm_cmpneq_ps(v1, v2); 29 } 30 31 inline __m128 operator > (__m128 v1, __m128 v2) 32 { 33 return _mm_cmpgt_ps(v1, v2); 34 } 35 36 inline __m128 operator >= (__m128 v1, __m128 v2) 37 { 38 return _mm_cmpge_ps(v1, v2); 39 } 40 41 inline __m128 operator < (__m128 v1, __m128 v2) 42 { 43 return _mm_cmplt_ps(v1, v2); 44 } 45 46 inline __m128 operator <= (__m128 v1, __m128 v2) 47 { 48 return _mm_cmple_ps(v1, v2); 49 } 50 51 inline __m128 operator & (__m128 v1, __m128 v2) 52 { 53 return _mm_and_ps(v1, v2); 54 } 55 56 inline __m128 operator | (__m128 v1, __m128 v2) 57 { 58 return _mm_or_ps(v1, v2); 59 } 60 61 inline int MoveMask(__m128 v) 62 { 63 return _mm_movemask_ps(v); 64 } 65 66 inline __m128 Max(__m128 v1, __m128 v2) 67 { 68 return _mm_max_ps(v1, v2); 69 } 70 71 inline __m128 Min(__m128 v1, __m128 v2) 72 { 73 return _mm_min_ps(v1, v2); 74 } 75 76 //mask ? a : b 77 inline __m128 Select(__m128 mask, __m128 a, __m128 b) 78 { 79 return _mm_or_ps(_mm_and_ps(a, mask), _mm_andnot_ps(mask, b)); 80 } 81 82 inline __m128 Extract(__m128 m, int n) 83 { 84 switch(n) 85 { 86 case 0: 87 return _mm_shuffle_ps(m, m, 0); 88 case 1: 89 return _mm_shuffle_ps(m, m, 0x55); 90 case 2: 91 return _mm_shuffle_ps(m, m, 0xaa); 92 case 3: 93 return _mm_shuffle_ps(m, m, 0xff); 94 default: 95 return m; 96 } 97 }
最后是干货时间,放出Hybird3D中光栅化相关的代码供大家参考。
1 #include "stdafx.h" 2 #include "RayTracer.h" 3 #include "Clipper.h" 4 #include "PrimitiveTile.h" 5 6 #pragma warning(disable: 4018) 7 8 #define TILE_WIDTH 16 9 #define TILE_HEIGHT 16 10 11 const float NORMAL_THRESHOLD = 0.9f; 12 extern int ReflectionDepth; 13 14 _CRT_ALIGN(16) struct Illuminance 15 { 16 Float3 direction; 17 float illuminance; 18 Float3 color; 19 float shadowFactor; 20 Illuminance* next; 21 LightShader* light; 22 }; 23 24 struct VertexOutput 25 { 26 Float4 pos; 27 Float4 normal; 28 float attributes[0]; 29 }; 30 31 struct PolyPrimitive; 32 33 _CRT_ALIGN(16) struct PixelContext 34 { 35 PixelContext* next; 36 PolyPrimitive* prim; 37 int triIndex; 38 float alpha; 39 Float4 pos; 40 Float4 view; 41 Float4 normal; 42 Float2 uv; 43 Float2 duvdx; //d(uv) / dx 44 Float2 duvdy; //d(uv) / dy 45 Illuminance* light; 46 void* userData; 47 }; 48 49 _CRT_ALIGN(16) const float FrustumClipPlane[6][4] = { 50 { 0, 0, 1, 0}, 51 { 0, 0,-1, 1}, 52 { 1, 0, 0, 1}, 53 {-1, 0, 0, 1}, 54 { 0, 1, 0, 1}, 55 { 0,-1, 0, 1}, 56 }; 57 58 __m128 ScreenOffset; 59 __m128 ScreenScale; 60 __m128 ScreenScaleInv; 61 62 struct RenderContext : public IRenderContext 63 { 64 Float4x4 ViewProjMatrix; 65 Float4x4 ViewInvMatrix; 66 Float4 _eye; 67 float ScreenWidth, ScreenHeight; 68 PrimitiveTile* _primTiles; 69 Bitmap* _renderTarget; 70 int _tileCol, _tileRow; 71 BYTE* _vertexTempBuf; 72 size_t _vertexTempSize; 73 ICamera* _camera; 74 Accel _accelStruct; 75 int _aaLevel; 76 int _primCount; 77 DWORD _bkColor; 78 Float4 _bkColorF; 79 vector<LightShader*> _lights; 80 81 RenderContext() 82 { 83 _vertexTempSize = 0; 84 _vertexTempBuf = 0; 85 _primTiles = 0; 86 _aaLevel = 0; 87 _tileCol = 0; 88 _tileRow = 0; 89 ScreenOffset = _mm_setr_ps(1, -1, 0, 0); 90 } 91 92 void AddLight(LightShader* light) 93 { 94 _lights.push_back(light); 95 } 96 97 void ClearLights() 98 { 99 _lights.clear(); 100 } 101 102 void SetRenderTarget(Bitmap* target) 103 { 104 ScreenWidth = target->width; 105 ScreenHeight = 
target->height; 106 int tileCount = _tileCol * _tileRow; 107 _renderTarget = target; 108 _tileCol = Align(target->width, TILE_WIDTH) / TILE_WIDTH; 109 _tileRow = Align(target->height, TILE_HEIGHT) / TILE_HEIGHT; 110 if(tileCount < _tileCol * _tileRow) 111 { 112 if(_primTiles) 113 delete[] _primTiles; 114 115 _primTiles = new PrimitiveTile[_tileCol * _tileRow]; 116 } 117 for(int i = 0; i < _tileCol * _tileRow; ++i) 118 _primTiles[i].Clear(); 119 120 ScreenScale = _mm_setr_ps(ScreenWidth * 0.5f, -ScreenHeight * 0.5f, 1, 1); 121 ScreenScaleInv = m128(1) / ScreenScale; 122 } 123 124 void SetAntiAliasQuality(int level) 125 { 126 _aaLevel = min(max(0, level), 4); 127 } 128 129 void SetCamera(ICamera* camera) 130 { 131 _camera = camera; 132 } 133 134 ICamera* GetCamera() 135 { 136 return _camera; 137 } 138 139 void VertConvert(Float4* dest, VertexOutput* vert, int vertChannels) 140 { 141 __m128 pos = _mm_load_ps(vert->pos); 142 __m128 w = _mm_shuffle_ps(pos, pos, _MM_SHUFFLE(3, 3, 3, 3)); 143 144 __m128 rhw = _mm_div_ss(_mm_set_ss(1), w); 145 rhw = _mm_shuffle_ps(rhw, rhw, 0); 146 147 _mm_store_ps(dest[0], _mm_mul_ps(_mm_add_ps(_mm_mul_ps(pos, rhw), ScreenOffset), ScreenScale)); 148 149 __m128* attr = (__m128*)&vert->normal; 150 for(int k = 0; k < vertChannels; k++) 151 _mm_store_ps(dest[k + 1], _mm_mul_ps(attr[k], rhw)); 152 153 _mm_store_ss(&dest[0].w, rhw); 154 } 155 156 virtual void BeginScene() 157 { 158 _accelStruct.BeginBuild(); 159 } 160 161 virtual void AddPolyons(VertexOutput* verts, int vertSize, 162 int vertCount, DWORD* triangles, int count, Shader* shader) 163 { 164 165 _accelStruct.AddPolygons((BYTE*)verts, triangles, vertSize, vertCount, count, shader); 166 } 167 168 virtual void EndScene() 169 { 170 _accelStruct.Build(); 171 } 172 173 virtual void SetBackground(DWORD color) 174 { 175 _bkColor = color; 176 _bkColorF = Float4((float)(color & 0xff), 177 (float)((color >> 8) & 0xff), 178 (float)((color >> 16) & 0xff), 1); 179 180 _bkColorF /= 255.f; 181 182 
_bkColorF = _bkColorF * _bkColorF; 183 } 184 185 void RasterTile(PrimitiveTile* tile, int x, int y, 186 DWORD* target, int pitch, struct FGSampleTable* FGSamples = 0); 187 188 void RasterFGSample(PrimitiveTile* tile, int x, int y, struct FGSampleMap& dest); 189 190 void RasterFragmentSample(PrimitiveTile* tile, int x, int y, struct FragmentSampleMap& dest); 191 192 void FGShader(struct FGSampleRef* samples, int count); 193 194 void DrawPrimitive(TriVertex** vert, TrianglePrim& tri); 195 196 void ClippingAndDraw(TriVertex** verts, TrianglePrim& tri); 197 198 void DrawTriangle(TrianglePrim& tri); 199 200 void Render(); 201 202 static void* operator new (size_t size) 203 { 204 return _aligned_malloc(sizeof(RenderContext), 16); 205 } 206 }; 207 208 void Create4TransPixels(PixelContext** pixels, TriPrimitive* prim, const Float4& eye, 209 float* rhw, float x, float y, Allocator& alloc) 210 { 211 __m128 ma = _mm_loadu_ps(prim->a); 212 __m128 mb = _mm_loadu_ps(prim->b); 213 __m128 a0 = ma * (m128(x - prim->p0.x)) + mb * (m128(y - prim->p0.y)) + _mm_loadu_ps(prim->c); 214 for(int i = 0; i < 4; ++i) 215 { 216 __m128 a = a0; 217 if(rhw[i] > 0) 218 { 219 PixelContext pixel; 220 __m128 adx = a + ma; 221 __m128 ady = a + mb; 222 __m128 r = _mm_div_ss(m128(1), a); 223 a = a * Extract(r, 0); 224 adx = a - adx * Extract(_mm_rcp_ss(adx), 0); 225 ady = a - ady * Extract(_mm_rcp_ss(ady), 0); 226 _mm_store_ss(&pixel.pos.w, a0); 227 pixel.prim = prim->prim; 228 pixel.triIndex = prim->triIndex; 229 _mm_storeu_ps(pixel.uv, _mm_shuffle_ps(a, adx, _MM_SHUFFLE(2, 1, 2, 1))); 230 _mm_storeu_ps(pixel.duvdy, _mm_shuffle_ps(ady, ady, _MM_SHUFFLE(2, 1, 2, 1))); 231 float alpha = prim->prim->shader->TransprentShader(&pixel); 232 if(alpha > 0.01f) 233 { 234 //insert pixel 235 PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16); 236 p->alpha = alpha; 237 p->prim = prim->prim; 238 p->triIndex = prim->triIndex; 239 p->uv = pixel.uv; 240 p->duvdx = pixel.duvdx; 241 p->duvdy = 
pixel.duvdy; 242 prim->prim->GetPosNormal(prim->triIndex, pixel.uv, &p->pos, &p->normal); 243 p->view = NormalizeFast(eye - p->pos); 244 p->light = 0; 245 p->next = 0; 246 p->pos.w = pixel.pos.w; 247 248 float alpha2 = 1; 249 if(pixels[i] == 0) 250 pixels[i] = p; 251 else 252 { 253 PixelContext* prev = 0; 254 PixelContext* pp = pixels[i]; 255 while(pp) 256 { 257 if(p->pos.w > pp->pos.w) 258 break; 259 alpha2 -= pp->alpha; 260 prev = pp; 261 pp = pp->next; 262 } 263 p->alpha = alpha * alpha2; 264 if(prev) 265 { 266 p->next = prev->next; 267 prev->next = p; 268 } 269 else 270 { 271 p->next = pixels[i]; 272 pixels[i] = p; 273 } 274 275 if(alpha > 0.99f) 276 { 277 p->next = 0; 278 } 279 else 280 { 281 alpha = 1 - alpha; 282 pp = p->next; 283 while(pp) 284 { 285 pp->alpha *= alpha; 286 pp = pp->next; 287 } 288 } 289 } 290 } 291 } 292 a0 = a0 + ma; 293 } 294 } 295 296 void CreateMainPixels(PixelContext** pixels, TriPrimitive** primBuf, const Float4& eye, 297 float startX, float startY, int tileSize, float alpha, Allocator& alloc) 298 { 299 __m128 px = m128(startX); 300 __m128 py = m128(startY); 301 for(int i = 0; i < tileSize; ++i) 302 { 303 if(i % 16 == 0 && i > 0) 304 { 305 py = py + m128(1); 306 px = m128(startX); 307 } 308 TriPrimitive* prim = primBuf[i]; 309 if(prim) 310 { 311 PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16); 312 __m128 ma = _mm_loadu_ps(prim->a); 313 __m128 mb = _mm_loadu_ps(prim->b); 314 __m128 a = ma * (px - m128(prim->p0.x)) + 315 mb * (py - m128(prim->p0.y)) + _mm_loadu_ps(prim->c); 316 __m128 rhw = a; 317 318 __m128 r = _mm_div_ss(m128(1), a); 319 __m128 w = _mm_shuffle_ps(r, r, 0); 320 __m128 adx = a + ma; 321 __m128 ady = a + mb; 322 a = a * w; 323 r = _mm_rcp_ss(adx); 324 adx = a - adx * _mm_shuffle_ps(r, r, 0); 325 r = _mm_rcp_ss(ady); 326 ady = a - ady * _mm_shuffle_ps(r, r, 0); 327 _mm_storeu_ps(p->uv, _mm_shuffle_ps(a, adx, _MM_SHUFFLE(2, 1, 2, 1))); 328 _mm_storeu_ps(p->duvdy, _mm_shuffle_ps(ady, ady, 
_MM_SHUFFLE(2, 1, 2, 1))); 329 p->prim = prim->prim; 330 p->triIndex = prim->triIndex; 331 p->prim->GetPosNormal(p->triIndex, p->uv, &p->pos, &p->normal); 332 p->view = NormalizeFast(eye - p->pos); 333 p->light = 0; 334 p->alpha = alpha; 335 p->next = 0; 336 pixels[i] = p; 337 _mm_store_ss(&p->pos.w, rhw); 338 } 339 else 340 pixels[i] = 0; 341 px = px + m128(1); 342 } 343 } 344 345 void InsertPixel(PixelContext** pixel, PixelContext* p) 346 { 347 if(*pixel == 0) 348 *pixel = p; 349 else 350 { 351 PixelContext* prev = 0; 352 PixelContext* pp = *pixel; 353 while(pp) 354 { 355 if(p->pos.w > pp->pos.w) 356 break; 357 prev = pp; 358 pp = pp->next; 359 } 360 if(prev) 361 { 362 p->next = prev->next; 363 prev->next = p; 364 } 365 else 366 { 367 p->next = *pixel; 368 *pixel = p; 369 } 370 } 371 } 372 373 void CreatePixels(PixelContext** pixels, TriPrimitive** primBuf, const Float4& eye, 374 float alpha, float startX, float startY, int tileSize, Allocator& alloc) 375 { 376 __m128 px = m128(startX); 377 __m128 py = m128(startY); 378 for(int i = 0; i < tileSize; ++i) 379 { 380 if(i % 16 == 0 && i > 0) 381 { 382 py = py + m128(1); 383 px = m128(startX); 384 } 385 TriPrimitive* prim = primBuf[i]; 386 if(prim) 387 { 388 PixelContext* pixel = pixels[i]; 389 while(pixel) 390 { 391 if(pixel->prim == prim->prim) 392 { 393 pixel->alpha += alpha; 394 goto _SkipPixel; 395 } 396 pixel = pixel->next; 397 } 398 399 PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16); 400 __m128 ma = _mm_loadu_ps(prim->a); 401 __m128 mb = _mm_loadu_ps(prim->b); 402 403 __m128 a = ma * (px - m128(prim->p0.x)) + 404 mb * (py - m128(prim->p0.y)) + _mm_loadu_ps(prim->c); 405 __m128 rhw = a; 406 407 __m128 r = _mm_div_ss(m128(1), a); 408 __m128 w = Extract(r, 0); 409 __m128 adx = a + ma; 410 __m128 ady = a + mb; 411 a = a * w; 412 r = _mm_rcp_ss(adx); 413 adx = a - adx * Extract(r, 0); 414 r = _mm_rcp_ss(ady); 415 ady = a - ady * Extract(r, 0); 416 _mm_storeu_ps(p->uv, _mm_shuffle_ps(a, adx, 
_MM_SHUFFLE(2, 1, 2, 1))); 417 _mm_storeu_ps(p->duvdy, _mm_shuffle_ps(ady, ady, _MM_SHUFFLE(2, 1, 2, 1))); 418 p->prim = prim->prim; 419 p->triIndex = prim->triIndex; 420 p->prim->GetPosNormal(p->triIndex, p->uv, &p->pos, &p->normal); 421 p->view = NormalizeFast(eye - p->pos); 422 p->light = 0; 423 p->alpha = alpha; 424 p->next = 0; 425 _mm_store_ss(&p->pos.w, rhw); 426 InsertPixel(&pixels[i], p); 427 } 428 _SkipPixel: 429 px = px + m128(1); 430 } 431 } 432 433 void RasterFullCoverPrim(TriPrimitive* prim, float startX, 434 float startY, float* primBuf, float* wBuf) 435 { 436 __m128 startW = m128((startX - prim->p0.x) * prim->a[0] 437 + (startY - prim->p0.y) * prim->b[0] + prim->c[0]); 438 __m128 rhwDx = m128(prim->a[0] * 4); 439 __m128 primData = m128(*(float*)&prim); 440 startW = startW + m128(prim->a[0]) * _mm_set_ps(3, 2, 1, 0); 441 442 for(int i = 0; i < TILE_HEIGHT; ++i) 443 { 444 __m128 rhw = startW; 445 for(int j = 0; j < TILE_WIDTH; j += 4) 446 { 447 __m128 oldW = _mm_load_ps(wBuf + j); 448 __m128 mask = rhw > oldW; 449 _mm_store_ps(wBuf + j, Select(mask, rhw, oldW)); 450 _mm_store_ps(primBuf + j, Select(mask, primData, _mm_load_ps(primBuf + j))); 451 rhw = rhw + rhwDx; 452 } 453 wBuf += TILE_WIDTH; 454 primBuf += TILE_WIDTH; 455 startW = startW + m128(prim->b[0]); 456 } 457 } 458 459 void RasterPrim(TriPrimitive* prim, float x, float y, 460 float xs, float ys, TriPrimitive** primBuf, float* wBuf) 461 { 462 __m128 ex[3]; 463 __m128 ey[3]; 464 __m128 xOff[3]; 465 __m128 yOff[3]; 466 __m128 mask0[3]; 467 __m128 primData = m128(*(float*)&prim); 468 469 for(int i = 0; i < 3; ++i) 470 { 471 ex[i] = m128(prim->ea[i]); 472 ey[i] = m128(prim->eb[i]); 473 xOff[i] = (ex[i] > m128(0)) & m128(4); 474 yOff[i] = (ey[i] > m128(0)) & m128(4); 475 } 476 __m128 p0x = m128(prim->p0.x); 477 __m128 p0y = m128(prim->p0.y); 478 __m128 p1x = p0x - ey[0]; 479 __m128 p1y = p0y + ex[0]; 480 481 mask0[0] = (m128(x) - p0x) * ex[0] + (m128(y) - p0y) * ey[0]; 482 mask0[1] = (m128(x) - 
p1x) * ex[1] + (m128(y) - p1y) * ey[1]; 483 mask0[2] = (m128(x) - p0x) * ex[2] + (m128(y) - p0y) * ey[2]; 484 485 __m128 rhw0 = (_mm_set_ps(3, 2, 1, 0) + m128(x + xs) - p0x) * m128(prim->a[0]) + 486 (m128(y + ys) - p0y) * m128(prim->b[0]) + m128(prim->c[0]); 487 __m128* mprimBuf = (__m128*)primBuf; 488 __m128* mwBuf = (__m128*)wBuf; 489 490 __m128 yStep = m128(0); 491 for(int iy = 0; iy < 4; ++iy) 492 { 493 __m128 mask; 494 __m128 xStep = _mm_set_ps(12, 8, 4, 0); 495 mask = ((mask0[0] + (xStep + xOff[0]) * ex[0] + (yStep + yOff[0]) * ey[0]) >= m128(0)) & 496 ((mask0[1] + (xStep + xOff[1]) * ex[1] + (yStep + yOff[1]) * ey[1]) >= m128(0)) & 497 ((mask0[2] + (xStep + xOff[2]) * ex[2] + (yStep + yOff[2]) * ey[2]) >= m128(0)); 498 499 int* imask = (int*)&mask; 500 if(MoveMask(mask)) 501 { 502 __m128 rhw1 = rhw0; 503 for(int ix = 0; ix < 4; ++ix) 504 { 505 if(imask[ix]) 506 { 507 __m128 mask1[3]; 508 __m128 xpos = _mm_set_ps(3, 2, 1, 0) + m128((float)(ix * 4) + xs); 509 __m128 ypos = yStep + m128(ys); 510 mask1[0] = mask0[0] + xpos * ex[0] + ypos * ey[0]; 511 mask1[1] = mask0[1] + xpos * ex[1] + ypos * ey[1]; 512 mask1[2] = mask0[2] + xpos * ex[2] + ypos * ey[2]; 513 514 __m128* mprimBuf0 = mprimBuf + ix; 515 __m128* mwBuf0 = mwBuf + ix; 516 __m128 rhw = rhw1; 517 for(int j = 0; j < 4; ++j) 518 { 519 __m128 pmask = (rhw > *mwBuf0) & 520 (mask1[0] >= m128(0)) & 521 (mask1[1] >= m128(0)) & 522 (mask1[2] >= m128(0)); 523 524 *mwBuf0 = Select(pmask, rhw, *mwBuf0); 525 *mprimBuf0 = Select(pmask, primData, *mprimBuf0); 526 mask1[0] = mask1[0] + ey[0]; 527 mask1[1] = mask1[1] + ey[1]; 528 mask1[2] = mask1[2] + ey[2]; 529 mprimBuf0 += 4; 530 mwBuf0 += 4; 531 rhw = rhw + m128(prim->b[0]); 532 } 533 } 534 rhw1 = rhw1 + m128(prim->a[0]) * m128(4); 535 } 536 } 537 rhw0 = rhw0 + m128(4) * m128(prim->b[0]); 538 mprimBuf += 16; 539 mwBuf += 16; 540 yStep = yStep + m128(4); 541 } 542 } 543 544 void CreateReflectRay(Ray* rays, int count, PixelContext* pixel, ReflectInfo* refInfo, const 
Float4& eye) 545 { 546 if(count == 1) 547 { 548 Float4 pos = pixel->pos; 549 Float4 normal = pixel->normal; 550 Ray& ray = rays[0]; 551 552 Float4 refVec = -Normalize(Reflect(pixel->view, normal)); 553 pos = pos + refVec * 0.02f; 554 _mm_store_ps(ray.pos, pos.m); 555 _mm_store_ps(ray.dir, refVec.m); 556 ray.triIndex = -1; 557 ray.tmin = 0; 558 ray.tmax = 1e10; 559 ray.userData = refInfo; 560 return; 561 } 562 563 static const Float2 offset[] = { 564 Float2(0, 0), 565 Float2(-0.4f, -0.4f), 566 Float2(0.4f, -0.4f), 567 Float2(0, 0.4f) 568 }; 569 for(int i = 0; i < count; ++i) 570 { 571 Float4 pos = pixel->pos; 572 Float4 normal = pixel->normal; 573 Ray& ray = rays[i]; 574 575 Float4 dpos, dnormal; 576 pixel->prim->GetPosNormalDifferential(pixel->triIndex, pixel->duvdx * offset[i].x, &dpos, &dnormal); 577 pos = pos + dpos; 578 normal = normal + dnormal; 579 580 pixel->prim->GetPosNormalDifferential(pixel->triIndex, pixel->duvdy * offset[i].y, &dpos, &dnormal); 581 pos = pos + dpos; 582 normal = NormalizeFast(normal + dnormal); 583 584 Float4 Vn; 585 //Vn = pixel->view; 586 Vn = NormalizeFast(eye - pos); 587 588 Float4 refVec = -Normalize(Reflect(Vn, normal)); 589 pos = pos + refVec * 0.02f; 590 _mm_store_ps(ray.pos, pos.m); 591 _mm_store_ps(ray.dir, refVec.m); 592 ray.triIndex = -1; 593 ray.tmin = 0; 594 ray.tmax = 1e10; 595 ray.userData = refInfo; 596 } 597 } 598 599 void CreateReflectPixel(PixelContext** pixels, Ray& ray, Allocator* alloc) 600 { 601 ReflectInfo* refinfo = (ReflectInfo*)ray.userData; 602 /*PixelContext* pixel = pixels[refinfo->index]; 603 while(pixel) 604 { 605 if(pixel->prim == ray.prim) 606 { 607 pixel->alpha += refinfo->strength; 608 return; 609 } 610 pixel = pixel->next; 611 }*/ 612 613 PixelContext* p = (PixelContext*)alloc->Alloc(sizeof(PixelContext), 16); 614 p->prim = ray.prim; 615 p->triIndex = ray.triIndex; 616 p->uv.x = ray.u; 617 p->uv.y = ray.v; 618 619 Float4 posddx, normalddx; 620 Float4 posddy, normalddy; 621 
refinfo->context->prim->GetPosNormalDifferential(refinfo->context->triIndex, 622 refinfo->context->duvdx, &posddx, &normalddx); 623 refinfo->context->prim->GetPosNormalDifferential(refinfo->context->triIndex, 624 refinfo->context->duvdy, &posddy, &normalddy); 625 626 Float2 uvdx, uvdy; 627 p->prim->GetRayDifferential(ray.triIndex, *(Float4*)&ray.pos, *(Float4*)&ray.dir, 628 posddx, posddy, normalddx, normalddy, &uvdx, &uvdy); 629 630 p->duvdx = uvdx - p->uv; 631 p->duvdy = uvdy - p->uv; 632 p->alpha = refinfo->strength; 633 ray.prim->GetPosition(ray.triIndex, p->uv, &p->pos); 634 ray.prim->GetNormal(ray.triIndex, p->uv, &p->normal); 635 p->view = NormalizeFast(Float4(ray.pos, 1) - p->pos); 636 p->light = 0; 637 p->next = pixels[refinfo->index]; 638 pixels[refinfo->index] = p; 639 } 640 641 void CopyBuf(float* wBuf, float* wBuf2, float* primBuf, float* primBuf2, int size) 642 { 643 for(int i = 0; i < size; i += 4) 644 { 645 _mm_store_ps(wBuf2 + i, _mm_load_ps(wBuf + i)); 646 _mm_store_ps((float*)primBuf2 + i, _mm_load_ps((float*)primBuf + i)); 647 } 648 } 649 650 float CopyBuf2(float* wBuf, float* wBuf2, float* primBuf, float* primBuf2, int size) 651 { 652 __m128 minRHW = m128(FLT_MAX); 653 for(int i = 0; i < size; i += 4) 654 { 655 __m128 rhw = _mm_load_ps(wBuf + i); 656 minRHW = Min(minRHW, rhw); 657 _mm_store_ps((float*)primBuf2 + i, _mm_load_ps((float*)primBuf + i)); 658 _mm_store_ps(wBuf2 + i, rhw); 659 } 660 661 minRHW = Min(Min(Extract(minRHW, 0), Extract(minRHW, 1)), 662 Min(Extract(minRHW, 2), Extract(minRHW, 3))); 663 float m; 664 _mm_store_ss(&m, minRHW); 665 return m; 666 } 667 668 int AACount[] = {0, 2, 4, 8, 16}; 669 float AACfgAlpha[] = {1, 1 / 3.f, 1 / 5.f, 1 / 9.f, 1 / 17.f}; 670 Float2 AASampler[5][16] = { 671 { Float2(0, 0) }, 672 673 { Float2(0.25f, 0.25f), Float2(0.75f, 0.75f) }, 674 675 { Float2(0.25f, 0.25f), Float2(0.75f, 0.75f), Float2(0.25f, 0.75f), Float2(0.75f, 0.25f) }, 676 677 { Float2(0.2f, 0.5f), Float2(0.8f, 0.5f), Float2(0.5f, 
0.2f), Float2(0.5f, 0.8f), 678 Float2(0.25f, 0.25f), Float2(0.75f, 0.25f), Float2(0.25f, 0.75f), Float2(0.75f, 0.75f)}, 679 680 { Float2(0.2f, 0.2f), Float2(0.4f, 0.2f), Float2(0.6f, 0.2f), Float2(0.8f, 0.2f), 681 Float2(0.2f, 0.4f), Float2(0.4f, 0.4f), Float2(0.6f, 0.4f), Float2(0.8f, 0.4f), 682 Float2(0.2f, 0.6f), Float2(0.4f, 0.6f), Float2(0.6f, 0.6f), Float2(0.8f, 0.6f), 683 Float2(0.2f, 0.8f), Float2(0.4f, 0.8f), Float2(0.6f, 0.8f), Float2(0.8f, 0.8f) } 684 }; 685 686 void RenderContext::ClippingAndDraw(TriVertex** verts, TrianglePrim& tri) 687 { 688 if(!BackCullTest((VertexOutput**)verts)) 689 return; 690 691 Float4 vertTmpBuf[256]; 692 Float4* vertBuf = vertTmpBuf; 693 694 int vertCount = 3; 695 for(int i = 0; i < 6; i++) 696 { 697 vertCount = ClipPoly(*(const Float4*)FrustumClipPlane[i], 698 (VertexOutput**)verts, vertCount, 2, vertBuf); 699 if(vertCount < 3) 700 return; 701 } 702 703 Float4 vertsTmp[256]; 704 for(int i = 0; i < vertCount; i++) 705 VertConvert(vertsTmp + i * 2, (VertexOutput*)verts[i], 1); 706 707 TriVertex* triangle[3]; 708 for(int i = 0; i < vertCount - 2; i++) 709 { 710 triangle[0] = (TriVertex*)vertsTmp; 711 triangle[1] = (TriVertex*)(vertsTmp + (i + 1) * 2); 712 triangle[2] = (TriVertex*)(vertsTmp + (i + 2) * 2); 713 DrawPrimitive(triangle, tri); 714 } 715 } 716 717 void RenderContext::DrawTriangle(TrianglePrim& tri) 718 { 719 TriVertex verts[3]; 720 verts[0].pos = Mul(tri.p0, ViewProjMatrix); 721 verts[1].pos = Mul(tri.p1, ViewProjMatrix); 722 verts[2].pos = Mul(tri.p2, ViewProjMatrix); 723 verts[0].uv = Float4(0, 0, 0, 0); 724 verts[1].uv = Float4(1, 0, 0, 0); 725 verts[2].uv = Float4(0, 1, 0, 0); 726 727 TriVertex* verts2[36]; 728 verts2[0] = verts; 729 verts2[1] = verts + 1; 730 verts2[2] = verts + 2; 731 ClippingAndDraw(verts2, tri); 732 } 733 734 void RenderContext::RasterTile(PrimitiveTile* tile, int x, int y, DWORD* target, 735 int pitch, FGSampleTable* FGSamples) 736 { 737 const int tileSize = TILE_WIDTH * TILE_HEIGHT; 738 
_CRT_ALIGN(16) TriPrimitive* primBuf[tileSize]; 739 _CRT_ALIGN(16) float wBuf[tileSize]; 740 _CRT_ALIGN(16) PixelContext* pixels[tileSize]; 741 //_CRT_ALIGN(16) PixelContext* mainPixels[tileSize]; 742 //_CRT_ALIGN(16) PixelContext* transPixels[tileSize]; 743 _CRT_ALIGN(16) TriPrimitive* primBuf2[tileSize]; 744 _CRT_ALIGN(16) float wBuf2[tileSize]; 745 _CRT_ALIGN(16) Float4 colorBuf[tileSize]; 746 747 Allocator allocA(Align((BYTE*)_alloca(1024 * 1024), 16), 1024 * 1024 - 15); 748 Allocator allocB(Align((BYTE*)_alloca(1024 * 1024), 16), 1024 * 1024 - 15); 749 Allocator* alloc = &allocA; 750 751 tile->MergePrimitives(); 752 753 if(!tile->HasPrimitive()) 754 { 755 for(int i = 0; i < tileSize; ++i) 756 *((DWORD*)((BYTE*)target + pitch * (i / 16)) + (i % 16)) = _bkColor; 757 return; 758 } 759 760 float startX = (float)x + 0.5f; 761 float startY = (float)y + 0.5f; 762 763 for(int i = 0; i < tileSize; i += 4) 764 { 765 _mm_store_ps(wBuf + i, m128(0)); 766 _mm_store_ps((float*)primBuf + i, m128(0)); 767 //_mm_store_ps((float*)pixels + i, m128(0)); 768 //_mm_store_ps((float*)transPixels + i, m128(0)); 769 } 770 float farRhw = 0; 771 bool hasFullPrim = false; 772 while(true) 773 { 774 TriPrimitive* prim = tile->NextFullPrimitive(); 775 if(!prim) 776 break; 777 hasFullPrim = true; 778 RasterFullCoverPrim(prim, startX, startY, (float*)primBuf, wBuf); 779 } 780 if(hasFullPrim) 781 farRhw = CopyBuf2(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize); 782 else 783 { 784 for(int i = 0; i < tileSize; i += 4) 785 { 786 _mm_store_ps(wBuf2 + i, m128(0)); 787 _mm_store_ps((float*)primBuf2 + i, m128(0)); 788 } 789 } 790 791 int aaCount = AACount[_aaLevel]; 792 float alpha = AACfgAlpha[_aaLevel]; 793 Float2* sampler = AASampler[_aaLevel]; 794 795 while(true) 796 { 797 TriPrimitive* prim = tile->NextOpaquePrimitive(); 798 if(!prim) 799 break; 800 if(prim->maxRhw < farRhw) 801 continue; 802 803 RasterPrim(prim, x, y, 0.5f, 0.5f, primBuf2, wBuf2); 804 } 805 tile->Reset(); 806 
CreateMainPixels(pixels, primBuf2, _eye, startX, startY, tileSize, alpha, *alloc); 807 farRhw = CopyBuf2(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize); 808 farRhw *= 0.99f; 809 810 for(int aa = 0; aa < aaCount; ++aa) 811 { 812 float xs = sampler[aa].x; 813 float ys = sampler[aa].y; 814 while(true) 815 { 816 TriPrimitive* prim = tile->NextOpaquePrimitive(); 817 if(!prim) 818 break; 819 if(prim->maxRhw < farRhw) 820 continue; 821 822 RasterPrim(prim, x, y, xs, ys, primBuf2, wBuf2); 823 } 824 tile->Reset(); 825 CreatePixels(pixels, primBuf2, _eye, alpha, startX - 0.5f + xs, startY - 0.5f + ys, tileSize, *alloc); 826 CopyBuf(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize); 827 } 828 829 do 830 { 831 bool fullScreen; 832 TriPrimitive* prim = tile->NextTransPrimitive(fullScreen); 833 if(!prim) 834 break; 835 while(prim) 836 { 837 if(prim->maxRhw < farRhw) 838 { 839 prim = tile->NextTransPrimitive(fullScreen); 840 continue; 841 } 842 PixelContext** tpixels = pixels; 843 __m128 ex[3]; 844 __m128 ey[3]; 845 __m128 mask0[3]; 846 __m128 xOff[3]; 847 for(int i = 0; i < 3; ++i) 848 { 849 ex[i] = m128(prim->ea[i]); 850 ey[i] = m128(prim->eb[i]); 851 xOff[i] = (ex[i] > m128(0)) & m128(4); 852 } 853 854 __m128 p0x = m128(prim->p0.x); 855 __m128 p0y = m128(prim->p0.y); 856 __m128 p1x = p0x - ey[0]; 857 __m128 p1y = p0y + ex[0]; 858 859 mask0[0] = (m128(x) - p0x) * ex[0] + (m128(y + 0.5f) - p0y) * ey[0]; 860 mask0[1] = (m128(x) - p1x) * ex[1] + (m128(y + 0.5f) - p1y) * ey[1]; 861 mask0[2] = (m128(x) - p0x) * ex[2] + (m128(y + 0.5f) - p0y) * ey[2]; 862 863 __m128 rhw0 = (_mm_set_ps(3, 2, 1, 0) + m128(startX) - p0x) * m128(prim->a[0]) + 864 (m128(startY) - p0y) * m128(prim->b[0]) + m128(prim->c[0]); 865 __m128* mwBuf = (__m128*)wBuf2; 866 for(int iy = 0; iy < 16; ++iy) 867 { 868 __m128 xStep = _mm_set_ps(12, 8, 4, 0); 869 __m128 mask = ((mask0[0] + (xStep + xOff[0]) * ex[0]) >= m128(0)) & 870 ((mask0[1] + (xStep + xOff[1]) * ex[1]) >= m128(0)) & 871 ((mask0[2] + 
(xStep + xOff[2]) * ex[2]) >= m128(0)); 872 if(MoveMask(mask)) 873 { 874 __m128 mask1[3]; 875 xStep = _mm_set_ps(3.5f, 2.5f, 1.5f, 0.5f); 876 mask1[0] = mask0[0] + xStep * ex[0]; 877 mask1[1] = mask0[1] + xStep * ex[1]; 878 mask1[2] = mask0[2] + xStep * ex[2]; 879 __m128 rhw = rhw0; 880 881 for(int ix = 0; ix < 4; ++ix) 882 { 883 __m128 pmask = ((rhw > *mwBuf) & 884 (mask1[0] >= m128(0)) & 885 (mask1[1] >= m128(0)) & 886 (mask1[2] >= m128(0))); 887 if(MoveMask(pmask)) 888 { 889 __m128 rhw1 = rhw & pmask; 890 Create4TransPixels(tpixels, prim, _eye, (float*)&rhw1, 891 x + ix * 4 + 0.5f, y + iy + 0.5f, *alloc); 892 } 893 rhw = rhw + m128(4) * m128(prim->a[0]); 894 mask1[0] = mask1[0] + m128(4) * ex[0]; 895 mask1[1] = mask1[1] + m128(4) * ex[1]; 896 mask1[2] = mask1[2] + m128(4) * ex[2]; 897 mwBuf++; 898 tpixels += 4; 899 } 900 } 901 else 902 { 903 mwBuf += 4; 904 tpixels += 16; 905 } 906 rhw0 = rhw0 + m128(prim->b[0]); 907 mask0[0] = mask0[0] + ey[0]; 908 mask0[1] = mask0[1] + ey[1]; 909 mask0[2] = mask0[2] + ey[2]; 910 } 911 prim = tile->NextTransPrimitive(fullScreen); 912 } 913 }while(0); 914 915 for(int i = 0; i < tileSize; ++i) 916 colorBuf[i].m = m128(0); 917 918 Ray reflectRays[64]; 919 ReflectInfo refInfos[64]; 920 int refRayIndex = 0; 921 int refInfoIndex = 0; 922 923 for(int depth = 0; depth <= ReflectionDepth; ++depth) 924 { 925 bool hasReflection = false; 926 for(int j = 0; j < _lights.size(); ++j) 927 { 928 int from, to; 929 if(_lights[j]->Interpolate(&from, &to) && FGSamples) 930 continue; 931 _lights[j]->DirectIlluminate(pixels, tileSize, &_accelStruct, alloc); 932 } 933 if(FGSamples && depth == 0) 934 { 935 for(int i = 0; i < tileSize; ++i) 936 { 937 int sx = x + (i % 16); 938 int sy = y + i / 16; 939 PixelContext* pixel = pixels[i]; 940 while(pixel) 941 { 942 Float4 norm; 943 pixel->prim->GetFaceNormal(pixel->triIndex, &norm); 944 945 Float4 color = FGSamples->Lookup(pixel->prim, *(Float3*)&norm, sx, sy); 946 if(color.x + color.y + color.z > 0) 947 { 
948 Illuminance* illum = (Illuminance*)alloc->Alloc(sizeof(Illuminance)); 949 illum->color.x = color.x; 950 illum->color.y = color.y; 951 illum->color.z = color.z; 952 illum->direction.x = 0; 953 illum->direction.y = 0; 954 illum->direction.z = 1; 955 illum->illuminance = 1; 956 illum->light = 0; 957 illum->next = pixel->light; 958 illum->shadowFactor = 0; 959 pixel->light = illum; 960 } 961 pixel = pixel->next; 962 } 963 } 964 } 965 966 if(alloc == &allocA) 967 alloc = &allocB; 968 else 969 alloc = &allocA; 970 alloc->Clear(); 971 972 for(int i = 0; i < tileSize; ++i) 973 { 974 PixelContext* pixel = pixels[i]; 975 pixels[i] = 0; 976 Float4 color; 977 color.m = m128(0); 978 float alpha = 0; 979 980 while(pixel) 981 { 982 ReflectInfo reflect; 983 reflect.strength = 0; 984 alpha += pixel->alpha; 985 color += pixel->prim->shader->PixelShader(pixel, &reflect) * pixel->alpha; 986 reflect.strength *= pixel->alpha; 987 988 if(reflect.strength > 0.01f) 989 { 990 ReflectInfo& refInfo = refInfos[refInfoIndex++]; 991 refInfo = reflect; 992 refInfo.context = pixel; 993 refInfo.index = i; 994 refInfo.strength *= 0.25f; 995 996 if(depth == 0) 997 { 998 CreateReflectRay(reflectRays + refRayIndex, 4, pixel, &refInfo, _eye); 999 refRayIndex += 4; 1000 } 1001 else 1002 { 1003 CreateReflectRay(reflectRays + refRayIndex, 4, pixel, &refInfo, _eye); 1004 refRayIndex += 4; 1005 } 1006 hasReflection = true; 1007 1008 if(refRayIndex >= 64) 1009 { 1010 _accelStruct.TraceIntersect(reflectRays, refRayIndex); 1011 for(int r = 0; r < refRayIndex; ++r) 1012 { 1013 Ray& ray = reflectRays[r]; 1014 if(ray.prim) 1015 CreateReflectPixel(pixels, ray, alloc); 1016 } 1017 refInfoIndex = 0; 1018 refRayIndex = 0; 1019 } 1020 } 1021 pixel = pixel->next; 1022 } 1023 if(refRayIndex > 0) 1024 { 1025 _accelStruct.TraceIntersect(reflectRays, refRayIndex); 1026 for(int r = 0; r < refRayIndex; ++r) 1027 { 1028 Ray& ray = reflectRays[r]; 1029 if(ray.prim) 1030 CreateReflectPixel(pixels, ray, alloc); 1031 } 1032 
refInfoIndex = 0; 1033 refRayIndex = 0; 1034 } 1035 if(depth == 0) 1036 { 1037 if(alpha < 0.99f) 1038 color = color + _bkColorF * (1 - alpha); 1039 colorBuf[i] = color; 1040 } 1041 else 1042 colorBuf[i] += color; 1043 } 1044 1045 if(!hasReflection) 1046 break; 1047 } 1048 1049 for(int i = 0; i < tileSize; ++i) 1050 { 1051 __m128i icolor = _mm_cvttps_epi32(_mm_rsqrt_ps(colorBuf[i].m) * colorBuf[i].m * m128(255)); 1052 icolor = _mm_packs_epi32(icolor, icolor); 1053 icolor = _mm_packus_epi16(icolor, icolor); 1054 1055 *((DWORD*)((BYTE*)target + pitch * (i / 16)) + (i % 16)) = _mm_cvtsi128_si32(icolor); 1056 } 1057 } 1058 1059 void RenderContext::DrawPrimitive(TriVertex** p, TrianglePrim& tri) 1060 { 1061 if((p[2]->pos.x - p[0]->pos.x) * (p[1]->pos.y - p[0]->pos.y) 1062 - (p[1]->pos.x - p[0]->pos.x) * (p[2]->pos.y - p[0]->pos.y) <= 0) 1063 return; 1064 1065 Float3 edge[3]; 1066 edge[0] = CalcEdge(p[0]->pos, p[1]->pos); 1067 edge[1] = CalcEdge(p[1]->pos, p[2]->pos); 1068 edge[2] = CalcEdge(p[2]->pos, p[0]->pos); 1069 1070 TriPrimitive* prim = (TriPrimitive*)MemoryHeapMT::Alloc(sizeof(TriPrimitive)); 1071 prim->prim = tri.prim; 1072 prim->maxRhw = max(max(p[0]->pos.w, p[1]->pos.w), p[2]->pos.w); 1073 prim->triIndex = tri.triIndex; 1074 prim->p0.x = p[0]->pos.x; 1075 prim->p0.y = p[0]->pos.y; 1076 for(int i = 0; i < 3; ++i) 1077 { 1078 prim->ea[i] = -edge[i].x; 1079 prim->eb[i] = -edge[i].y; 1080 //prim->edge[i].x = -edge[i].x; 1081 //prim->edge[i].y = -edge[i].y; 1082 } 1083 __m128 A = m128(1 / ((p[0]->pos.x - p[1]->pos.x) * (p[0]->pos.y - p[2]->pos.y) 1084 - (p[0]->pos.y - p[1]->pos.y) * (p[0]->pos.x - p[2]->pos.x))); 1085 __m128 attr[3]; 1086 for(int i = 0; i < 3; ++i) 1087 attr[i] = _mm_loadu_ps(&p[i]->pos.w); 1088 1089 _mm_storeu_ps(prim->a, A * (m128(edge[0].x) * attr[2] + m128(edge[1].x) * attr[0] + m128(edge[2].x) * attr[1])); 1090 _mm_storeu_ps(prim->b, A * (m128(edge[0].y) * attr[2] + m128(edge[1].y) * attr[0] + m128(edge[2].y) * attr[1])); 1091 prim->c[0] = 
p[0]->pos.w; 1092 prim->c[1] = p[0]->uv.x; 1093 prim->c[2] = p[0]->uv.y; 1094 1095 __m128 maxP = Min(Max(Max(p[0]->pos.m, p[1]->pos.m), p[2]->pos.m) + m128(1.5f), 1096 _mm_set_ps(0, 0, ScreenHeight, ScreenWidth)); 1097 __m128 minP = Max(Min(Min(p[0]->pos.m, p[1]->pos.m), p[2]->pos.m) - m128(0.5f), m128(0)); 1098 1099 __m128i bound = _mm_cvtps_epi32(_mm_unpacklo_ps(minP, maxP)); 1100 1101 bound = _mm_add_epi32(bound, _mm_set_epi32(TILE_HEIGHT - 1, 0, TILE_WIDTH - 1, 0)); 1102 bound = _mm_and_si128(bound, _mm_set_epi32(~(TILE_HEIGHT - 1), 1103 ~(TILE_HEIGHT - 1), ~(TILE_WIDTH - 1), ~(TILE_WIDTH - 1))); 1104 1105 edge[0] = -edge[0]; 1106 edge[1] = -edge[1]; 1107 edge[2] = -edge[2]; 1108 1109 const int& minX = ((int*)&bound)[0]; 1110 const int& maxX = ((int*)&bound)[1]; 1111 const int& minY = ((int*)&bound)[2]; 1112 const int& maxY = ((int*)&bound)[3]; 1113 1114 __m128 offX, offY; 1115 __m128 ex = _mm_set_ps(0, edge[2].x, edge[1].x, edge[0].x); 1116 __m128 ey = _mm_set_ps(0, edge[2].y, edge[1].y, edge[0].y); 1117 __m128 ez = _mm_set_ps(0, edge[2].z, edge[1].z, edge[0].z); 1118 offX = (ex > m128(0)) & m128(TILE_WIDTH); 1119 offY = (ey > m128(0)) & m128(TILE_HEIGHT); 1120 1121 PrimitiveTile* tile = _primTiles + (minY / TILE_HEIGHT) * _tileCol + (minX / TILE_WIDTH); 1122 1123 bool trans = tri.prim->shader->IsTransprency(); 1124 1125 for(int y = minY; y < maxY; y += TILE_HEIGHT) 1126 { 1127 PrimitiveTile* tile2 = tile; 1128 for(int x = minX; x < maxX; x += TILE_WIDTH) 1129 { 1130 if(MoveMask((m128(x) + offX) * ex + (m128(y) + offY) * ey + ez) == 0) 1131 { 1132 bool fullCovered = MoveMask((m128(x + TILE_WIDTH) - offX) * ex 1133 + (m128(y + TILE_HEIGHT) - offY) * ey + ez) == 0; 1134 if(trans) 1135 { 1136 tile2->AddPrimitive(prim, Tanslusent, fullCovered); 1137 } 1138 else 1139 { 1140 if(fullCovered) 1141 tile2->InsertFullPrimitive(prim, prim->maxRhw); 1142 else 1143 tile2->AddPrimitive(prim, Opaque, false); 1144 } 1145 } 1146 tile2++; 1147 } 1148 tile += _tileCol; 1149 } 
1150 } 1151 1152 void RenderContext::Render() 1153 { 1154 ViewProjMatrix = _camera->GetViewProjMatrix(); 1155 _eye = Float4(_camera->GetEyePos(), 1); 1156 DWORD startTime = ::timeGetTime(); 1157 1158 struct VertexProcess 1159 { 1160 LONG index; 1161 RenderContext* rc; 1162 TrianglePrim* prims; 1163 int triCount; 1164 1165 static void Run(int id, void* _context) 1166 { 1167 VertexProcess* context = (VertexProcess*)_context; 1168 RenderContext* rc = context->rc; 1169 1170 while(true) 1171 { 1172 LONG index = ::InterlockedIncrement(&context->index) - 1; 1173 int c = index * 64; 1174 if(c >= context->triCount) 1175 break; 1176 int e = min(c + 64, context->triCount); 1177 for(int i = c; i < e; ++i) 1178 rc->DrawTriangle(context->prims[i]); 1179 } 1180 } 1181 1182 VertexProcess() 1183 { 1184 index = 0; 1185 } 1186 }; 1187 1188 VertexProcess p; 1189 p.rc = this; 1190 p.prims = _accelStruct.GetPrims(&p.triCount); 1191 Parallel::Run(VertexProcess::Run, &p); 1192 1193 LogInfo("Vertex Process time: %d\n", ::timeGetTime() - startTime); 1194 startTime = ::timeGetTime(); 1195 1196 1197 struct PixelProcess 1198 { 1199 LONG index; 1200 RenderContext* rc; 1201 1202 static void Run(int id, void* _context) 1203 { 1204 PixelProcess* context = (PixelProcess*)_context; 1205 RenderContext* rc = context->rc; 1206 while(true) 1207 { 1208 LONG index = ::InterlockedIncrement(&context->index) - 1; 1209 if(index >= (rc->_tileCol * rc->_tileRow)) 1210 break; 1211 1212 int col = index % (rc->_tileCol); 1213 int row = index / (rc->_tileCol); 1214 1215 int x = col * TILE_WIDTH; 1216 int y = row * TILE_HEIGHT; 1217 index = row * rc->_tileCol + col; 1218 1219 rc->RasterTile(rc->_primTiles + index, x, y, 1220 (*rc->_renderTarget)[y] + x, rc->_renderTarget->pitch); 1221 } 1222 } 1223 1224 PixelProcess() 1225 { 1226 index = 0; 1227 } 1228 }; 1229 1230 PixelProcess pp; 1231 pp.rc = this; 1232 Parallel::Run(PixelProcess::Run, &pp); 1233 1234 LogInfo("Pixel Process time: %d\n", ::timeGetTime() - 
startTime); 1235 }