float32 和 float16 互转
//int main() //{ // int i = 0; // float fVal = -255.123456789; // char *pChar; // pChar = (char *)&fVal; // // for (i = 0; i<4; i++) // { // printf("chs[%d] = %x\n", i, pChar[i]); // } // // pChar[0] = 0; // /*pChar[1] = 0; // pChar[2] = 0; // pChar[3] = 0;*/ // float * pFloat = (float *)pChar; // printf("fVal = %0.9f\n", pFloat[0]); // getchar(); //} // based on https://gist.github.com/martin-kallman/5049614 // float32 // Martin Kallman // // Fast half-precision to single-precision floating point conversion // - Supports signed zero and denormals-as-zero (DAZ) // - Does not support infinities or NaN // - Few, partially pipelinable, non-branching instructions, // - Core opreations ~6 clock cycles on modern x86-64 void float32(float *__restrict out, const short in) { unsigned int t1; unsigned int t2; unsigned int t3; t1 = in & 0x7fffu; // Non-sign bits t2 = in & 0x8000u; // Sign bit t3 = in & 0x7c00u; // Exponent t1 <<= 13u; // Align mantissa on MSB t2 <<= 16u; // Shift sign bit into position t1 += 0x38000000; // Adjust bias t1 = (t3 == 0 ? 0 : t1); // Denormals-as-zero t1 |= t2; // Re-insert sign bit *((unsigned int *)out) = t1; }; // float16 // Martin Kallman // // Fast single-precision to half-precision floating point conversion // - Supports signed zero, denormals-as-zero (DAZ), flush-to-zero (FTZ), // clamp-to-max // - Does not support infinities or NaN // - Few, partially pipelinable, non-branching instructions, // - Core opreations ~10 clock cycles on modern x86-64 void float16(short *__restrict out, const float in) { unsigned int inu = *((unsigned int *)& in); unsigned int t1; unsigned int t2; unsigned int t3; t1 = inu & 0x7fffffffu; // Non-sign bits t2 = inu & 0x80000000u; // Sign bit t3 = inu & 0x7f800000u; // Exponent t1 >>= 13u; // Align mantissa on MSB t2 >>= 16u; // Shift sign bit into position t1 -= 0x1c000; // Adjust bias t1 = (t3 < 0x38800000u) ? 0 : t1; // Flush-to-zero t1 = (t3 > 0x8e000000u) ? 0x7bff : t1; // Clamp-to-max t1 = (t3 == 0 ? 
0 : t1); // Denormals-as-zero t1 |= t2; // Re-insert sign bit *((short *)out) = t1; }; #define ABS(A) ((A) >= 0 ? (A) : -(A)) int main() { float original = -42.42f; short small = 0; float16(&small, original); float quantized = 0.0f; float32(&quantized, small); float diff = ABS(original - quantized); printf("orig %f quantized %f absdiff %f\n", original, quantized, diff); getchar(); //assert(diff < 0.1f); }
posted on 2022-10-12 19:04 strangeman 阅读(306) 评论(0) 编辑 收藏 举报
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构