float32 和 float16 互转

复制代码
//int  main()
//{
//    int i = 0;
//    float fVal = -255.123456789;
//    char *pChar;
//    pChar = (char *)&fVal;
//
//    for (i = 0; i<4; i++)
//    {
//        printf("chs[%d] = %x\n", i, pChar[i]);
//    }
//
//    pChar[0] = 0;
//    /*pChar[1] = 0;
//    pChar[2] = 0;
//    pChar[3] = 0;*/
//    float * pFloat = (float *)pChar;
//    printf("fVal = %0.9f\n", pFloat[0]);
//    getchar();
//}


#include <stdio.h>

// based on https://gist.github.com/martin-kallman/5049614
// float32
// Martin Kallman
//
// Fast half-precision to single-precision floating point conversion
//  - Supports signed zero and denormals-as-zero (DAZ)
//  - Does not support infinities or NaN (they are not mapped to their
//    single-precision counterparts)
//  - Few, partially pipelinable, non-branching instructions
//  - Core operations ~6 clock cycles on modern x86-64
//
// Parameters:
//   out - receives the converted single-precision value
//   in  - half-precision (binary16) bit pattern carried in a short
//
// Assumes unsigned int is 32 bits wide and float is IEEE 754 binary32.
void float32(float *__restrict out, const short in) {
    // Type-pun through a union: storing via a casted unsigned int pointer
    // into a float object violates the strict-aliasing rule.
    union {
        float f;
        unsigned int u;
    } pun;
    const unsigned int bits = (unsigned int)in;
    unsigned int t1;
    unsigned int t2;
    unsigned int t3;

    t1 = bits & 0x7fffu;                    // Non-sign bits
    t2 = bits & 0x8000u;                    // Sign bit
    t3 = bits & 0x7c00u;                    // Exponent

    t1 <<= 13u;                             // Align mantissa on MSB
    t2 <<= 16u;                             // Shift sign bit into position

    t1 += 0x38000000u;                      // Adjust bias (127 - 15) << 23

    t1 = (t3 == 0 ? 0 : t1);                // Denormals-as-zero

    t1 |= t2;                               // Re-insert sign bit

    pun.u = t1;
    *out = pun.f;
}

// float16
// Martin Kallman
//
// Fast single-precision to half-precision floating point conversion
//  - Supports signed zero, denormals-as-zero (DAZ), flush-to-zero (FTZ),
//    clamp-to-max
//  - Does not support infinities or NaN (both are clamped to the largest
//    finite half value, keeping the sign bit)
//  - Mantissa is truncated, not rounded
//  - Few, partially pipelinable, non-branching instructions
//  - Core operations ~10 clock cycles on modern x86-64
//
// Parameters:
//   out - receives the half-precision (binary16) bit pattern
//   in  - single-precision value to convert
//
// Assumes unsigned int is 32 bits wide and float is IEEE 754 binary32.
void float16(short *__restrict out, const float in) {
    // Type-pun through a union: reading a float object through a casted
    // unsigned int pointer violates the strict-aliasing rule.
    union {
        float f;
        unsigned int u;
    } pun;
    unsigned int inu;
    unsigned int t1;
    unsigned int t2;
    unsigned int t3;

    pun.f = in;
    inu = pun.u;

    t1 = inu & 0x7fffffffu;                 // Non-sign bits
    t2 = inu & 0x80000000u;                 // Sign bit
    t3 = inu & 0x7f800000u;                 // Exponent

    t1 >>= 13u;                             // Align mantissa on MSB
    t2 >>= 16u;                             // Shift sign bit into position

    t1 -= 0x1c000u;                         // Adjust bias (127 - 15) << 10

    // 0x38800000 is the float exponent field for 2^-14, the smallest normal
    // half; anything below it cannot be represented without denormals.
    t1 = (t3 < 0x38800000u) ? 0 : t1;       // Flush-to-zero
    // 0x47000000 is the float exponent field for 2^15 (0x8e << 23), the
    // largest half exponent. The threshold must be compared in shifted form:
    // t3 never exceeds 0x7f800000, so an unshifted 0x8e000000 never matches.
    t1 = (t3 > 0x47000000u) ? 0x7bff : t1;  // Clamp-to-max
    t1 = (t3 == 0 ? 0 : t1);                // Denormals-as-zero

    t1 |= t2;                               // Re-insert sign bit

    // Only the low 16 bits are meaningful; bit 15 carries the sign.
    *out = (short)t1;
}

// Absolute value of A. The argument is evaluated twice, so do not pass an
// expression with side effects (e.g. ABS(i++)).
#define ABS(A) ((A) >= 0 ? (A) : -(A))

// Demo: round-trips one value through half precision and prints the original,
// the reconstructed value and the absolute error, then waits for a key press.
int main() {
    const float original = -42.42f;
    short small = 0;
    float quantized = 0.0f;
    float diff;

    // float -> half -> float
    float16(&small, original);
    float32(&quantized, small);

    diff = ABS(original - quantized);
    printf("orig %f quantized %f absdiff %f\n", original, quantized, diff);

    // Keep the console window open until a character is read.
    getchar();
    // Disabled sanity check: assert(diff < 0.1f);
}
复制代码

 

posted on   strangeman  阅读(306)  评论(0)  编辑  收藏  举报

相关博文:
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构

导航

< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5
点击右上角即可分享
微信分享提示