ARM NEON 64bit 查找表替换

实测加速效果不明显:如果查找表的长度只有 64 个 uint8_t 左右(一条 TBL 指令即可覆盖整张表),NEON 应该可以提高查表速度;否则(例如下面这种 256 字节的表)还是纯 C 实现来得快。

 

#ifdef HAVE_NEON_AARCH64

/**
 * Translate a byte buffer through a 256-entry lookup table using AArch64 NEON:
 * output_ptr[i] = lookup_table[input_ptr[i]] for every i in [0, length).
 *
 * @param lookup_table  Table to index; 256 bytes are read from it
 *                      (four 64-byte loads at offsets 0, 64, 128 and 192).
 * @param length        Number of bytes to translate. Any value is accepted;
 *                      bytes beyond the last full 64-byte chunk are handled
 *                      by a scalar loop.
 * @param input_ptr     Source bytes (at least `length` bytes). Not modified
 *                      unless it is the same buffer as output_ptr.
 * @param output_ptr    Destination (at least `length` bytes). May be equal to
 *                      input_ptr for in-place translation.
 */
void table_lookup_AArch64_neon(uint8_t* lookup_table, uint32_t length, uint8_t* input_ptr, uint8_t* output_ptr)
{
    /* Load the lookup table as four 64-byte sub-tables; a single TBL/TBX
     * instruction can index at most 64 bytes (four q-registers). */
    const uint8x16x4_t table0 = vld1q_u8_x4(lookup_table);
    const uint8x16x4_t table1 = vld1q_u8_x4(lookup_table + 64);
    const uint8x16x4_t table2 = vld1q_u8_x4(lookup_table + 128);
    const uint8x16x4_t table3 = vld1q_u8_x4(lookup_table + 192);

    /* Subtracting 64 from the indices moves the next sub-table's range
     * down to 0..63, the only range TBL/TBX treats as valid. */
    const uint8x16_t diff = vdupq_n_u8(64);

    uint32_t i = 0;

    /* Vector path: 64 bytes per iteration. `length - i` cannot underflow
     * because i never exceeds length here. */
    for (; length - i >= 64; i += 64) {
        uint8x16x4_t elements = vld1q_u8_x4(input_ptr + i);

        for (int j = 0; j < 4; j++) {
            uint8x16_t src = elements.val[j];

            /* Indices 0..63: TBL writes 0 for out-of-range lanes, giving a
             * defined starting value for the TBX steps below. */
            uint8x16_t dst = vqtbl4q_u8(table0, src);

            /* Indices 64..127: TBX keeps dst in lanes that are out of range. */
            src = vsubq_u8(src, diff);
            dst = vqtbx4q_u8(dst, table1, src);

            /* Indices 128..191. */
            src = vsubq_u8(src, diff);
            dst = vqtbx4q_u8(dst, table2, src);

            /* Indices 192..255. */
            src = vsubq_u8(src, diff);
            elements.val[j] = vqtbx4q_u8(dst, table3, src);
        }

        vst1q_u8_x4(output_ptr + i, elements);
    }

    /* Scalar tail: fewer than 64 bytes remain, so a full vector load/store
     * would run past the end of the buffers. */
    for (; i < length; i++) {
        output_ptr[i] = lookup_table[input_ptr[i]];
    }
}

#endif /* HAVE_NEON_AARCH64 */

 

posted @ 2019-05-30 16:41  AndyHu518  阅读(1301)  评论(0编辑  收藏  举报