add_clip neon

/*
 * Add a signed 8-bit plane to an unsigned 8-bit plane, saturating to [0, 255].
 *
 * For every row y in [0, height) and column x in [0, width):
 *
 *     dst[y * dst_stride + x] =
 *         clamp(src0[y * src0_stride + x] + src1[y * src1_stride + x], 0, 255)
 *
 * dst                      output plane (uint8_t)
 * src0                     unsigned input plane (uint8_t)
 * src1                     signed input plane (int8_t)
 * dst/src0/src1_stride     distance between the starts of consecutive rows,
 *                          in elements (each element is one byte)
 * width, height            size of the processed region in elements / rows
 *
 * Exactly `width` elements are read and written per row: the vector loop only
 * runs while at least 8 elements remain, and a scalar loop finishes the row.
 * Nothing is touched when width <= 0 or height <= 0.
 */
void add_clip_neon(uint8_t *dst, const int dst_stride, const uint8_t* src0, const int src0_stride,
        const int8_t* src1, const int src1_stride, int width, int height)
{
        for (int y = 0; y < height; y++) {
                /*
                 * Widen before multiplying: `y * stride` evaluated in plain
                 * int overflows (undefined behaviour) once the offset of a
                 * row exceeds INT_MAX on a large plane.
                 */
                uint8_t *out = dst + (int64_t)y * dst_stride;
                const uint8_t *base = src0 + (int64_t)y * src0_stride;
                const int8_t *delta = src1 + (int64_t)y * src1_stride;
                int x = 0;

                /* Vector path: 8 elements per iteration. */
                for (; width - x >= 8; x += 8) {
                        /* Zero-extend the unsigned samples to 16 bits (0..255). */
                        const int16x8_t wide = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(base + x)));
                        /* Widening add of the signed samples: result is in -128..382. */
                        const int16x8_t sum = vaddw_s8(wide, vld1_s8(delta + x));
                        /* Saturating narrow signed 16 -> unsigned 8 does the clamp. */
                        vst1_u8(out + x, vqmovun_s16(sum));
                }

                /* Scalar path for the remaining (width % 8) elements. */
                for (; x < width; x++) {
                        int sum = base[x] + delta[x];
                        if (sum < 0) {
                                sum = 0;
                        } else if (sum > 255) {
                                sum = 255;
                        }
                        out[x] = (uint8_t)sum;
                }
        }
}

以上是使用 NEON intrinsics 实现的版本：将 uint8_t 数据与 int8_t 数据逐元素相加，并把结果饱和（截断）到 [0, 255]；每次处理 8 个元素，行尾不足 8 个的部分用标量代码处理。

posted on 2013-09-22 20:20  mathlover  阅读(394)  评论(0)  编辑  收藏  举报

导航