1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #if defined(_MSC_VER) 6 #include <intrin.h> 7 #else 8 #include <mmintrin.h> 9 #include <emmintrin.h> 10 #endif 11 12 #include "media/base/simd/filter_yuv.h" 13 14 namespace media { 15 16 void FilterYUVRows_SSE2(uint8* dest, 17 const uint8* src0, 18 const uint8* src1, 19 int width, 20 int fraction) { 21 int pixel = 0; 22 23 // Process the unaligned bytes first. 24 int unaligned_width = 25 (16 - (reinterpret_cast<uintptr_t>(dest) & 15)) & 15; 26 while (pixel < width && pixel < unaligned_width) { 27 dest[pixel] = (src0[pixel] * (256 - fraction) + 28 src1[pixel] * fraction) >> 8; 29 ++pixel; 30 } 31 32 __m128i zero = _mm_setzero_si128(); 33 __m128i src1_fraction = _mm_set1_epi16(fraction); 34 __m128i src0_fraction = _mm_set1_epi16(256 - fraction); 35 const __m128i* src0_128 = 36 reinterpret_cast<const __m128i*>(src0 + pixel); 37 const __m128i* src1_128 = 38 reinterpret_cast<const __m128i*>(src1 + pixel); 39 __m128i* dest128 = reinterpret_cast<__m128i*>(dest + pixel); 40 __m128i* end128 = reinterpret_cast<__m128i*>( 41 reinterpret_cast<uintptr_t>(dest + width) & ~15); 42 43 while (dest128 < end128) { 44 __m128i src0 = _mm_loadu_si128(src0_128); 45 __m128i src1 = _mm_loadu_si128(src1_128); 46 __m128i src2 = _mm_unpackhi_epi8(src0, zero); 47 __m128i src3 = _mm_unpackhi_epi8(src1, zero); 48 src0 = _mm_unpacklo_epi8(src0, zero); 49 src1 = _mm_unpacklo_epi8(src1, zero); 50 src0 = _mm_mullo_epi16(src0, src0_fraction); 51 src1 = _mm_mullo_epi16(src1, src1_fraction); 52 src2 = _mm_mullo_epi16(src2, src0_fraction); 53 src3 = _mm_mullo_epi16(src3, src1_fraction); 54 src0 = _mm_add_epi16(src0, src1); 55 src2 = _mm_add_epi16(src2, src3); 56 src0 = _mm_srli_epi16(src0, 8); 57 src2 = _mm_srli_epi16(src2, 8); 58 src0 = _mm_packus_epi16(src0, src2); 59 *dest128++ = src0; 60 ++src0_128; 61 ++src1_128; 62 pixel += 16; 63 } 64 65 while (pixel < width) { 66 dest[pixel] = (src0[pixel] * (256 - fraction) + 67 src1[pixel] * fraction) >> 8; 68 ++pixel; 69 } 70 } 71 72 } // namespace media 73