1 /* 2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "webrtc/modules/desktop_capture/differ_block_sse2.h" 12 13 #if defined(_MSC_VER) 14 #include <intrin.h> 15 #else 16 #include <mmintrin.h> 17 #include <emmintrin.h> 18 #endif 19 20 #include "webrtc/modules/desktop_capture/differ_block.h" 21 22 namespace webrtc { 23 24 extern bool BlockDifference_SSE2_W16(const uint8_t* image1, 25 const uint8_t* image2, 26 int stride) { 27 __m128i acc = _mm_setzero_si128(); 28 __m128i v0; 29 __m128i v1; 30 __m128i sad; 31 for (int y = 0; y < kBlockSize; ++y) { 32 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1); 33 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2); 34 v0 = _mm_loadu_si128(i1); 35 v1 = _mm_loadu_si128(i2); 36 sad = _mm_sad_epu8(v0, v1); 37 acc = _mm_adds_epu16(acc, sad); 38 v0 = _mm_loadu_si128(i1 + 1); 39 v1 = _mm_loadu_si128(i2 + 1); 40 sad = _mm_sad_epu8(v0, v1); 41 acc = _mm_adds_epu16(acc, sad); 42 v0 = _mm_loadu_si128(i1 + 2); 43 v1 = _mm_loadu_si128(i2 + 2); 44 sad = _mm_sad_epu8(v0, v1); 45 acc = _mm_adds_epu16(acc, sad); 46 v0 = _mm_loadu_si128(i1 + 3); 47 v1 = _mm_loadu_si128(i2 + 3); 48 sad = _mm_sad_epu8(v0, v1); 49 acc = _mm_adds_epu16(acc, sad); 50 51 // This essential means sad = acc >> 64. We only care about the lower 16 52 // bits. 53 sad = _mm_shuffle_epi32(acc, 0xEE); 54 sad = _mm_adds_epu16(sad, acc); 55 int diff = _mm_cvtsi128_si32(sad); 56 if (diff) 57 return true; 58 image1 += stride; 59 image2 += stride; 60 } 61 return false; 62 } 63 64 extern bool BlockDifference_SSE2_W32(const uint8_t* image1, 65 const uint8_t* image2, 66 int stride) { 67 __m128i acc = _mm_setzero_si128(); 68 __m128i v0; 69 __m128i v1; 70 __m128i sad; 71 for (int y = 0; y < kBlockSize; ++y) { 72 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1); 73 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2); 74 v0 = _mm_loadu_si128(i1); 75 v1 = _mm_loadu_si128(i2); 76 sad = _mm_sad_epu8(v0, v1); 77 acc = _mm_adds_epu16(acc, sad); 78 v0 = _mm_loadu_si128(i1 + 1); 79 v1 = _mm_loadu_si128(i2 + 1); 80 sad = _mm_sad_epu8(v0, v1); 81 acc = _mm_adds_epu16(acc, sad); 82 v0 = _mm_loadu_si128(i1 + 2); 83 v1 = _mm_loadu_si128(i2 + 2); 84 sad = _mm_sad_epu8(v0, v1); 85 acc = _mm_adds_epu16(acc, sad); 86 v0 = _mm_loadu_si128(i1 + 3); 87 v1 = _mm_loadu_si128(i2 + 3); 88 sad = _mm_sad_epu8(v0, v1); 89 acc = _mm_adds_epu16(acc, sad); 90 v0 = _mm_loadu_si128(i1 + 4); 91 v1 = _mm_loadu_si128(i2 + 4); 92 sad = _mm_sad_epu8(v0, v1); 93 acc = _mm_adds_epu16(acc, sad); 94 v0 = _mm_loadu_si128(i1 + 5); 95 v1 = _mm_loadu_si128(i2 + 5); 96 sad = _mm_sad_epu8(v0, v1); 97 acc = _mm_adds_epu16(acc, sad); 98 v0 = _mm_loadu_si128(i1 + 6); 99 v1 = _mm_loadu_si128(i2 + 6); 100 sad = _mm_sad_epu8(v0, v1); 101 acc = _mm_adds_epu16(acc, sad); 102 v0 = _mm_loadu_si128(i1 + 7); 103 v1 = _mm_loadu_si128(i2 + 7); 104 sad = _mm_sad_epu8(v0, v1); 105 acc = _mm_adds_epu16(acc, sad); 106 107 // This essential means sad = acc >> 64. We only care about the lower 16 108 // bits. 109 sad = _mm_shuffle_epi32(acc, 0xEE); 110 sad = _mm_adds_epu16(sad, acc); 111 int diff = _mm_cvtsi128_si32(sad); 112 if (diff) 113 return true; 114 image1 += stride; 115 image2 += stride; 116 } 117 return false; 118 } 119 120 } // namespace webrtc 121