1 /****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 /** 21 ******************************************************************************* 22 * @file 23 * icv_variance_sse42.c 24 * 25 * @brief 26 * This file contains the functions to compute variance 27 * 28 * @author 29 * Ittiam 30 * 31 * @par List of Functions: 32 * icv_variance_8x4_ssse3() 33 * 34 * @remarks 35 * None 36 * 37 ******************************************************************************* 38 */ 39 /*****************************************************************************/ 40 /* File Includes */ 41 /*****************************************************************************/ 42 /* System include files */ 43 #include <stdio.h> 44 #include <stdint.h> 45 #include <string.h> 46 #include <stdlib.h> 47 #include <assert.h> 48 #include <immintrin.h> 49 50 /* User include files */ 51 #include "icv_datatypes.h" 52 #include "icv_macros.h" 53 #include "icv_platform_macros.h" 54 #include "icv.h" 55 56 /** 57 ******************************************************************************* 58 * 59 * @brief 60 * Computes variance of a given 8x4 block 61 * 62 * @par Description 63 * Compute variance of a given 8x4 block 64 * 65 * @param[in] pu1_src 66 * Source 67 * 68 * @param[in] src_strd 69 * Source stride 70 * 71 * @param[in] wd 72 * Assumed to be 8 73 * 74 * @param[in] ht 75 * Assumed to be 4 76 * 77 * @returns 78 * Variance 79 * 80 * @remarks 81 * 82 ******************************************************************************* 83 */ 84 WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht) 85 { 86 WORD32 sum; 87 WORD32 sum_sqr; 88 WORD32 blk_sz; 89 WORD32 vrnc; 90 __m128 src_r0, src_r1; 91 __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3; 92 __m128i sum_r0, sum_r1; 93 __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3; 94 __m128i vsum, vsum_sqr; 95 __m128i zero; 96 UNUSED(wd); 97 UNUSED(ht); 98 99 ASSERT(wd == 8); 100 ASSERT(ht == 4); 101 102 sum = 0; 103 sum_sqr = 0; 104 105 blk_sz = 8 * 4; 106 107 zero = _mm_setzero_si128(); 108 109 /* Load source */ 110 src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); 111 pu1_src += src_strd; 112 113 src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); 114 pu1_src += src_strd; 115 116 src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src)); 117 pu1_src += src_strd; 118 119 src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src)); 120 pu1_src += src_strd; 121 122 /* Compute sum of all elements */ 123 /* Use SAD with 0, since there is no pairwise addition */ 124 sum_r0 = _mm_sad_epu8((__m128i)src_r0, zero); 125 sum_r1 = _mm_sad_epu8((__m128i)src_r1, zero); 126 127 /* Accumulate SAD */ 128 vsum = _mm_add_epi64(sum_r0, sum_r1); 129 vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8)); 130 131 sum = _mm_cvtsi128_si32(vsum); 132 133 /* Unpack to 16 bits */ 134 ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero); 135 ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero); 136 ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero); 137 ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero); 138 139 /* Compute sum of squares */ 140 sqr_r0 = _mm_madd_epi16(ssrc_r0, ssrc_r0); 141 sqr_r1 = _mm_madd_epi16(ssrc_r1, ssrc_r1); 142 sqr_r2 = _mm_madd_epi16(ssrc_r2, ssrc_r2); 143 sqr_r3 = _mm_madd_epi16(ssrc_r3, ssrc_r3); 144 145 vsum_sqr = _mm_add_epi32(sqr_r0, sqr_r1); 146 vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2); 147 vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3); 148 149 vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8)); 150 vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4)); 151 sum_sqr = _mm_cvtsi128_si32(vsum_sqr); 152 153 /* Compute variance */ 154 vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz); 155 156 return vrnc; 157 } 158 159