1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevcd_frm_cvt_x86_intr.c 22 * 23 * @brief 24 * Platform specific intrinsic implementation of certain functions 25 * 26 * @author 27 * Ittiam 28 * @par List of Functions: 29 * - ihevcd_itrans_recon_dc 30 * - ihevcd_fmt_conv_420sp_to_420p 31 * 32 * @remarks 33 * None 34 * 35 ******************************************************************************* 36 */ 37 #include "string.h" 38 #include "ihevc_typedefs.h" 39 #include "ihevc_defs.h" 40 #include "ihevc_macros.h" 41 #include "ihevc_platform_macros.h" 42 #include "ihevcd_function_selector.h" 43 #include <string.h> 44 #include <immintrin.h> 45 46 47 void ihevcd_fmt_conv_420sp_to_420p_ssse3(UWORD8 *pu1_y_src, 48 UWORD8 *pu1_uv_src, 49 UWORD8 *pu1_y_dst, 50 UWORD8 *pu1_u_dst, 51 UWORD8 *pu1_v_dst, 52 WORD32 wd, 53 WORD32 ht, 54 WORD32 src_y_strd, 55 WORD32 src_uv_strd, 56 WORD32 dst_y_strd, 57 WORD32 dst_uv_strd, 58 WORD32 is_u_first, 59 WORD32 disable_luma_copy) 60 { 61 UWORD8 *pu1_src, *pu1_dst; 62 UWORD8 *pu1_u_src, *pu1_v_src; 63 WORD32 num_rows, num_cols, src_strd, dst_strd, cols, rows; 64 WORD32 i, j; 65 66 cols = 0; 67 pu1_u_src = (UWORD8 *)pu1_uv_src; 68 pu1_v_src = (UWORD8 *)pu1_uv_src + 1; 69 if(0 == disable_luma_copy) 70 { 71 /* copy luma */ 72 pu1_src = (UWORD8 *)pu1_y_src; 73 pu1_dst = (UWORD8 *)pu1_y_dst; 74 75 num_rows = ht; 76 num_cols = wd; 77 78 src_strd = src_y_strd; 79 dst_strd = dst_y_strd; 80 for(i = 0; i < num_rows; i++) 81 { 82 memcpy(pu1_dst, pu1_src, num_cols); 83 pu1_dst += dst_strd; 84 pu1_src += src_strd; 85 } 86 } 87 88 /* de-interleave U and V and copy to destination */ 89 if(!is_u_first) 90 { 91 UWORD8 *temp = pu1_u_dst; 92 pu1_u_dst = pu1_v_dst; 93 pu1_v_dst = temp; 94 95 pu1_u_src = (UWORD8 *)pu1_uv_src + 1; 96 pu1_v_src = (UWORD8 *)pu1_uv_src; 97 } 98 99 { 100 __m128i src_uv0_8x16b, src_uv1_8x16b, src_u_8x16b, src_v_8x16b; 101 __m128i temp0_8x16b, temp1_8x16b, alt_first_mask; 102 103 UWORD8 FIRST_ALT_SHUFFLE[16] = { 104 0x00, 0x02, 0x04, 0x06, 105 0x08, 0x0A, 0x0C, 0x0E, 106 0x01, 0x03, 0x05, 0x07, 107 0x09, 0x0B, 0x0D, 0x0F }; 108 109 PREFETCH((char const *)(pu1_uv_src + (0 * src_uv_strd)), _MM_HINT_T0) 110 PREFETCH((char const *)(pu1_uv_src + (1 * src_uv_strd)), _MM_HINT_T0) 111 PREFETCH((char const *)(pu1_uv_src + (2 * src_uv_strd)), _MM_HINT_T0) 112 PREFETCH((char const *)(pu1_uv_src + (3 * src_uv_strd)), _MM_HINT_T0) 113 PREFETCH((char const *)(pu1_uv_src + (4 * src_uv_strd)), _MM_HINT_T0) 114 PREFETCH((char const *)(pu1_uv_src + (5 * src_uv_strd)), _MM_HINT_T0) 115 PREFETCH((char const *)(pu1_uv_src + (6 * src_uv_strd)), _MM_HINT_T0) 116 PREFETCH((char const *)(pu1_uv_src + (7 * src_uv_strd)), _MM_HINT_T0) 117 118 num_rows = ht >> 1; 119 num_cols = wd >> 1; 120 121 src_strd = src_uv_strd; 122 dst_strd = dst_uv_strd; 123 124 alt_first_mask = _mm_loadu_si128((__m128i *)&FIRST_ALT_SHUFFLE[0]); 125 126 if(num_cols > 15) 127 { 128 cols = num_cols >> 4; 129 130 for(i = 0; i < (num_rows >> 2); i++) 131 { 132 UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp; 133 134 PREFETCH((char const *)(pu1_uv_src + (8 * src_strd)), _MM_HINT_T0) 135 PREFETCH((char const *)(pu1_uv_src + (9 * src_strd)), _MM_HINT_T0) 136 PREFETCH((char const *)(pu1_uv_src + (10 * src_strd)), _MM_HINT_T0) 137 PREFETCH((char const *)(pu1_uv_src + (11 * src_strd)), _MM_HINT_T0) 138 139 pu1_uv_src_temp = pu1_uv_src; 140 pu1_u_dst_temp = pu1_u_dst; 141 pu1_v_dst_temp = pu1_v_dst; 142 143 for(j = 0; j < cols; j++) 144 { 145 146 /**** Row 0 ***/ 147 src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp); 148 src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16)); 149 150 temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); 151 temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); 152 153 src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); 154 src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); 155 156 _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b); 157 _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b); 158 159 /**** Row 1 ***/ 160 src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd))); 161 src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd) + 16)); 162 163 temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); 164 temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); 165 166 src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); 167 src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); 168 169 _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (1 * dst_strd)), src_u_8x16b); 170 _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (1 * dst_strd)), src_v_8x16b); 171 172 /**** Row 2 ***/ 173 src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd))); 174 src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd) + 16)); 175 176 temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); 177 temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); 178 179 src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); 180 src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); 181 182 _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (2 * dst_strd)), src_u_8x16b); 183 _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (2 * dst_strd)), src_v_8x16b); 184 185 /**** Row 3 ***/ 186 src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd))); 187 src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd) + 16)); 188 189 temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); 190 temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); 191 192 src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); 193 src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); 194 195 _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (3 * dst_strd)), src_u_8x16b); 196 _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (3 * dst_strd)), src_v_8x16b); 197 198 pu1_u_dst_temp += 16; 199 pu1_v_dst_temp += 16; 200 pu1_uv_src_temp += 32; 201 } 202 203 pu1_u_dst += 4 * dst_strd; 204 pu1_v_dst += 4 * dst_strd; 205 pu1_uv_src += 4 * src_strd; 206 //pu1_v_src += src_strd; 207 } 208 rows = num_rows & 0x3; 209 if(rows) 210 { 211 for(i = 0; i < rows; i++) 212 { 213 UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp; 214 215 pu1_uv_src_temp = pu1_uv_src; 216 pu1_u_dst_temp = pu1_u_dst; 217 pu1_v_dst_temp = pu1_v_dst; 218 219 for(j = 0; j < cols; j++) 220 { 221 222 src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp); 223 src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16)); 224 225 temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); 226 temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); 227 228 src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); 229 src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); 230 231 _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b); 232 _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b); 233 234 pu1_u_dst_temp += 16; 235 pu1_v_dst_temp += 16; 236 pu1_uv_src_temp += 32; 237 } 238 239 pu1_u_dst += dst_strd; 240 pu1_v_dst += dst_strd; 241 pu1_uv_src += src_strd; 242 } 243 } 244 pu1_u_dst -= (num_rows * dst_strd); 245 pu1_v_dst -= (num_rows * dst_strd); 246 num_cols &= 0x0F; 247 } 248 if(num_cols) 249 { 250 pu1_u_dst += (cols << 4); 251 pu1_v_dst += (cols << 4); 252 pu1_u_src += 2 * (cols << 4); 253 pu1_v_src += 2 * (cols << 4); 254 for(i = 0; i < num_rows; i++) 255 { 256 for(j = 0; j < num_cols; j++) 257 { 258 pu1_u_dst[j] = pu1_u_src[j * 2]; 259 pu1_v_dst[j] = pu1_v_src[j * 2]; 260 } 261 262 pu1_u_dst += dst_strd; 263 pu1_v_dst += dst_strd; 264 pu1_u_src += src_strd; 265 pu1_v_src += src_strd; 266 } 267 } 268 } 269 return; 270 } 271