Home | History | Annotate | Download | only in x86

Lines Matching defs:pu1_ref

97 *    Intra prediction interpolation filter for pu1_ref substitution
142 UWORD8 pu1_ref;
204 pu1_ref = pu1_dst[idx];
206 pu1_dst[i] = pu1_ref;
212 pu1_ref = pu1_dst[idx];
214 pu1_dst[i] = pu1_ref;
272 pu1_ref = pu1_dst[nbr_id_from_bl];
275 pu1_dst[i] = pu1_ref;
302 pu1_ref = pu1_dst[nbr_id_from_bl - 1];
304 pu1_dst[nbr_id_from_bl + i] = pu1_ref;
336 pu1_ref = pu1_dst[nbr_id_from_bl];
338 pu1_dst[i] = pu1_ref;
363 pu1_ref = pu1_dst[nbr_id_from_bl - 1];
365 pu1_dst[nbr_id_from_bl + i] = pu1_ref;
652 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
682 void ihevc_intra_pred_luma_planar_ssse3(UWORD8 *pu1_ref,
706 temp = pu1_ref[nt - 1];
707 temp = (temp << 8) | ((UWORD16)pu1_ref[three_nt + 1]);
736 res_temp1_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
745 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
746 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17));
753 /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
758 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
812 res_temp1_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
813 res_temp2_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
823 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
831 /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
836 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
890 res_temp4_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
891 res_temp5_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
892 res_temp6_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 3 - row]);
893 res_temp7_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 4 - row]);
902 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
909 /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
914 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
955 const_temp_4x32b = _mm_set1_epi16(pu1_ref[three_nt + 1]);
956 const_temp1_4x32b = _mm_set1_epi16(pu1_ref[nt - 1]);
964 const_temp2_4x32b = _mm_set1_epi16(pu1_ref[two_nt - 1 - row]);
975 /*(row + 1) * pu1_ref[nt - 1]*/
978 /*(row + 1) * pu1_ref[nt - 1] + nt)*/
987 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + col));
991 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
994 /*(col + 1) * pu1_ref[three_nt + 1]*/
997 /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
1028 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
1057 void ihevc_intra_pred_luma_dc_ssse3(UWORD8 *pu1_ref,
1108 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
1109 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
1110 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
1111 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
1127 acc_dc += pu1_ref[three_nt];
1128 acc_dc -= pu1_ref[two_nt];
1193 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
1194 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
1204 acc_dc += pu1_ref[three_nt];
1205 acc_dc -= pu1_ref[two_nt];
1217 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
1220 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
1244 pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
1248 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
1255 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
1263 acc_dc += pu1_ref[three_nt];
1264 acc_dc -= pu1_ref[two_nt];
1275 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
1278 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
1281 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
1306 pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
1310 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
1317 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
1318 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
1320 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
1321 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
1335 acc_dc += pu1_ref[three_nt];
1336 acc_dc -= pu1_ref[two_nt];
1346 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
1349 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
1391 pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
1395 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
1410 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
1439 void ihevc_intra_pred_luma_horz_ssse3(UWORD8 *pu1_ref,
1464 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));
1554 src_temp2 = _mm_set1_epi16(pu1_ref[two_nt - 1]);
1555 src_temp10 = _mm_set1_epi16(pu1_ref[two_nt]);
1558 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
1562 /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
1565 /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
1568 /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
1579 src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
1580 src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
1581 src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
1587 /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
1598 src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
1599 src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
1600 src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
1601 src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
1602 src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
1603 src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
1604 src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);
1608 pu1_ref[two_nt - 1 - row];*/
1620 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
1631 /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
1632 src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
1633 src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
1634 src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
1635 src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
1636 src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
1637 src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
1638 src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);
1639 src_temp10 = _mm_set1_epi8(pu1_ref[two_nt - 9]);
1650 src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 10]);
1651 src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 11]);
1652 src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 12]);
1653 src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 13]);
1654 src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 14]);
1655 src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 15]);
1656 src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 16]);
1679 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
1709 void ihevc_intra_pred_luma_ver_ssse3(UWORD8 *pu1_ref,
1729 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
1730 temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
1782 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
1796 src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
1814 src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
1832 s2_predpixel = pu1_ref[two_nt + 1]
1833 + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
1849 * location pointed by 'pu1_ref' to the TU block location pointed by
1878 void ihevc_intra_pred_luma_mode2_ssse3(UWORD8 *pu1_ref,
1905 /*pu1_ref[two_nt - row - (col+1) - 1]*/
1906 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 8));
1921 /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
1931 /*pu1_ref[two_nt - row - (col+1) - 1]*/
1932 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16));
1965 { /*pu1_ref[two_nt - row - (col+1) - 1]*/
1967 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0) - (col + 16) - 1));
1968 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1) - (col + 16) - 1));
1969 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2) - (col + 16) - 1));
1970 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3) - (col + 16) - 1));
1971 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4) - (col + 16) - 1));
1972 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5) - (col + 16) - 1));
1973 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6) - (col + 16) - 1));
1974 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7) - (col + 16) - 1));
2007 * reference neighboring samples location pointed by 'pu1_ref' to the TU
2036 void ihevc_intra_pred_luma_mode_18_34_ssse3(UWORD8 *pu1_ref,
2054 /*pu1_ref[two_nt + col + idx + 1]*/
2055 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
2056 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
2057 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
2058 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
2065 /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
2074 /*pu1_ref[two_nt + col + idx + 1]*/
2075 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
2076 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
2077 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
2078 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
2079 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 6));
2080 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 7));
2081 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 8));
2082 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 9));
2098 /*pu1_ref[two_nt + col + idx + 1]*/
2099 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 0) + 2));
2100 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 1) + 2));
2101 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 2) + 2));
2102 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 3) + 2));
2103 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 4) + 2));
2104 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 5) + 2));
2105 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 6) + 2));
2106 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 7) + 2));
2125 /*pu1_ref[two_nt + col + idx + 1]*/
2126 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 0) + 2));
2127 src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 16) + 2));
2128 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 0) + 2));
2129 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 16) + 2));
2130 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 0) + 2));
2131 src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 16) + 2));
2132 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 0) + 2));
2133 src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 16) + 2));
2144 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 0) + 2));
2145 src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 16) + 2));
2146 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 0) + 2));
2147 src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 16) + 2));
2148 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 0) + 2));
2149 src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 16) + 2));
2150 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 0) + 2));
2151 src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 16) + 2));
2162 pu1_ref += 8;
2173 /*pu1_ref[two_nt + col + idx + 1]*/
2174 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3));
2184 /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
2193 /*pu1_ref[two_nt + col + idx + 1]*/
2194 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7));
2218 /*pu1_ref[two_nt + col + idx + 1]*/
2219 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0)));
2220 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1)));
2221 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2)));
2222 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3)));
2223 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4)));
2224 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5)));
2225 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6)));
2226 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7)));
2245 /*pu1_ref[two_nt + col + idx + 1]*/
2246 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 0));
2247 src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 16));
2248 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 0));
2249 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 16));
2250 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 0));
2251 src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 16));
2252 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 0));
2253 src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 16));
2264 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 0));
2265 src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 16));
2266 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 0));
2267 src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 16));
2268 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 0));
2269 src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 16));
2270 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 0));
2271 src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 16));
2282 pu1_ref -= 8;
2298 * reference neighboring samples location pointed by 'pu1_ref' to the TU
2327 void ihevc_intra_pred_luma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
2423 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/
2424 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/
2425 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/
2426 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/
2433 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
2439 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2445 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2551 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/
2552 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/
2553 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/
2554 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/
2557 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=5*/
2558 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=6*/
2559 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=7*/
2560 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=8*/
2572 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
2578 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
2584 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2590 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2596 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2602 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2726 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/
2727 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/
2728 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/
2729 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/
2732 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=5*/
2733 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=6*/
2734 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=7*/
2735 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=8*/
2747 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
2753 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
2759 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2765 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2771 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2777 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2841 * with reference neighboring samples location pointed by 'pu1_ref' to the
2871 void ihevc_intra_pred_luma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
2941 ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
2955 ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
3007 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
3013 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3019 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
3058 ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
3059 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
3060 temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17));
3065 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
3066 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/
3182 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
3188 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
3194 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3200 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
3206 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3212 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
3275 ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
3276 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
3278 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
3386 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
3392 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
3398 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3404 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
3410 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3416 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
3480 ref_temp[two_nt - 1] = pu1_ref[nt];
3481 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1));
3485 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
3585 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
3591 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
3597 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3603 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
3609 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3615 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
3684 * reference neighboring samples location pointed by 'pu1_ref' to the TU
3713 void ihevc_intra_pred_luma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
3764 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
3765 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
3766 temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16));
3771 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
3772 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/
3867 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3877 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref
3928 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
3938 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
3977 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
3978 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
3980 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
4071 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4081 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
4132 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4142 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
4182 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
4183 temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt));
4187 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/
4278 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4288 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
4329 ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
4336 ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
4411 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4417 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
4455 * reference neighboring samples location pointed by 'pu1_ref' to the TU
4485 void ihevc_intra_pred_luma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
4570 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col));
4571 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col));
4572 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col));
4573 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col));
4574 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col));
4575 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col));
4576 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col));
4577 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col));
4598 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4608 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
4631 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col));
4632 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col));
4633 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col));
4634 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col));
4635 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col));
4636 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col));
4637 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col));
4638 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col));
4659 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4669 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
4760 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));
4761 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));
4762 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));
4763 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));
4764 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8));
4765 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8));
4766 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8));
4767 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8));
4788 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4798 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
4821 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));
4822 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));
4823 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));
4824 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));
4825 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8));
4826 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8));
4827 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8));
4828 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8));
4849 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4859 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
4948 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0])); /* col = 0-7 */
4949 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1])); /* col = 8-15 */
4950 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2])); /* col = 16-23 */
4951 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3])); /* col = 24-31 */
4952 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4])); /* col = 32-39 */
4953 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5])); /* col = 40-47 */
4954 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6])); /* col = 48-55 */
4955 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7])); /* col = 56-63*/
4976 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
4986 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
5081 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */
5082 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* col = 8-15 */
5083 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* col = 16-23 */
5084 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* col = 24-31 */
5096 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
5102 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/