Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19 *******************************************************************************
     20 * @file
     21 *  ihevc_sao_atom_intr.c
     22 *
     23 * @brief
     24 *  Contains function definitions for Sample adaptive offset(SAO) used in-loop
     25 * filtering
     26 *
     27 * @author
     28 * 100592
     29 *
     30 * @par List of Functions:
     31 *   - ihevc_sao_band_offset_luma_ssse3()
     32 *   - ihevc_sao_band_offset_chroma_ssse3()
     33 *   - ihevc_sao_edge_offset_class0_ssse3()
     34 *   - ihevc_sao_edge_offset_class0_chroma_ssse3()
     35 *   - ihevc_sao_edge_offset_class1_ssse3()
     36 *   - ihevc_sao_edge_offset_class1_chroma_ssse3()
     37 *   - ihevc_sao_edge_offset_class2_ssse3()
     38 *   - ihevc_sao_edge_offset_class2_chroma_ssse3()
     39 *   - ihevc_sao_edge_offset_class3_ssse3()
     40 *   - ihevc_sao_edge_offset_class3_chroma_ssse3()
     41 *
     42 * @remarks
     43 *  None
     44 *
     45 *******************************************************************************
     46 */
     47 /*****************************************************************************/
     48 /* File Includes                                                             */
     49 /*****************************************************************************/
     50 #include <stdio.h>
     51 
     52 #include "ihevc_typedefs.h"
     53 #include "ihevc_platform_macros.h"
     54 #include "ihevc_macros.h"
     55 #include "ihevc_func_selector.h"
     56 #include "ihevc_defs.h"
     57 #include "ihevc_tables_x86_intr.h"
     58 #include "ihevc_common_tables.h"
     59 #include "ihevc_sao.h"
     60 
     61 #include <immintrin.h>
     62 
     63 #define NUM_BAND_TABLE  32
     64 /**
     65 *******************************************************************************
     66 *
     67 * @brief
     68 * Has two sets of functions : band offset and edge offset both for luma and chroma
     69 * edge offset has horizontal ,vertical, 135 degree and 45 degree
     70 *
     71 * @par Description:
     72 *
     73 *
     74 * @param[in-out] pu1_src
     75 *  Pointer to the source
     76 *
     77 * @param[in] src_strd
     78 *  Source stride
     79 *
     80 * @param[in-out] pu1_src_left
     81 *  source left boundary
     82 *
     83 * @param[in-out] pu1_src_top
     84 * Source top boundary
     85 *
     86 * @param[in-out] pu1_src_top_left
     87 *  Source top left boundary
     88 *
     89 * @param[in] pu1_src_top_right
     90 *  Source top right boundary
     91 *
     92 * @param[in] pu1_src_bot_left
     93 *  Source bottom left boundary
     94 *
     95 * @param[in] pu1_avail
     96 *  boundary availability flags
     97 *
     98 * @param[in] pi1_sao_offset_u
     99 *  Chroma U sao offset values
    100 *
    101 * @param[in] pi1_sao_offset_v
    102 *  Chroma V sao offset values
    103 *
    104 * @param[in] pi1_sao_offset
    105 *  Luma sao offset values
    106 *
    107 * @param[in] wd
    108 *  width of the source
    109 
    110 * @param[in] ht
    111 *  height of the source
    112 * @returns
    113 *
    114 * @remarks
    115 *  None
    116 *
    117 *******************************************************************************
    118 */
    119 
    120 
/**
 * Band-offset SAO filtering of a luma block, SSSE3 version.
 *
 * Pixels are classified into 32 bands by their five MSBs
 * (band = pixel >> 3).  Only the four consecutive bands starting at
 * sao_band_pos receive offsets (pi1_sao_offset[1..4] respectively); all
 * remaining pixels are written back unchanged.  The left-column, top-row
 * and top-left context buffers are also refreshed for use by blocks
 * filtered after this one.
 *
 * Implementation sketch: the 4 active bands cover exactly 32 consecutive
 * pixel values, so two 16-byte pshufb lookup tables indexed by
 * (pixel - 8 * sao_band_pos) can hold the final, offset-added value of
 * every pixel inside the window.  Indices outside 0..31 are forced to
 * 0xFF; pshufb then returns 0 for them (MSB-set control byte) and the
 * original pixel value is preserved through a cmpeq mask.
 *
 * @param pu1_src          [in/out] pixel block, filtered in place
 * @param src_strd         stride of pu1_src in pixels
 * @param pu1_src_left     [out] receives the rightmost column of the block
 * @param pu1_src_top      [in/out] top-row context, overwritten with the
 *                         block's bottom row
 * @param pu1_src_top_left [out] receives the old pu1_src_top[wd - 1]
 * @param sao_band_pos     first band of the 4-band offset window
 * @param pi1_sao_offset   offset table; entries 1..4 used, entry 0 unused
 * @param wd               block width  -- assumes wd % 8 == 0 (TODO confirm
 *                         against callers)
 * @param ht               block height -- assumes ht % 4 == 0 when a
 *                         remainder column exists, ht % 2 == 0 otherwise
 *                         (TODO confirm against callers)
 */
void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_src_left,
                                      UWORD8 *pu1_src_top,
                                      UWORD8 *pu1_src_top_left,
                                      WORD32 sao_band_pos,
                                      WORD8 *pi1_sao_offset,
                                      WORD32 wd,
                                      WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;
    /* byte position within the bottom-row copy loop.
     * NOTE(review): WORD8 limits this to wd <= 127 -- fine for CTB-sized
     * blocks, but worth confirming no caller passes a larger width. */
    WORD8 offset = 0;

    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    __m128i band_pos_16x8b;
    __m128i sao_offset;
    __m128i cmp_mask, cmp_store;

    /* Update left, top-left and top neighbour context before filtering,
     * since the in-place filtering below would otherwise clobber them. */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col += 8)
    {
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
        offset += 8;
    }

    /* Replicate (sao_band_pos * 8) -- the lowest pixel value of the 4-band
     * window -- across all eight 16-bit lanes. */
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
    /* pshufb control vectors: lane k copies byte k of sao_offset into the
     * low byte of each 16-bit lane; 0x80 in the control makes pshufb write
     * 0 to the high byte, i.e. a zero-extending broadcast. */
    tmp_set_128i_1  = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
    tmp_set_128i_2  = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
    tmp_set_128i_3  = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
    tmp_set_128i_4  = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);

    /* sao offset values (only bytes 1..4 are consumed, via the shuffles) */
    sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    /* Load the 32 16-bit entries of gu2_table_band_idx into 4 registers.
     * Presumably these are the per-index band base values (see
     * ihevc_common_tables) -- TODO confirm the table contents. */
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

    /* Add the window base (8 * sao_band_pos) to every table entry */
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
    /* Broadcast sao_offset[1..4] into 16-bit lanes, one register per band */
    tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
    tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
    tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
    tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
    /* 16 in every 16-bit lane: threshold used below to detect table entries
     * that wrapped around modulo 256 */
    cmp_mask = _mm_set1_epi16(16);
    /* low-byte mask for the 16-bit table entries */
    cmp_store = _mm_set1_epi16(0x00ff);

    /* Add each band's sao offset to its 8 table entries */
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
    /* Mask off the upper 8 bits of each 16-bit entry (keep pixel byte) */
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

    /* Window-edge clipping: entries that over/underflowed 8 bits in the
     * additions above wrapped modulo 256 and now compare below 16.  At the
     * low extreme (band position 0) such entries are clamped toward 0
     * (AND with the <16 mask); at the high extreme (positions 28..31,
     * where the window reaches pixel value 255) they are saturated to 0xFF
     * (OR with the <16 mask).  Other positions cannot wrap. */
    switch(sao_band_pos)
    {
        case 0:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
            break;
        case 28:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 29:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 30:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
            break;
        case 31:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
            break;
        default:
            break;
    }
    /* sao_offset register is reused as an all-zero comparison mask below */
    sao_offset = _mm_setzero_si128();
    tmp_set_128i_1 = _mm_set1_epi8(1);
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_mask = dup8(16)

    /* Re-mask the low bytes (the clipping OR above may have set high bits) */
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

    /* Pack the four 8x16-bit tables into two 16x8-bit pshufb tables:
     * band_table0_8x16b serves indices 0..15, band_table2_8x16b 16..31 */
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);

    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // dup8(32), pre-decrement
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos now 8-bit lanes
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // dup8(31): "index > 31" threshold

    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1); //cmp_mask = dup8(15): low/high table split

    /* Main loop: 16-pixel-wide column strips, two rows per iteration */
    for(col = wd; col >= 16; col -= 16)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 2)
        {


            //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            // row = 1
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));



            /* wrapping (not saturating) 8-bit subtract:
             * index = pixel - 8 * sao_band_pos */
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            /* indices that went negative are forced to 0xFF */
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            /* indices greater than 31 are forced to 0xFF */
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);


            //rows 0 and 1
            /* split each index into a low-table index (0..15, others 0xFF)
             * and a high-table index (16..31, others 0xFF) */
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //indices 16..31 kept, indices < 16 become 0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            //indices 0..15 kept, indices >= 16 become 0xFF
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //in the high-table index, the zeroed (<16) lanes become 0xFF
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //same split for the second row pair
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //indices 16..31 kept, indices < 16 become 0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            //indices 0..15 kept, indices >= 16 become 0xFF
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //in the high-table index, the zeroed (<16) lanes become 0xFF
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //rows 0 and 1
            /* lanes where low and high index are both 0xFF lie outside the
             * window: keep the original pixel there, zero elsewhere */
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //rows 2 and 3 of this iteration pair
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            /* table lookups; 0xFF control bytes make pshufb return 0 */
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            // combine low-table and high-table lookup results
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            // combine lookup results with the preserved pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);


            //row = 0 store 16 pixel values
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            // row = 1
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);

            pu1_src_cpy += (src_strd << 1);
        }
        pu1_src += 16;
    }
    /* Remainder strip: wd % 16 -- presumably always 0 or 8, since the width
     * is assumed a multiple of 8.  Four rows of 8 pixels are packed into
     * two registers and run through the same pipeline as above. */
    wd_rem = wd & 0xF;
    if(wd_rem)
    {pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 4)
        {


            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            // row = 1
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
            // row = 2
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
            // row = 3
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
            //pack rows 0+1 and rows 2+3 into one register each

            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);

            /* wrapping 8-bit subtract: index = pixel - 8 * sao_band_pos */
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            /* indices that went negative are forced to 0xFF */
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            /* indices greater than 31 are forced to 0xFF */
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);



            //rows 0 and 1
            /* split each index into a low-table index (0..15, others 0xFF)
             * and a high-table index (16..31, others 0xFF) */
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //indices 16..31 kept, indices < 16 become 0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            //indices 0..15 kept, indices >= 16 become 0xFF
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //in the high-table index, the zeroed (<16) lanes become 0xFF
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //rows 2 and 3
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //indices 16..31 kept, indices < 16 become 0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            //indices 0..15 kept, indices >= 16 become 0xFF
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //in the high-table index, the zeroed (<16) lanes become 0xFF
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //rows 0 and 1
            /* lanes where low and high index are both 0xFF lie outside the
             * window: keep the original pixel there, zero elsewhere */
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //rows 2 and 3
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            /* table lookups; 0xFF control bytes make pshufb return 0 */
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            // combine low-table and high-table lookup results
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            // combine lookup results with the preserved pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);

            //extract row 1 from the packed pair
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
            //extract row 3 from the packed pair
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);

            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            // row = 1
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
            // row = 2
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
            // row = 3
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);

            pu1_src_cpy += (src_strd << 2);

        }
        pu1_src += 8;
    }


}
    437 
    438 void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
    439                                         WORD32 src_strd,
    440                                         UWORD8 *pu1_src_left,
    441                                         UWORD8 *pu1_src_top,
    442                                         UWORD8 *pu1_src_top_left,
    443                                         WORD32 sao_band_pos_u,
    444                                         WORD32 sao_band_pos_v,
    445                                         WORD8 *pi1_sao_offset_u,
    446                                         WORD8 *pi1_sao_offset_v,
    447                                         WORD32 wd,
    448                                         WORD32 ht)
    449 {
    450     WORD32 row, col;
    451     WORD8 offset = 0;
    452 
    453 
    454     __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    455     __m128i cmp_msk2;
    456     __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
    457     __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    458     __m128i band_pos_u_16x8b, band_pos_v_16x8b;
    459     __m128i sao_offset;
    460     __m128i cmp_mask;
    461 
    462 
    463     /* Updating left and top and top-left */
    464     for(row = 0; row < ht; row++)
    465     {
    466         pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
    467         pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    468     }
    469     pu1_src_top_left[0] = pu1_src_top[wd - 2];
    470     pu1_src_top_left[1] = pu1_src_top[wd - 1];
    471     for(col = 0; col < wd; col += 8)
    472     {
    473         tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
    474         _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
    475         offset += 8;
    476     }
    477 
    478     { // band _table creation
    479         __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
    480         // Band table for U component : band_table0_16x8b and band_table2_16x8b
    481         //replicating sao_band_pos as 8 bit value 16 times
    482         band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
    483         //value set for sao_offset extraction
    484         tmp_set_128i_1  = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
    485         tmp_set_128i_2  = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
    486         tmp_set_128i_3  = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
    487         tmp_set_128i_4  = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
    488 
    489         //loaded sao offset values
    490         sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    491 
    492         //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
    493         band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
    494         band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
    495         band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
    496         band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
    497 
    498         //band_position addition
    499         band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
    500         band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
    501         band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
    502         band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
    503         //sao_offset duplication
    504         temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
    505         temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
    506         temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
    507         temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
    508 
    509         //sao_offset addition
    510         band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
    511         band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
    512         band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
    513         band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
    514         //reuse for clipping
    515         temp1_8x16b = _mm_set1_epi16(0x00ff);
    516         //settng for comparision
    517         cmp_mask = _mm_set1_epi16(16);
    518 
    519         //masking upper 8bit values of each  16 bit band table value
    520         band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
    521         band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
    522         band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
    523         band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
    524 
    525         //temp1_8x16b reuse for compare storage
    526         switch(sao_band_pos_u)
    527         {
    528             case 0:
    529                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
    530                 band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
    531                 break;
    532             case 28:
    533                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
    534                 band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
    535                 break;
    536             case 29:
    537                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
    538                 band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
    539                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
    540                 band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
    541                 break;
    542             case 30:
    543                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
    544                 band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
    545                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
    546                 band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
    547                 break;
    548             case 31:
    549                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
    550                 band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
    551                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
    552                 band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
    553                 break;
    554             default:
    555                 break;
    556         }
    557         //masking upper 8bit values of each  16 bit band table value
    558         band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
    559         band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
    560         band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
    561         band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
    562         //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
    563         band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
    564         band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
    565         // Band table for U component over
    566 
    567         // Band table for V component : band_table1_16x8b and band_table3_16x8b
    568         // replicating sao_band_pos as 8 bit value 16 times
    569         band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
    570 
    571         //loaded sao offset values
    572         sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    573 
    574         //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
    575         temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
    576         band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
    577         temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
    578         band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
    579 
    580         //band_position addition
    581         temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
    582         band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
    583         temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
    584         band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
    585         //sao_offset duplication
    586         tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
    587         tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
    588         tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
    589         tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
    590 
    591         //sao_offset addition
    592         temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
    593         band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
    594         temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
    595         band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
    596 
    597         //masking upper 8bit values of 16 bit band table value
    598         temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
    599         band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
    600         temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
    601         band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
    602         //temp1_8x16b reuse for compare storage
    603 
    604         switch(sao_band_pos_v)
    605         {
    606             case 0:
    607                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
    608                 temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
    609                 break;
    610             case 28:
    611                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
    612                 band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
    613                 break;
    614             case 29:
    615                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
    616                 temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
    617                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
    618                 band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
    619                 break;
    620             case 30:
    621                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
    622                 band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
    623                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
    624                 temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
    625                 break;
    626             case 31:
    627                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
    628                 temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
    629                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
    630                 band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
    631                 break;
    632             default:
    633                 break;
    634         }
    635         //masking upper 8bit values of each  16 bit band table value
    636         temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
    637         band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
    638         temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
    639         band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
    640         //band table 8x16 four registers are packed into 16x8 two registers:  band_table1_16x8b and band_table3_16x8b
    641         band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
    642         band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
    643         //band table for u and v created
    644     }
    645     {
    646         UWORD8 *pu1_src_cpy;
    647         WORD32 wd_rem;
    648 
    649 
    650         //sao_offset is reused for zero cmp mask.
    651         sao_offset = _mm_setzero_si128();
    652         tmp_set_128i_1 = _mm_set1_epi8(1);
    653         //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
    654         cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
    655         //to avoid 0xffff being saturated to 0; it should saturate to 0xff
    656 
    657         cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
    658         band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
    659         band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
    660         cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
    661 
    662         cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
    663 
    664         for(col = wd; col >= 16; col -= 16)
    665         {
    666             pu1_src_cpy = pu1_src;
    667             for(row = ht; row > 0; row -= 2)
    668             {
    669                 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
    670                 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
    671                 // row = 1
    672                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
    673 
    674 
    675                 //odd values
    676                 src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
    677                 src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
    678                 //even values
    679                 src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
    680                 src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
    681                 src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
    682                 src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
    683                 //combining odd values
    684                 src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
    685                 //combining even values
    686                 src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
    687 
    688                 //saturated subtract 8 bit
    689                 tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
    690                 tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
    691                 //if the values less than 0 put ff
    692                 tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
    693                 tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
    694                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
    695                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
    696                 //if the values greater than 31 put ff
    697                 tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
    698                 tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
    699                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
    700                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
    701                 // registers reused to increase performance
    702                 //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
    703                 src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
    704                 //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
    705                 src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
    706 
    707                 //values 16 to 31 for row 0 & 1 but values <16 ==0
    708                 tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
    709                 // values 0 to 15 for row 0 & 1
    710                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
    711                 //values 16 to 31 for row 2 & 3 but values <16 ==0
    712                 tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
    713                 // values 0 to 15 for row 2 & 3
    714                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
    715 
    716                 //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
    717                 src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
    718                 //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
    719                 src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
    720                 tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
    721                 tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
    722 
    723 
    724                 //to choose which pixel values to preserve in row 0 and row 1
    725                 src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
    726                 //to choose which pixel values to preserve in row 2 and row 3
    727                 src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
    728                 //values of all rows to which no offset needs to be added preserved.
    729                 src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
    730                 src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
    731 
    732                 //indexing 0 - 15 bandtable indexes
    733                 tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
    734                 tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
    735                 //indexing 16 -31 bandtable indexes
    736                 tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
    737                 tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
    738                 // combining all offsets results
    739                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
    740                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
    741                 // combining results with the pixel values
    742                 src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
    743                 src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
    744                 //reorganising even and odd values
    745                 src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
    746                 src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
    747 
    748 
    749                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
    750                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
    751                 // row = 1
    752                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
    753 
    754 
    755                 pu1_src_cpy += (src_strd << 1);
    756 
    757             }
    758             pu1_src += 16;
    759         }
    760 
    761         wd_rem = wd & 0xF;
    762         if(wd_rem)
    763         {
    764             pu1_src_cpy = pu1_src;
    765             for(row = ht; row > 0; row -= 4)
    766             {
    767                 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
    768                 src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
    769                 // row = 1
    770                 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
    771                 // row = 2
    772                 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
    773                 // row = 3
    774                 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
    775                 //row0 and row1 packed and row2 and row3 packed
    776 
    777                 src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
    778                 src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
    779                 //odd values
    780                 src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
    781                 src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
    782                 //even values
    783                 src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
    784                 src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
    785                 src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
    786                 src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
    787                 //combining odd values
    788                 src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
    789                 //combining even values
    790                 src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
    791 
    792                 //saturated subtract 8 bit
    793                 tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
    794                 tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
    795                 //if the values less than 0 put ff
    796                 tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
    797                 tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
    798                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
    799                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
    800                 //if the values greater than 31 put ff
    801                 tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
    802                 tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
    803                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
    804                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
    805                 // registers reused to increase performance
    806                 //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
    807                 src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
    808                 //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
    809                 src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
    810 
    811                 //values 16 to 31 for row 0 & 1 but values <16 ==0
    812                 tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
    813                 // values 0 to 15 for row 0 & 1
    814                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
    815                 //values 16 to 31 for row 2 & 3 but values <16 ==0
    816                 tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
    817                 // values 0 to 15 for row 2 & 3
    818                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
    819 
    820                 //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
    821                 src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
    822                 //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
    823                 src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
    824                 tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
    825                 tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
    826 
    827 
    828                 //to choose which pixel values to preserve in row 0 and row 1
    829                 src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
    830                 //to choose which pixel values to preserve in row 2 and row 3
    831                 src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
    832                 //values of all rows to which no offset needs to be added preserved.
    833                 src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
    834                 src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
    835 
    836                 //indexing 0 - 15 bandtable indexes
    837                 tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
    838                 tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
    839                 //indexing 16 -31 bandtable indexes
    840                 tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
    841                 tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
    842                 // combining all offsets results
    843                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
    844                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
    845                 // combining results with the pixel values
    846                 src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
    847                 src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
    848                 //reorganising even and odd values
    849                 src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
    850                 src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
    851                 //Getting row1 separately
    852                 src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
    853                 //Getting row3 separately
    854                 src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
    855 
    856                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
    857                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
    858                 // row = 1
    859                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
    860                 // row = 2
    861                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
    862                 // row = 3
    863                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
    864 
    865                 pu1_src_cpy += (src_strd << 2);
    866 
    867             }
    868             pu1_src += 16;
    869         }
    870 
    871 
    872     }
    873 }
    874 
    875 
    876 
    877 void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
    878                                         WORD32 src_strd,
    879                                         UWORD8 *pu1_src_left,
    880                                         UWORD8 *pu1_src_top,
    881                                         UWORD8 *pu1_src_top_left,
    882                                         UWORD8 *pu1_src_top_right,
    883                                         UWORD8 *pu1_src_bot_left,
    884                                         UWORD8 *pu1_avail,
    885                                         WORD8 *pi1_sao_offset,
    886                                         WORD32 wd,
    887                                         WORD32 ht)
    888 {
    889     WORD32 row, col;
    890     UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    891     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    892     UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
    893     UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
    894     UWORD8 u1_avail0, u1_avail1;
    895     WORD32 wd_rem;
    896     WORD32 offset = 0;
    897     __m128i src_temp0_16x8b, src_temp1_16x8b;
    898     __m128i left0_16x8b, left1_16x8b;
    899     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
    900     __m128i edge0_16x8b, edge1_16x8b;
    901     __m128i au1_mask8x16b;
    902     __m128i edge_idx_8x16b, sao_offset_8x16b;
    903     __m128i const2_16x8b, const0_16x8b;
    904     __m128i left_store_16x8b;
    905     UNUSED(pu1_src_top_right);
    906     UNUSED(pu1_src_bot_left);
    907 
    908     au1_mask8x16b = _mm_set1_epi8(0xff);
    909 
    910     /* Update  top and top-left arrays */
    911 
    912     *pu1_src_top_left = pu1_src_top[wd - 1];
    913 
    914     for(col = wd; col >= 16; col -= 16)
    915     {
    916         const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
    917         _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
    918         offset += 16;
    919     }
    920 
    921     //setting availability mask to ff size MAX_CTB_SIZE
    922     for(col = 0; col < MAX_CTB_SIZE; col += 16)
    923         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    924     for(row = 0; row < ht; row++)
    925     {
    926         au1_src_left_tmp[row] = pu1_src_left[row];
    927     }
    928     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    929     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
    930 
    931     //availability mask creation
    932     u1_avail0 = pu1_avail[0];
    933     u1_avail1 = pu1_avail[1];
    934     au1_mask[0] = u1_avail0;
    935     au1_mask[wd - 1] = u1_avail1;
    936 
    937     const2_16x8b = _mm_set1_epi8(2);
    938     const0_16x8b = _mm_setzero_si128();
    939     pu1_src_left_cpy = au1_src_left_tmp;
    940     pu1_src_left_str = au1_src_left_tmp1;
    941     {
    942         au1_mask_cpy = au1_mask;
    943         for(col = wd; col >= 16; col -= 16)
    944         {
    945             pu1_src_cpy = pu1_src;
    946             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
    947             //pu1_src_left_cpy =au1_src_left_tmp;
    948             for(row = ht; row > 0; row -= 2)
    949             {
    950 
    951                 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
    952                 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
    953                 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
    954                 // row = 1
    955                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
    956 
    957                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
    958                 //row 1 left
    959                 left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
    960                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
    961                 //row 0 left
    962                 left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
    963                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
    964 
    965 
    966                 //separating +ve and -ve values.
    967                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
    968                 cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
    969                 cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
    970                 cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
    971                 //creating mask 00 for +ve and -ve values and FF for zero.
    972                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
    973                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
    974                 cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
    975                 cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
    976                 //combining the appropriate sign change
    977                 left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
    978                 left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
    979 
    980                 //row = 0 right
    981                 edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
    982                 // row = 1 right
    983                 edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
    984                 //separating +ve and -ve values.
    985                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
    986                 cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
    987                 cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
    988                 cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
    989                 //creating mask 00 for +ve and -ve values and FF for zero.
    990                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
    991                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
    992                 cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
    993                 cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
    994                 //combining the appropriate sign change
    995                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
    996                 edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
    997 
    998                 //combining sign-left and sign_right
    999                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
   1000                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
   1001                 //adding constant 2
   1002                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   1003                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   1004                 //shuffle to get sao index
   1005                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   1006                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   1007                 //using availability mask
   1008                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   1009                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   1010 
   1011                 //shuffle to get sao offset
   1012                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   1013                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   1014                 //convert to 16 bit, then add, then saturated pack
   1015                 left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   1016                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   1017                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
   1018                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   1019                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1020                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
   1021                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   1022                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   1023 
   1024                 left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   1025                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   1026                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
   1027                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
   1028                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1029                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
   1030                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
   1031                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
   1032 
   1033 
   1034                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   1035                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   1036                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   1037                 // row = 1
   1038                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
   1039 
   1040                 pu1_src_cpy += (src_strd << 1);
   1041                 pu1_src_left_cpy += 2;
   1042                 pu1_src_left_str += 2;
   1043             }
   1044             au1_mask_cpy += 16;
   1045             pu1_src += 16;
   1046             pu1_src_left_cpy -= ht;
   1047             pu1_src_left_str -= ht;
   1048 
   1049             pu1_left_tmp = pu1_src_left_cpy;
   1050             pu1_src_left_cpy = pu1_src_left_str;
   1051             pu1_src_left_str = pu1_left_tmp;
   1052         }
   1053 
   1054         wd_rem = wd & 0xF;
   1055         if(wd_rem)
   1056         {
   1057 
   1058             cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
   1059             _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
   1060 
   1061             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
   1062             pu1_src_cpy = pu1_src;
   1063             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
   1064             //pu1_src_left_cpy =au1_src_left_tmp;
   1065             for(row = ht; row > 0; row -= 4)
   1066             {
   1067                 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
   1068                 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
   1069                 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
   1070                 // row = 1
   1071                 cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
   1072                 // row  = 2
   1073                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
   1074                 // row = 3
   1075                 cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
   1076 
   1077 
   1078                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
   1079                 //row 3 left
   1080                 edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
   1081                 cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
   1082                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
   1083                 //row 2 left
   1084                 edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
   1085                 left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
   1086                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
   1087                 //row 1 left
   1088                 edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
   1089                 cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
   1090                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
   1091                 //row 0 left
   1092                 edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   1093                 left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
   1094                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
   1095 
   1096                 // packing rows together for 16 SIMD operations
   1097                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
   1098                 src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
   1099                 // packing rows together for 16 SIMD operations
   1100                 left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
   1101                 left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
   1102 
   1103                 //separating +ve and -ve values.
   1104                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
   1105                 cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
   1106                 cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
   1107                 cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
   1108                 //creating mask 00 for +ve and -ve values and FF for zero.
   1109                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1110                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1111                 cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
   1112                 cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
   1113                 //combining the appropriate sign change
   1114                 left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1115                 left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
   1116 
   1117                 //row = 0 right
   1118                 edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
   1119                 // row = 1 right
   1120                 cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
   1121                 // row = 2 right
   1122                 edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
   1123                 // row = 3 right
   1124                 cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
   1125                 // packing rows together for 16 SIMD operations
   1126                 edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
   1127                 edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
   1128 
   1129                 //separating +ve and and -ve values.
   1130                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
   1131                 cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
   1132                 cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
   1133                 cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
   1134                 //creating mask 00 for +ve and -ve values and FF for zero.
   1135                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1136                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1137                 cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
   1138                 cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
   1139                 //combining the appropriate sign change
   1140                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1141                 edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
   1142 
   1143                 //combining sign-left and sign_right
   1144                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
   1145                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
   1146                 //adding constant 2
   1147                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   1148                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   1149                 //shuffle to get sao index
   1150                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   1151                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   1152                 //shuffle to get sao offset
   1153                 //using availability mask
   1154                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   1155                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   1156 
   1157                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   1158                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   1159                 //cnvert to 16 bit then add and then saturated pack
   1160                 left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   1161                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   1162                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
   1163                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   1164                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1165                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
   1166                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   1167                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   1168 
   1169                 left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   1170                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   1171                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
   1172                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
   1173                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1174                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
   1175                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
   1176                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
   1177                 //separting row 1 and row 3
   1178                 cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   1179                 cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
   1180 
   1181                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   1182                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   1183                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   1184                 // row = 1
   1185                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
   1186                 // row = 2
   1187                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
   1188                 // row = 3
   1189                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
   1190 
   1191                 pu1_src_cpy += (src_strd << 2);
   1192                 pu1_src_left_cpy += 4;
   1193                 pu1_src_left_str += 4;
   1194             }
   1195             pu1_src += wd;
   1196             pu1_src_left_cpy -= ht;
   1197             pu1_src_left_str -= ht;
   1198 
   1199             pu1_left_tmp = pu1_src_left_cpy;
   1200             pu1_src_left_cpy = pu1_src_left_str;
   1201             pu1_src_left_str = pu1_left_tmp;
   1202         }
   1203         for(row = 0; row < ht; row++)
   1204         {
   1205             pu1_src_left[row] = pu1_src_left_cpy[row];
   1206         }
   1207     }
   1208 }
   1209 
   1210 
/**
*******************************************************************************
*
* @brief
*  Performs SAO edge-offset filtering of class 0 (horizontal, 0-degree edges)
*  on an interleaved-UV chroma block, using SSSE3 intrinsics.  Each pixel is
*  compared against its left and right neighbours of the same component
*  (2 bytes away because U and V are interleaved); the two sign results select
*  an edge index, which is mapped through gi1_table_edge_idx and then through
*  the U or V offset table, and the chosen offset is added to the pixel with
*  unsigned 8-bit saturation.  Filtering is done in place.
*
* @param[in,out] pu1_src
*  Pointer to the interleaved chroma block to be filtered in place
*
* @param[in] src_strd
*  Source stride in bytes
*
* @param[in,out] pu1_src_left
*  Left-column pixels, 2 * ht interleaved U/V bytes; on return holds the
*  unfiltered right column of this block (saved before filtering)
*
* @param[in,out] pu1_src_top
*  Top-row pixels; updated with this block's (unfiltered input's) last row
*
* @param[in,out] pu1_src_top_left
*  Top-left U/V pixel pair; updated from the last two bytes of pu1_src_top
*
* @param[in] pu1_src_top_right
*  Unused in class 0 (kept for a uniform SAO function signature)
*
* @param[in] pu1_src_bot_left
*  Unused in class 0 (kept for a uniform SAO function signature)
*
* @param[in] pu1_avail
*  Availability flags: [0] = left neighbour available, [1] = right neighbour
*  available (0x00 or 0xFF, used directly as byte masks)
*
* @param[in] pi1_sao_offset_u
*  SAO offset table for the U component (indexed by edge index)
*
* @param[in] pi1_sao_offset_v
*  SAO offset table for the V component (indexed by edge index)
*
* @param[in] wd
*  Width of the block in bytes (2 * chroma width)
*
* @param[in] ht
*  Height of the block in rows
*
* @returns  None
*
*******************************************************************************
*/
void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    // Two left-column scratch buffers; ping-ponged per 16-wide column strip
    // so the unfiltered right edge of one strip feeds the next strip's left.
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 u1_avail0, u1_avail1;
    WORD32 wd_rem;
    WORD32 offset = 0;

    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i left0_16x8b, left1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    __m128i chroma_offset_8x16b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    au1_mask8x16b = _mm_set1_epi8(0xff);

    /* Update  top and top-left arrays */
    // Top-left becomes the last U/V pair of the previous top row.
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];;

    // Copy this block's last row into the top array for the block below.
    for(col = wd; col >= 16; col -= 16)
    {
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
        offset += 16;
    }
    // Snapshot the incoming left column (2 bytes per row: U then V).
    for(row = 0; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    //setting availability mask to ff of size MAX_CTB_SIZE
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);

    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    // 0x0800 per 16-bit lane = byte pattern {0x00, 0x08}: adds 8 to every
    // odd (V) byte so the shuffle below indexes the V half of the combined
    // U|V offset table built by the unpacklo_epi64 further down.
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
    //availability mask creation
    // Zero out the two edge U/V byte pairs when the neighbour is unavailable.
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[1] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    au1_mask[wd - 2] = u1_avail1;
    // Combined offset table: low 8 bytes = U offsets, high 8 bytes = V offsets.
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        pu1_src_left_cpy = au1_src_left_tmp;
        pu1_src_left_str = au1_src_left_tmp1;
        au1_mask_cpy = au1_mask;
        // Main path: process 16-byte-wide column strips, two rows at a time.
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);

            for(row = ht; row > 0; row -= 2)
            {

                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                // Rotate the 4 left bytes (2 rows x U/V pair) into position so
                // alignr can splice each row's left neighbour pair below.
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
                //row 1 left: src row shifted right by one U/V pair (2 bytes)
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
                // Save this row's rightmost U/V pair as next strip's left column.
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
                //row 0 left
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);


                //separating +ve and -ve values. row 0 left
                // subs_epu8 pair: exactly one of the two is nonzero per byte.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change: -1, 0 or +1 per byte
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values. row 1 left
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);


                //row = 0 right: +2 skips the interleaved other component
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
                // row = 1 right
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                //separating +ve and -ve values. row 0 right
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values. row 1 right
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2 to bias the sum into the 0..4 index range
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask (forces index 0 => offset 0 at edges)
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V halves of the table
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
                // cmpgt with 0 gives the sign-extension byte for each offset.
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                // Save the unfiltered right-column pairs for the next strip.
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            au1_mask_cpy += 16;
            pu1_src += 16;
            pu1_src_left_cpy -= 2 * ht;
            pu1_src_left_str -= 2 * ht;

            // Swap the left-column buffers: this strip's saved right edge
            // becomes the next strip's left input.
            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }

        // Remainder path: an 8-byte-wide strip (4 U/V pairs), 4 rows packed
        // into two registers per iteration.
        wd_rem = wd & 0xF;
        if(wd_rem)
        {

            cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);

            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            pu1_src_cpy = pu1_src;
            // Duplicate the 8-byte mask into both halves for the packed rows.
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);

            for(row = ht; row > 0; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row  = 2
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));


                // Rotate the 8 left bytes (4 rows x U/V pair) into position.
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
                //row 3 left
                edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                //row 2 left
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);


                // packing rows together for 16 SIMD operations
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);

                //row 1 left
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
                edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                //row 0 left
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                // packing rows together for 16 SIMD operations
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);

                //separating +ve and -ve values, for row 2 and row 3
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);





                //separating +ve and -ve values, for row 0 and row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);


                //row = 0 right: +2 skips the interleaved other component
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
                // row = 1 right
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
                // row = 2 right
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
                // row = 3 right
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
                // packing rows together for 16 SIMD operations
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2 to bias the sum into the 0..4 index range
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V halves of the table
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
                // cmpgt with 0 gives the sign-extension byte for each offset.
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //separating row 1 and row 3 from the packed high halves
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);

                // Save the unfiltered right-column pairs for the next strip.
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 8;
                pu1_src_left_str += 8;
            }
            pu1_src += wd;
            pu1_src_left_cpy -= 2 * ht;
            pu1_src_left_str -= 2 * ht;

            // Swap left-column buffers (same ping-pong as the main path).
            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }
        // Publish the final saved right column as the caller's left array.
        for(row = 0; row < 2 * ht; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}
   1580 
   1581 
   1582 void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
   1583                                         WORD32 src_strd,
   1584                                         UWORD8 *pu1_src_left,
   1585                                         UWORD8 *pu1_src_top,
   1586                                         UWORD8 *pu1_src_top_left,
   1587                                         UWORD8 *pu1_src_top_right,
   1588                                         UWORD8 *pu1_src_bot_left,
   1589                                         UWORD8 *pu1_avail,
   1590                                         WORD8 *pi1_sao_offset,
   1591                                         WORD32 wd,
   1592                                         WORD32 ht)
   1593 {
   1594     WORD32 row, col;
   1595     UWORD8 *pu1_src_top_cpy;
   1596     UWORD8 *pu1_src_cpy;
   1597     WORD32 wd_rem;
   1598 
   1599 
   1600     __m128i src_top_16x8b, src_bottom_16x8b;
   1601     __m128i src_temp0_16x8b, src_temp1_16x8b;
   1602     __m128i signup0_16x8b, signdwn1_16x8b;
   1603     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
   1604     __m128i edge0_16x8b, edge1_16x8b;
   1605     __m128i edge_idx_8x16b, sao_offset_8x16b;
   1606     __m128i const2_16x8b, const0_16x8b;
   1607 
   1608     UNUSED(pu1_src_top_right);
   1609     UNUSED(pu1_src_bot_left);
   1610 
   1611 
   1612     /* Updating left and top-left  */
   1613     for(row = 0; row < ht; row++)
   1614     {
   1615         pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
   1616     }
   1617     *pu1_src_top_left = pu1_src_top[wd - 1];
   1618 
   1619 
   1620 
   1621     pu1_src_top_cpy = pu1_src_top;
   1622     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
   1623     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
   1624 
   1625     /* Update height and source pointers based on the availability flags */
   1626     if(0 == pu1_avail[2])
   1627     {
   1628         pu1_src_top_cpy = pu1_src;
   1629         pu1_src += src_strd;
   1630         ht--;
   1631     }
   1632     if(0 == pu1_avail[3])
   1633     {
   1634         ht--;
   1635     }
   1636 
   1637     const2_16x8b = _mm_set1_epi8(2);
   1638     const0_16x8b = _mm_setzero_si128();
   1639 
   1640     {
   1641         WORD32 ht_rem;
   1642         for(col = wd; col >= 16; col -= 16)
   1643         {
   1644             pu1_src_cpy = pu1_src;
   1645             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
   1646             //row = 0
   1647             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
   1648             //separating +ve and and -ve values.
   1649             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   1650             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   1651             //creating mask 00 for +ve and -ve values and FF for zero.
   1652             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1653             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1654             //combining the appropriate sign change
   1655             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1656 
   1657             for(row = ht; row >= 2; row -= 2)
   1658             {
   1659 
   1660                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   1661                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   1662                 // row = 2
   1663                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   1664 
   1665 
   1666                 //row 0 -row1
   1667                 //separating +ve and -ve values.
   1668                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
   1669                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
   1670                 //creating mask 00 for +ve and -ve values and FF for zero.
   1671                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1672                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1673                 //combining the appropriate sign change
   1674                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1675                 //row1-row0
   1676                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
   1677 
   1678                 //row1 -bottom
   1679                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   1680                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   1681                 //creating mask 00 for +ve and -ve values and FF for zero.
   1682                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1683                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1684                 //combining the appropriate sign change
   1685                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1686 
   1687                 //combining sign-left and sign_right
   1688                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   1689                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
   1690 
   1691                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
   1692                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
   1693                 //adding constant 2
   1694                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   1695                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   1696                 //shuffle to get sao index
   1697                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   1698                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   1699                 //shuffle to get sao offset
   1700                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   1701                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   1702                 //copying the next top
   1703                 src_top_16x8b = src_temp1_16x8b;
   1704                 //convert to 16 bit then add and then saturated pack
   1705                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   1706                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   1707                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   1708                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   1709                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1710                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   1711                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   1712                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   1713 
   1714                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   1715                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   1716                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   1717                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
   1718                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1719                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   1720                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
   1721                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
   1722 
   1723                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   1724                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   1725                 // row = 1
   1726                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
   1727 
   1728                 src_temp0_16x8b = src_bottom_16x8b;
   1729                 pu1_src_cpy += (src_strd << 1);
   1730             }
   1731             ht_rem = ht & 0x1;
   1732 
   1733             if(ht_rem)
   1734             {
   1735                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   1736                 //current row -next row
   1737                 //separating +ve and -ve values.
   1738                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
   1739                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
   1740                 //creating mask 00 for +ve and -ve values and FF for zero.
   1741                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1742                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1743                 //combining the appropriate sign change
   1744                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1745                 //adding top and bottom and constant 2
   1746                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   1747                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   1748 
   1749                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   1750                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   1751                 //copying the next top
   1752                 src_top_16x8b = src_temp0_16x8b;
   1753                 //convert to 16 bit then add and then saturated pack
   1754                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   1755                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   1756                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   1757                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   1758                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1759                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   1760                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   1761                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   1762 
   1763                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   1764             }
   1765             if(0 == pu1_avail[3])
   1766             {
   1767                 src_top_16x8b = src_bottom_16x8b;
   1768             }
   1769             //updating top flag
   1770             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   1771             pu1_src += 16;
   1772         }
   1773 
   1774         wd_rem = wd & 0xF;
   1775         if(wd_rem)
   1776         {
   1777             pu1_src_cpy = pu1_src;
   1778             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
   1779             //row = 0
   1780             src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
   1781             //separating +ve and -ve values.
   1782             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   1783             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   1784             //creating mask 00 for +ve and -ve values and FF for zero.
   1785             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1786             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1787             //combining the appropriate sign change
   1788             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1789             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   1790             for(row = ht; row >= 4; row -= 4)
   1791             {
   1792                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   1793                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
   1794                 // row = 2
   1795                 src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
   1796 
   1797                 //row 0 -row1
   1798                 //separating +ve and -ve values.
   1799                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
   1800                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
   1801                 //creating mask 00 for +ve and -ve values and FF for zero.
   1802                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1803                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1804                 //combining the appropriate sign change
   1805                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1806 
   1807                 //row1-row0
   1808                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
   1809                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   1810                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   1811                 //row1 -row2
   1812                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   1813                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   1814                 //creating mask 00 for +ve and -ve values and FF for zero.
   1815                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1816                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1817                 //combining the appropriate sign change
   1818                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   1819                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   1820                 //packing row 0 n row 1
   1821                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
   1822                 //row = 3
   1823                 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
   1824                 // row = 4
   1825                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
   1826 
   1827                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   1828                 signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //allign left (1-2)
   1829                 //separating +ve and -ve values.(2,3)
   1830                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
   1831                 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
   1832                 //creating mask 00 for +ve and -ve values and FF for zero.
   1833                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1834                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1835                 //combining the appropriate sign change
   1836                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
   1837 
   1838                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (substract with down)
   1839                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
   1840                 //separating +ve and -ve values.(3,4)
   1841                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
   1842                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
   1843                 //creating mask 00 for +ve and -ve values and FF for zero.
   1844                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1845                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1846                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
   1847                 //combining sign-left and sign_right
   1848                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
   1849 
   1850                 edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
   1851 
   1852                 //packing row 2 n row 3
   1853                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
   1854                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
   1855                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
   1856 
   1857                 //adding constant 2
   1858                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   1859                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   1860                 //shuffle to get sao index
   1861                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   1862                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   1863                 //shuffle to get sao offset
   1864                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   1865                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   1866                 //the next top already in  src_top_16x8b
   1867                 //src_top_16x8b = src_temp1_16x8b;
   1868                 //convert to 16 bit then add and then saturated pack
   1869                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   1870                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   1871                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   1872                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   1873                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1874                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   1875                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   1876                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   1877 
   1878                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   1879                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
   1880                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   1881                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
   1882                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1883                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   1884                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
   1885                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
   1886 
   1887                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   1888                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
   1889                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   1890                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   1891                 // row = 1
   1892                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   1893                 //row = 2
   1894                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
   1895                 // row = 3
   1896                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
   1897 
   1898                 src_temp0_16x8b = src_temp1_16x8b;
   1899                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   1900                 pu1_src_cpy += (src_strd << 2);
   1901 
   1902             }
   1903             ht_rem = ht & 0x2;
   1904             if(ht_rem)
   1905             {
   1906 
   1907                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   1908                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
   1909                 // row = 2
   1910                 src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
   1911 
   1912                 //row 0 -row1
   1913                 //separating +ve and -ve values.
   1914                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
   1915                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
   1916                 //creating mask 00 for +ve and -ve values and FF for zero.
   1917                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1918                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1919                 //combining the appropriate sign change
   1920                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1921                 //row1-row0
   1922                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
   1923                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   1924                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   1925                 //row1 -row2
   1926                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   1927                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   1928                 //creating mask 00 for +ve and -ve values and FF for zero.
   1929                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1930                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1931                 //combining the appropriate sign change
   1932                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   1933                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   1934                 //adding top and down subtraction
   1935                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   1936                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
   1937                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
   1938                 src_top_16x8b = src_temp1_16x8b;
   1939                 //adding constant 2
   1940                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   1941 
   1942                 //shuffle to get sao index
   1943                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   1944 
   1945                 //shuffle to get sao offset
   1946                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   1947 
   1948                 //the next top already in  src_top_16x8b
   1949                 //convert to 16 bit then add and then saturated pack
   1950                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   1951                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   1952                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   1953                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   1954                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   1955                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   1956                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
   1957                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
   1958 
   1959                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   1960 
   1961                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   1962                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   1963                 // row = 1
   1964                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   1965                 src_temp0_16x8b = src_bottom_16x8b;
   1966                 pu1_src_cpy += (src_strd << 1);
   1967 
   1968             }
   1969             ht_rem = ht & 0x1;
   1970             if(ht_rem)
   1971             {
   1972 
   1973                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   1974                 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
   1975 
   1976                 //row 0 -row1
   1977                 //separating +ve and -ve values.
   1978                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
   1979                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
   1980                 //creating mask 00 for +ve and -ve values and FF for zero.
   1981                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   1982                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   1983                 //combining the appropriate sign change
   1984                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   1985                 //adding top and down subtraction
   1986                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   1987                 //adding constant 2
   1988                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   1989                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
   1990                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
   1991                 //shuffle to get sao index
   1992                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   1993                 //shuffle to get sao offset
   1994                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   1995                 src_top_16x8b = src_temp0_16x8b;
   1996                 //convert to 16 bit then add and then saturated pack
   1997                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   1998                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   1999                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   2000                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   2001                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
   2002                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   2003                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   2004                 pu1_src_cpy += (src_strd);
   2005 
   2006             }
   2007             if(0 == pu1_avail[3])
   2008             {
   2009                 src_top_16x8b = src_bottom_16x8b;
   2010             }
   2011             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   2012             pu1_src += 8;
   2013         }
   2014     }
   2015 }
   2016 
   2017 void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
   2018                                                WORD32 src_strd,
   2019                                                UWORD8 *pu1_src_left,
   2020                                                UWORD8 *pu1_src_top,
   2021                                                UWORD8 *pu1_src_top_left,
   2022                                                UWORD8 *pu1_src_top_right,
   2023                                                UWORD8 *pu1_src_bot_left,
   2024                                                UWORD8 *pu1_avail,
   2025                                                WORD8 *pi1_sao_offset_u,
   2026                                                WORD8 *pi1_sao_offset_v,
   2027                                                WORD32 wd,
   2028                                                WORD32 ht)
   2029 {
   2030     WORD32 row, col;
   2031     UWORD8 *pu1_src_top_cpy;
   2032     UWORD8 *pu1_src_cpy;
   2033     WORD32 wd_rem;
   2034 
   2035 
   2036     __m128i src_top_16x8b, src_bottom_16x8b;
   2037     __m128i src_temp0_16x8b, src_temp1_16x8b;
   2038     __m128i signup0_16x8b, signdwn1_16x8b;
   2039     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
   2040     __m128i edge0_16x8b, edge1_16x8b;
   2041     __m128i edge_idx_8x16b, sao_offset_8x16b;
   2042     __m128i const2_16x8b, const0_16x8b;
   2043     __m128i chroma_offset_8x16b;
   2044 
   2045     UNUSED(pu1_src_top_right);
   2046     UNUSED(pu1_src_bot_left);
   2047 
   2048     /* Updating left and top and top-left */
   2049     for(row = 0; row < ht; row++)
   2050     {
   2051         pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
   2052         pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
   2053     }
   2054     pu1_src_top_left[0] = pu1_src_top[wd - 2];
   2055     pu1_src_top_left[1] = pu1_src_top[wd - 1];
   2056 
   2057 
   2058 
   2059     pu1_src_top_cpy = pu1_src_top;
   2060     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
   2061     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
   2062     const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
   2063     chroma_offset_8x16b = _mm_set1_epi16(0x0800);
   2064     /* Update height and source pointers based on the availability flags */
   2065     if(0 == pu1_avail[2])
   2066     {
   2067         pu1_src_top_cpy = pu1_src;
   2068         pu1_src += src_strd;
   2069         ht--;
   2070     }
   2071     if(0 == pu1_avail[3])
   2072     {
   2073         ht--;
   2074     }
   2075     sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
   2076     const2_16x8b = _mm_set1_epi8(2);
   2077     const0_16x8b = _mm_setzero_si128();
   2078 
   2079 
   2080     {
   2081         WORD32 ht_rem;
   2082 
   2083 
   2084 
   2085         for(col = wd; col >= 16; col -= 16)
   2086         {
   2087             pu1_src_cpy = pu1_src;
   2088             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
   2089             //row = 0
   2090             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
   2091             //separating +ve and -ve values.
   2092             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   2093             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   2094             //creating mask 00 for +ve and -ve values and FF for zero.
   2095             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2096             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2097             //combining the appropriate sign change
   2098             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2099 
   2100             for(row = ht; row >= 2; row -= 2)
   2101             {
   2102 
   2103                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   2104                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   2105                 // row = 2
   2106                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   2107 
   2108 
   2109                 //row 0 -row1
   2110                 //separating +ve and -ve values.
   2111                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
   2112                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
   2113                 //creating mask 00 for +ve and -ve values and FF for zero.
   2114                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2115                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2116                 //combining the appropriate sign change
   2117                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2118                 //row1-row0
   2119                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
   2120 
   2121                 //row1 -bottom
   2122                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   2123                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   2124                 //creating mask 00 for +ve and -ve values and FF for zero.
   2125                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2126                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2127                 //combining the appropriate sign change
   2128                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2129 
   2130                 //combining sign-left and sign_right
   2131                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   2132                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
   2133 
   2134                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
   2135                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
   2136                 //adding constant 2
   2137                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   2138                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   2139                 //copying the next top
   2140                 src_top_16x8b = src_temp1_16x8b;
   2141 
   2142 
   2143                 //shuffle to get sao index
   2144                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   2145                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   2146                 //adding chroma offset to access U and V
   2147                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   2148                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
   2149 
   2150                 //shuffle to get sao offset
   2151                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   2152                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   2153                 //convert to 16 bit then add and then saturated pack
   2154                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   2155                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   2156                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   2157                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   2158                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   2159                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2160                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   2161                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   2162 
   2163                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   2164                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   2165                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   2166                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
   2167                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   2168                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2169                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
   2170                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
   2171                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   2172                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   2173                 // row = 1
   2174                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
   2175 
   2176                 src_temp0_16x8b = src_bottom_16x8b;
   2177                 pu1_src_cpy += (src_strd << 1);
   2178             }
   2179             ht_rem = ht & 0x1;
   2180 
   2181             if(ht_rem)
   2182             {
   2183                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   2184                 //current row -next row
   2185                 //separating +ve and and -ve values.
   2186                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
   2187                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
   2188                 //creating mask 00 for +ve and -ve values and FF for zero.
   2189                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2190                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2191                 //combining the appropriate sign change
   2192                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2193                 //adding top and botton and constant 2
   2194                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   2195                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   2196                 //copying the next top
   2197                 src_top_16x8b = src_temp0_16x8b;
   2198 
   2199                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   2200                 //adding chroma offset to access U and V
   2201                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   2202                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   2203 
   2204                 //cnvert to 16 bit then add and then saturated pack
   2205                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   2206                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   2207                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   2208                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   2209                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   2210                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2211                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   2212                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   2213 
   2214                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   2215             }
   2216             if(0 == pu1_avail[3])
   2217             {
   2218                 src_top_16x8b = src_bottom_16x8b;
   2219             }
   2220             //updating top flag
   2221             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   2222             pu1_src += 16;
   2223         }
   2224 
   2225         wd_rem = wd & 0xF;
   2226         if(wd_rem)
   2227         {
   2228             pu1_src_cpy = pu1_src;
   2229             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
   2230             //row = 0
   2231             src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
   2232             //separating +ve and and -ve values.
   2233             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   2234             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   2235             //creating mask 00 for +ve and -ve values and FF for zero.
   2236             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2237             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2238             //combining the appropriate sign change
   2239             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2240             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   2241             for(row = ht; row >= 4; row -= 4)
   2242             {
   2243                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   2244                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
   2245                 // row = 2
   2246                 src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
   2247 
   2248                 //row 0 -row1
   2249                 //separating +ve and and -ve values.
   2250                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
   2251                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
   2252                 //creating mask 00 for +ve and -ve values and FF for zero.
   2253                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2254                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2255                 //combining the appropriate sign change
   2256                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2257 
   2258                 //row1-row0
   2259                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
   2260                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   2261                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   2262                 //row1 -row2
   2263                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   2264                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   2265                 //creating mask 00 for +ve and -ve values and FF for zero.
   2266                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2267                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2268                 //combining the appropriate sign change
   2269                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   2270                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   2271                 //packing row 0 n row 1
   2272                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
   2273                 //row = 3
   2274                 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
   2275                 // row = 4
   2276                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
   2277 
   2278                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   2279                 signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //allign left (1-2)
   2280                 //separating +ve and and -ve values.(2,3)
   2281                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
   2282                 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
   2283                 //creating mask 00 for +ve and -ve values and FF for zero.
   2284                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2285                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2286                 //combining the appropriate sign change
   2287                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
   2288 
   2289                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (substract with down)
   2290                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
   2291                 //separating +ve and and -ve values.(3,4)
   2292                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
   2293                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
   2294                 //creating mask 00 for +ve and -ve values and FF for zero.
   2295                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2296                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2297                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
   2298                 //combining sign-left and sign_right
   2299                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
   2300 
   2301                 edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
   2302 
   2303                 //packing row 2 n row 3
   2304                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
   2305                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
   2306                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
   2307                 //adding constant 2
   2308                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   2309                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   2310                 //shuffle to get sao index
   2311                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   2312                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   2313                 //adding chroma offset to access U and V
   2314                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   2315                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
   2316 
   2317                 //shuffle to get sao offset
   2318                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   2319                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   2320                 //the next top already in  src_top_16x8b
   2321                 //cnvert to 16 bit then add and then saturated pack
   2322                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   2323                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   2324                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   2325                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   2326                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   2327                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2328                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   2329                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   2330 
   2331                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   2332                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
   2333                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   2334                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
   2335                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   2336                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2337                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
   2338                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
   2339 
   2340                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   2341                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
   2342                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   2343                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   2344                 // row = 1
   2345                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   2346                 //row = 2
   2347                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
   2348                 // row = 3
   2349                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
   2350 
   2351                 src_temp0_16x8b = src_temp1_16x8b;
   2352                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   2353                 pu1_src_cpy += (src_strd << 2);
   2354 
   2355             }
   2356             ht_rem = ht & 0x2;
   2357             if(ht_rem)
   2358             {
   2359 
   2360                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   2361                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
   2362                 // row = 2
   2363                 src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
   2364 
   2365                 //row 0 -row1
   2366                 //separating +ve and and -ve values.
   2367                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
   2368                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
   2369                 //creating mask 00 for +ve and -ve values and FF for zero.
   2370                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2371                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2372                 //combining the appropriate sign change
   2373                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2374                 //row1-row0
   2375                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
   2376                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   2377                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   2378                 //row1 -row2
   2379                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   2380                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   2381                 //creating mask 00 for +ve and -ve values and FF for zero.
   2382                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2383                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2384                 //combining the appropriate sign change
   2385                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   2386                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   2387                 //adding top and down substraction
   2388                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   2389                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
   2390                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
   2391                 src_top_16x8b = src_temp1_16x8b;
   2392 
   2393                 //adding constant 2
   2394                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   2395 
   2396                 //shuffle to get sao index
   2397                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   2398 
   2399                 //adding chroma offset to access U and V
   2400                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   2401                 //shuffle to get sao offset
   2402                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   2403                 //the next top already in  src_top_16x8b
   2404                 //cnvert to 16 bit then add and then saturated pack
   2405                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   2406                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   2407                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   2408                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   2409                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   2410                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   2411                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
   2412                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
   2413 
   2414                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   2415 
   2416                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   2417                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   2418                 // row = 1
   2419                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   2420                 src_temp0_16x8b = src_bottom_16x8b;
   2421                 pu1_src_cpy += (src_strd << 1);
   2422 
   2423             }
   2424             ht_rem = ht & 0x1;
   2425             if(ht_rem)
   2426             {
   2427 
   2428                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   2429                 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
   2430 
   2431                 //row 0 -row1
   2432                 //separating +ve and and -ve values.
   2433                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
   2434                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
   2435                 //creating mask 00 for +ve and -ve values and FF for zero.
   2436                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2437                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2438                 //combining the appropriate sign change
   2439                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2440                 //adding top and down substraction
   2441                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   2442                 //adding constant 2
   2443                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   2444                 src_top_16x8b = src_temp0_16x8b;
   2445 
   2446                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
   2447                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
   2448                 //shuffle to get sao index
   2449                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   2450                 //adding chroma offset to access U and V
   2451                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   2452                 //shuffle to get sao offset
   2453                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   2454 
   2455                 //cnvert to 16 bit then add and then saturated pack
   2456                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   2457                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   2458                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   2459                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   2460                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
   2461                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   2462                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   2463                 pu1_src_cpy += (src_strd);
   2464 
   2465             }
   2466             if(0 == pu1_avail[3])
   2467             {
   2468                 src_top_16x8b = src_bottom_16x8b;
   2469             }
   2470             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   2471             pu1_src += 8;
   2472         }
   2473     }
   2474 }
   2475 
    2476 /* SAO edge offset class 2: filtering along the 135-degree diagonal (top-left to bottom-right neighbours) */
   2477 void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
   2478                                         WORD32 src_strd,
   2479                                         UWORD8 *pu1_src_left,
   2480                                         UWORD8 *pu1_src_top,
   2481                                         UWORD8 *pu1_src_top_left,
   2482                                         UWORD8 *pu1_src_top_right,
   2483                                         UWORD8 *pu1_src_bot_left,
   2484                                         UWORD8 *pu1_avail,
   2485                                         WORD8 *pi1_sao_offset,
   2486                                         WORD32 wd,
   2487                                         WORD32 ht)
   2488 {
   2489     WORD32 row, col;
   2490     UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
   2491     UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
   2492     UWORD8 *pu1_firstleft;
   2493     UWORD8 *pu1_src_cpy, *pu1_src_org;
   2494     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
   2495     UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
   2496     UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
   2497     WORD32 wd_rem;
   2498     UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
   2499     WORD32 ht_tmp, ht_0;
   2500 
   2501     WORD32 bit_depth;
   2502     UWORD8 u1_avail0, u1_avail1;
   2503 
   2504     __m128i src_top_16x8b, src_bottom_16x8b;
   2505     __m128i src_temp0_16x8b, src_temp1_16x8b;
   2506     __m128i signup0_16x8b, signdwn1_16x8b;
   2507     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
   2508     __m128i edge0_16x8b, edge1_16x8b;
   2509     __m128i au1_mask8x16b;
   2510     __m128i edge_idx_8x16b, sao_offset_8x16b;
   2511     __m128i const2_16x8b, const0_16x8b;
   2512     __m128i left_store_16x8b;
   2513     UNUSED(pu1_src_top_right);
   2514     UNUSED(pu1_src_bot_left);
   2515 
   2516     ht_0 = ht; ht_tmp = ht;
   2517     au1_mask8x16b = _mm_set1_epi8(0xff);
   2518 
   2519     //setting availability mask to ff size MAX_CTB_SIZE
   2520     for(col = 0; col < MAX_CTB_SIZE; col += 16)
   2521         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
   2522     for(row = 0; row < ht; row++)
   2523     {
   2524         au1_src_left_tmp[row] = pu1_src_left[row];
   2525     }
   2526     bit_depth = BIT_DEPTH_LUMA;
   2527     pu1_src_org = pu1_src;
   2528     pu1_src_top_cpy = pu1_src_top;
   2529     pu1_src_left_cpy2 = au1_src_left_tmp;
   2530     pu1_src_left_cpy = au1_src_left_tmp;
   2531     pu1_src_left_str2 = au1_src_left_tmp1;
   2532     pu1_src_left_str = au1_src_left_tmp1;
   2533     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
   2534     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
   2535 
   2536 
   2537     /* If top-left is available, process separately */
   2538     if(0 != pu1_avail[4])
   2539     {
   2540         WORD8 edge_idx;
   2541 
   2542         edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
   2543                         SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
   2544 
   2545         edge_idx = gi1_table_edge_idx[edge_idx];
   2546 
   2547         if(0 != edge_idx)
   2548         {
   2549             u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
   2550         }
   2551         else
   2552         {
   2553             u1_pos_0_0_tmp = pu1_src[0];
   2554         }
   2555     }
   2556     else
   2557     {
   2558         u1_pos_0_0_tmp = pu1_src[0];
   2559     }
   2560 
   2561     /* If bottom-right is available, process separately */
   2562     if(0 != pu1_avail[7])
   2563     {
   2564         WORD8 edge_idx;
   2565 
   2566         edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
   2567                         SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
   2568 
   2569         edge_idx = gi1_table_edge_idx[edge_idx];
   2570 
   2571         if(0 != edge_idx)
   2572         {
   2573             u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
   2574         }
   2575         else
   2576         {
   2577             u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
   2578         }
   2579     }
   2580     else
   2581     {
   2582         u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
   2583     }
   2584     pu1_firstleft = pu1_src_top_left;
   2585 
   2586     /* Update height and source pointers based on the availability flags */
   2587     if(0 == pu1_avail[2])
   2588     {
   2589         pu1_firstleft = pu1_src_left_cpy2;
   2590         pu1_src_left_cpy2++;
   2591         pu1_src_left_str2++;
   2592         pu1_src_top_cpy = pu1_src;
   2593         pu1_src += src_strd;
   2594         ht--;
   2595     }
   2596     if(0 == pu1_avail[3])
   2597     {
   2598         ht--;
   2599         ht_0--;
   2600     }
   2601     //storing top left in a mmx register
   2602     left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
   2603     const2_16x8b = _mm_set1_epi8(2);
   2604     const0_16x8b = _mm_setzero_si128();
   2605     left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
   2606     //update top -left
   2607     *pu1_src_top_left = pu1_src_top[wd - 1];
   2608     //availability mask creation
   2609     u1_avail0 = pu1_avail[0];
   2610     u1_avail1 = pu1_avail[1];
   2611     au1_mask[0] = u1_avail0;
   2612     au1_mask[wd - 1] = u1_avail1;
   2613     {
   2614         WORD32 ht_rem;
   2615 
   2616 
   2617         pu1_src_left_cpy = pu1_src_left_cpy2;
   2618         pu1_src_left_str = pu1_src_left_str2;
   2619         au1_mask_cpy = au1_mask;
   2620         for(col = wd; col >= 16; col -= 16)
   2621         {
   2622             pu1_src_cpy = pu1_src;
   2623             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
   2624             //row = 0
   2625             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
   2626             src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
   2627             //loading the mask
   2628             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
   2629             //separating +ve and and -ve values.
   2630             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   2631             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   2632             //creating mask 00 for +ve and -ve values and FF for zero.
   2633             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2634             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2635             //combining the appropriate sign change
   2636             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2637 
   2638 
   2639             for(row = ht; row >= 2; row -= 2)
   2640             {
   2641                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   2642                 //row = 1
   2643                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   2644                 // row = 1 right
   2645                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
   2646                 //to insert left in row 0
   2647                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
   2648                 //row 0 -row1
   2649                 //separating +ve and and -ve values.
   2650                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
   2651                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
   2652 
   2653                 //creating mask 00 for +ve and -ve values and FF for zero.
   2654                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2655                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2656                 //manipulation for row 1 - row 0
   2657                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
   2658                 //combining the appropriate sign change
   2659                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
   2660                 //row1-row0
   2661                 //separating +ve and and -ve values.
   2662                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   2663                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   2664                 //creating mask 00 for +ve and -ve values and FF for zero.
   2665                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2666                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2667                 // row = 2 right
   2668                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
   2669                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
   2670 
   2671 
   2672                 //row1 -bottom
   2673                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   2674                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   2675                 //creating mask 00 for +ve and -ve values and FF for zero.
   2676                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2677                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2678                 //combining the appropriate sign change
   2679                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2680                 // row = 2
   2681                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   2682 
   2683                 //combining sign-left and sign_right
   2684                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   2685 
   2686                 //storing the row 1 left for next row.
   2687                 signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
   2688 
   2689                 //combining sign-left and sign_right
   2690                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
   2691                 //manipulation for bottom - row 1
   2692                 signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
   2693                 //eliminating old left for row 0 and row 1
   2694                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   2695                 //bottom - row1
   2696                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
   2697                 cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
   2698                 //creating mask 00 for +ve and -ve values and FF for zero.
   2699                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2700                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2701                 //for the next iteration bottom -row1
   2702                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2703                 //row1  getting it right for left of next block
   2704                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
   2705                 //adding constant 2
   2706                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   2707                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   2708                 //shuffle to get sao index
   2709                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   2710                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   2711                 //using availability mask
   2712                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   2713                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   2714                 //shuffle to get sao offset
   2715                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   2716                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   2717                 //row0  getting it right for left of next block
   2718                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
   2719                 //copying the next top
   2720                 src_top_16x8b = src_temp1_16x8b;
    2721                 //convert to 16 bit then add and then saturated pack
   2722                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   2723                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   2724                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   2725                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   2726                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2727                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   2728                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   2729                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   2730 
   2731                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   2732                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   2733                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   2734                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
   2735                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2736                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   2737                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
   2738                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
   2739 
   2740                 //store left boundary
   2741                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   2742                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   2743                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   2744                 // row = 1
   2745                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
   2746 
   2747                 src_temp0_16x8b = src_bottom_16x8b;
   2748                 pu1_src_cpy += (src_strd << 1);
   2749                 pu1_src_left_cpy += 2;
   2750                 pu1_src_left_str += 2;
   2751             }
   2752             ht_rem = ht & 0x1;
   2753 
   2754             if(ht_rem)
   2755             {
   2756                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   2757                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
   2758                 //current row -next row
   2759                 //separating +ve and and -ve values.
   2760                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
   2761                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
   2762                 //creating mask 00 for +ve and -ve values and FF for zero.
   2763                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2764                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2765                 //combining the appropriate sign change
   2766                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
    2767                 //adding top and bottom and constant 2
   2768                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   2769                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   2770                 //eliminating old left for row 0 and row 1
   2771                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
   2772 
   2773                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   2774                 //using availability mask
   2775                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   2776 
   2777                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   2778 
   2779                 //row0  getting it right for left of next block
   2780                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
   2781                 //copying the next top
   2782                 src_top_16x8b = src_temp0_16x8b;
    2783                 //convert to 16 bit then add and then saturated pack
   2784                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   2785                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   2786                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   2787                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   2788                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2789                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   2790                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   2791                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   2792                 //store left boundary
   2793                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   2794 
   2795                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   2796                 pu1_src_cpy += (src_strd);
   2797                 pu1_src_left_cpy += 1;
   2798                 pu1_src_left_str += 1;
   2799             }
   2800             if(0 == pu1_avail[3])
   2801             {
   2802                 src_top_16x8b = src_bottom_16x8b;
   2803                 pu1_src_left_str[0] = pu1_src_cpy[15];
   2804             }
   2805             if(0 == pu1_avail[2])
   2806             {
   2807                 pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
   2808             }
   2809 
   2810             //for the top left of next part of the block
   2811             left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
   2812             //updating top flag
   2813             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   2814             pu1_src += 16;
   2815             au1_mask_cpy += 16;
   2816 
   2817 
   2818             pu1_left_tmp = pu1_src_left_cpy2;
   2819             pu1_src_left_cpy2 = pu1_src_left_str2;
   2820             pu1_src_left_str2 = pu1_left_tmp;
   2821 
   2822             pu1_src_left_cpy = pu1_src_left_cpy2;
   2823             pu1_src_left_str = pu1_src_left_str2;
   2824         }
   2825 
   2826         wd_rem = wd & 0xF;
   2827         if(wd_rem)
   2828         {
   2829             pu1_src_left_cpy = pu1_src_left_cpy2;
   2830             pu1_src_left_str = pu1_src_left_str2;
   2831             pu1_src_cpy = pu1_src;
   2832             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
   2833             //row = 0
   2834             src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
   2835             src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
   2836             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
   2837             //separating +ve and and -ve values.
   2838             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   2839             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   2840             //creating mask 00 for +ve and -ve values and FF for zero.
   2841             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2842             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2843             //preparing au1_mask
   2844             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
   2845             //combining the appropriate sign change
   2846             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2847             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   2848 
   2849             for(row = ht; row >= 4; row -= 4)
   2850             {
   2851                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   2852                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   2853                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   2854                 // row = 2
   2855                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   2856                 //right row1
   2857                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
   2858                 //row 0 -row1
   2859                 //separating +ve and and -ve values.
   2860                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   2861                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   2862                 //manipulation for row 1 -row 0
   2863                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
   2864                 //creating mask 00 for +ve and -ve values and FF for zero.
   2865                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2866                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2867                 //row 0 left
   2868                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
   2869                 //combining the appropriate sign change
   2870                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2871                 //row 1 -row0
   2872                 //separating +ve and and -ve values.
   2873                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   2874                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   2875 
   2876                 //creating mask 00 for +ve and -ve values and FF for zero.
   2877                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2878                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2879                 //row1-row0
   2880                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   2881 
   2882                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   2883 
   2884                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   2885                 //right row2
   2886                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
   2887                 //packing row 0 n row 1
   2888                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
   2889                 //row1 -row2
   2890                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   2891                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   2892                 //creating mask 00 for +ve and -ve values and FF for zero.
   2893                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2894                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2895                 //combining the appropriate sign change
   2896                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   2897                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   2898                 //manipulation for row 2 -row 1
   2899                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
   2900                 //row 1 left
   2901                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
   2902                 //row = 3
   2903                 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
   2904 
   2905                 // row = 4
   2906                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
   2907 
   2908                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   2909 
   2910                 //separating +ve and and -ve values.(2,1)
   2911                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   2912                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   2913                 //manipulation for row 3 -row 2
   2914                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
   2915                 //creating mask 00 for +ve and -ve values and FF for zero.
   2916                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2917                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2918                 //row 2 left
   2919                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
   2920                 //combining the appropriate sign change
   2921                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
   2922 
   2923                 //separating +ve and and -ve values.(3,2)
   2924                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
   2925                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
   2926                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
   2927                 //creating mask 00 for +ve and -ve values and FF for zero.
   2928                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2929                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2930                 //right row3
   2931                 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
   2932                 //combining the appropriate sign change
   2933                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
   2934 
   2935                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
   2936 
   2937                 //separating +ve and and -ve values.(2,3)
   2938                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   2939                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   2940                 //right row 4
   2941                 signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 1);
   2942                 //creating mask 00 for +ve and -ve values and FF for zero.
   2943                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2944                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2945                 //combining the appropriate sign change
   2946                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
   2947 
   2948                 //separating +ve and and -ve values.(3,bottom)
   2949                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
   2950                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
   2951 
   2952                 //creating mask 00 for +ve and -ve values and FF for zero.
   2953                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2954                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2955                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
   2956                 //combining the appropriate sign change
   2957                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
   2958                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
   2959 
   2960                 //manipulation for bottom -row 3
   2961                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
   2962                 //eliminating old left for row 0,1,2,3
   2963                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
   2964                 //packing row 2 n row 3
   2965                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
   2966                 //row 3 left
   2967                 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
   2968                 //loading row 3 right into left
   2969                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
   2970                 //adding bottom and top values of row 2 and row 3
   2971                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
    2972                 //separating +ve and -ve values.(bottom,3)
   2973                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   2974                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   2975                 //to store right of row 2
   2976                 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
   2977                 //creating mask 00 for +ve and -ve values and FF for zero.
   2978                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   2979                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   2980                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
   2981 
    2982                 //storing right of row 2 into left
   2983                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
   2984                 //to store right of row 0
   2985                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   2986                 //storing right of row 1 into left
   2987                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
   2988 
   2989                 //adding constant 2
   2990                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   2991                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   2992                 //shuffle to get sao index
   2993                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   2994                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   2995                 //using availability mask
   2996                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   2997                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   2998                 //shuffle to get sao offset
   2999                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   3000                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   3001 
   3002                 //storing right of row 0 into left
   3003                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
    3004                 //convert to 16 bit then add and then saturated pack
   3005                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   3006                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   3007                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   3008                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   3009                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3010                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   3011                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   3012                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   3013 
   3014                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   3015                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
   3016                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   3017                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
   3018                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3019                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   3020                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
   3021                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
   3022 
   3023                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   3024                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
   3025 
   3026                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   3027                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   3028                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   3029                 // row = 1
   3030                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   3031                 //row = 2
   3032                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
   3033                 // row = 3
   3034                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
   3035 
   3036                 src_temp0_16x8b = src_temp1_16x8b;
   3037                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   3038                 pu1_src_cpy += (src_strd << 2);
   3039                 pu1_src_left_cpy += 4;
   3040                 pu1_src_left_str += 4;
   3041             }
   3042             ht_rem = ht & 0x2;
   3043             if(ht_rem)
   3044             {
   3045                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   3046                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   3047                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   3048                 // row = 2
   3049                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   3050 
   3051                 //row 0 -row 1
   3052                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
   3053                 //separating +ve and and -ve values.
   3054                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   3055                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   3056                 //manipulation for row 1 -row 0
   3057                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
   3058                 //creating mask 00 for +ve and -ve values and FF for zero.
   3059                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3060                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3061                 //manipulation for row 1 - row 0
   3062                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
   3063                 //combining the appropriate sign change
   3064                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3065 
   3066                 //row1-row0
   3067                 //separating +ve and and -ve values.
   3068                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   3069                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   3070 
   3071                 //creating mask 00 for +ve and -ve values and FF for zero.
   3072                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3073                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
    3074                 //combining the appropriate sign change
   3075                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3076                 //row 1 -bottom
   3077                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
   3078 
   3079                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   3080                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   3081                 //row1 -bottom
   3082                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   3083                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   3084 
   3085                 //creating mask 00 for +ve and -ve values and FF for zero.
   3086                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3087                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3088                 //combining the appropriate sign change
   3089                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   3090                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   3091                 //manipulation for bottom -row1
   3092                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
   3093                 //manipulation for bottom- row 1
   3094                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
    3095                 //adding top and down subtraction
   3096                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   3097                 //bottom - row 1
   3098                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   3099                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   3100 
   3101                 //eliminating old left for row 0,1
   3102                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   3103                 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
   3104                 //creating mask 00 for +ve and -ve values and FF for zero.
   3105                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3106                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3107                 //for the next iteration signup0_16x8b
   3108                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
   3109 
   3110                 //storing right of row 1 into left
   3111                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
   3112                 //for storing right of row 1
   3113                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   3114 
   3115                 src_top_16x8b = src_temp1_16x8b;
   3116                 //storing right of row 0 into left
   3117                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
   3118 
   3119                 //adding constant 2
   3120                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   3121 
   3122                 //shuffle to get sao index
   3123                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   3124                 //using availability mask
   3125                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   3126                 //shuffle to get sao offset
   3127                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   3128 
   3129                 //the next top already in  src_top_16x8b
    3130                 //convert to 16 bit then add and then saturated pack
   3131                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   3132                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   3133                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   3134                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   3135                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   3136                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   3137                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
   3138                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
   3139 
   3140                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   3141 
   3142                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   3143                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   3144                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   3145                 // row = 1
   3146                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   3147                 src_temp0_16x8b = src_bottom_16x8b;
   3148                 pu1_src_cpy += (src_strd << 1);
   3149                 pu1_src_left_cpy += 2;
   3150                 pu1_src_left_str += 2;
   3151             }
   3152             ht_rem = ht & 0x1;
   3153             if(ht_rem)
   3154             {
   3155                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   3156                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   3157                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   3158                 //left store manipulation 1
   3159                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
   3160                 //row 0 -row1
   3161                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
   3162                 //separating +ve and and -ve values.
   3163                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   3164                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   3165                 //creating mask 00 for +ve and -ve values and FF for zero.
   3166                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3167                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3168                 //combining the appropriate sign change
   3169                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
    3170                 //adding top and down subtraction
   3171                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   3172                 //for row 0 right to put into left store
   3173                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   3174                 //adding constant 2
   3175                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   3176                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
   3177                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
   3178                 //filling the left boundary value
   3179                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
   3180 
   3181                 //shuffle to get sao index
   3182                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   3183                 //using availability mask
   3184                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   3185                 //shuffle to get sao offset
   3186                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   3187                 src_top_16x8b = src_temp0_16x8b;
   3188                 //cnvert to 16 bit then add and then saturated pack
   3189                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   3190                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   3191                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   3192                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   3193                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
   3194 
   3195                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   3196                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   3197                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   3198                 pu1_src_cpy += (src_strd);
   3199                 pu1_src_left_cpy += 1;
   3200                 pu1_src_left_str += 1;
   3201             }
   3202             if(0 == pu1_avail[3])
   3203             {
   3204                 src_top_16x8b = src_bottom_16x8b;
   3205                 pu1_src_left_str[0] = pu1_src_cpy[7];
   3206             }
   3207 
   3208             if(0 == pu1_avail[2])
   3209             {
   3210                 pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
   3211             }
   3212 
   3213             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   3214             pu1_src += 8;
   3215             au1_mask_cpy += 16;
   3216 
   3217             pu1_left_tmp = pu1_src_left_cpy2;
   3218             pu1_src_left_cpy2 = pu1_src_left_str2;
   3219             pu1_src_left_str2 = pu1_left_tmp;
   3220 
   3221             pu1_src_left_cpy = pu1_src_left_cpy2;
   3222             pu1_src_left_str = pu1_src_left_str2;
   3223         }
   3224         pu1_src_org[0] = u1_pos_0_0_tmp;
   3225         pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
   3226         pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
   3227         for(row = 0; row < ht_tmp; row++)
   3228         {
   3229             pu1_src_left[row] = pu1_src_left_cpy[row];
   3230         }
   3231     }
   3232 
   3233 }
   3234 
/* 135 degree (SAO edge offset class 2) filtering for interleaved chroma (Cb/Cr) */
   3236 void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
   3237                                                WORD32 src_strd,
   3238                                                UWORD8 *pu1_src_left,
   3239                                                UWORD8 *pu1_src_top,
   3240                                                UWORD8 *pu1_src_top_left,
   3241                                                UWORD8 *pu1_src_top_right,
   3242                                                UWORD8 *pu1_src_bot_left,
   3243                                                UWORD8 *pu1_avail,
   3244                                                WORD8 *pi1_sao_offset_u,
   3245                                                WORD8 *pi1_sao_offset_v,
   3246                                                WORD32 wd,
   3247                                                WORD32 ht)
   3248 {
   3249     WORD32 row, col;
   3250     UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
   3251     UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
   3252     UWORD8 *pu1_firstleft;
   3253     UWORD8 *pu1_src_cpy, *pu1_src_org;
   3254     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
   3255     UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
   3256     UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
   3257     WORD32 wd_rem;
   3258     UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
   3259     WORD32 ht_tmp;
   3260     WORD32 ht_0;
   3261 
   3262     WORD32 bit_depth;
   3263     UWORD8 u1_avail0, u1_avail1;
   3264 
   3265     __m128i src_temp0_16x8b, src_temp1_16x8b;
   3266     __m128i signup0_16x8b, signdwn1_16x8b;
   3267     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
   3268     __m128i edge0_16x8b, edge1_16x8b;
   3269     __m128i src_top_16x8b, src_bottom_16x8b;
   3270     __m128i au1_mask8x16b;
   3271     __m128i edge_idx_8x16b, sao_offset_8x16b;
   3272     __m128i const2_16x8b, const0_16x8b;
   3273     __m128i left_store_16x8b;
   3274     __m128i chroma_offset_8x16b;
   3275 
   3276     UNUSED(pu1_src_top_right);
   3277     UNUSED(pu1_src_bot_left);
   3278 
   3279     ht_0 = ht; ht_tmp = ht;
   3280     au1_mask8x16b = _mm_set1_epi8(0xff);
   3281     /* Updating left and top-left  */
   3282     for(row = 0; row < 2 * ht; row++)
   3283     {
   3284         au1_src_left_tmp[row] = pu1_src_left[row];
   3285     }
   3286     //setting availability mask to ff size MAX_CTB_SIZE
   3287     for(col = 0; col < MAX_CTB_SIZE; col += 16)
   3288         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
   3289     bit_depth = BIT_DEPTH_LUMA;
   3290     pu1_src_org = pu1_src;
   3291     pu1_src_top_cpy = pu1_src_top;
   3292     pu1_src_left_cpy2 = au1_src_left_tmp;
   3293     pu1_src_left_cpy = au1_src_left_tmp;
   3294     pu1_src_left_str2 = au1_src_left_tmp1;
   3295     pu1_src_left_str = au1_src_left_tmp1;
   3296     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
   3297     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
   3298     const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
   3299     chroma_offset_8x16b = _mm_set1_epi16(0x0800);
   3300 
   3301     /* If top-left is available, process separately */
   3302     if(0 != pu1_avail[4])
   3303     {
   3304         WORD32 edge_idx;
   3305 
   3306         /* U */
   3307         edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
   3308                         SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
   3309 
   3310         edge_idx = gi1_table_edge_idx[edge_idx];
   3311 
   3312         if(0 != edge_idx)
   3313         {
   3314             u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
   3315         }
   3316         else
   3317         {
   3318             u1_pos_0_0_tmp_u = pu1_src[0];
   3319         }
   3320 
   3321         /* V */
   3322         edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
   3323                         SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
   3324 
   3325         edge_idx = gi1_table_edge_idx[edge_idx];
   3326 
   3327         if(0 != edge_idx)
   3328         {
   3329             u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
   3330         }
   3331         else
   3332         {
   3333             u1_pos_0_0_tmp_v = pu1_src[1];
   3334         }
   3335     }
   3336     else
   3337     {
   3338         u1_pos_0_0_tmp_u = pu1_src[0];
   3339         u1_pos_0_0_tmp_v = pu1_src[1];
   3340     }
   3341 
   3342     /* If bottom-right is available, process separately */
   3343     if(0 != pu1_avail[7])
   3344     {
   3345         WORD32 edge_idx;
   3346 
   3347         /* U */
   3348         edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
   3349                         SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
   3350 
   3351         edge_idx = gi1_table_edge_idx[edge_idx];
   3352 
   3353         if(0 != edge_idx)
   3354         {
   3355             u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
   3356         }
   3357         else
   3358         {
   3359             u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
   3360         }
   3361 
   3362         /* V */
   3363         edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
   3364                         SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
   3365 
   3366         edge_idx = gi1_table_edge_idx[edge_idx];
   3367 
   3368         if(0 != edge_idx)
   3369         {
   3370             u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
   3371         }
   3372         else
   3373         {
   3374             u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
   3375         }
   3376     }
   3377     else
   3378     {
   3379         u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
   3380         u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
   3381     }
   3382     pu1_firstleft = pu1_src_top_left;
   3383 
   3384     /* Update height and source pointers based on the availability flags */
   3385     if(0 == pu1_avail[2])
   3386     {
   3387         pu1_firstleft = pu1_src_left_cpy2;
   3388         pu1_src_left_cpy2 += 2;
   3389         pu1_src_left_str2 += 2;
   3390         pu1_src_top_cpy = pu1_src;
   3391         pu1_src += src_strd;
   3392         ht--;
   3393     }
   3394     if(0 == pu1_avail[3])
   3395     {
   3396         ht--;
   3397         ht_0--;
   3398     }
   3399     //storing top left in a mmx register
   3400     left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
   3401     sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
   3402     const2_16x8b = _mm_set1_epi8(2);
   3403     const0_16x8b = _mm_setzero_si128();
   3404     left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
   3405 
   3406     //availability mask creation
   3407     u1_avail0 = pu1_avail[0];
   3408     u1_avail1 = pu1_avail[1];
   3409     au1_mask[0] = u1_avail0;
   3410     au1_mask[1] = u1_avail0;
   3411     au1_mask[wd - 1] = u1_avail1;
   3412     au1_mask[wd - 2] = u1_avail1;
   3413 
   3414     /* top-left arrays */
   3415     pu1_src_top_left[0] = pu1_src_top[wd - 2];
   3416     pu1_src_top_left[1] = pu1_src_top[wd - 1];
   3417     {
   3418         WORD32 ht_rem;
   3419         au1_mask_cpy = au1_mask;
   3420 
   3421         pu1_src_left_cpy = pu1_src_left_cpy2;
   3422         pu1_src_left_str = pu1_src_left_str2;
   3423         for(col = wd; col >= 16; col -= 16)
   3424         {
   3425             pu1_src_cpy = pu1_src;
   3426             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
   3427             //row = 0
   3428             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
   3429             src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
   3430             //loading the mask
   3431             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
   3432             //separating +ve and and -ve values.
   3433             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   3434             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   3435             //creating mask 00 for +ve and -ve values and FF for zero.
   3436             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3437             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3438             //combining the appropriate sign change
   3439             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3440 
   3441 
   3442             for(row = ht; row >= 2; row -= 2)
   3443             {
   3444                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   3445                 //row = 1
   3446                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   3447                 // row = 1 right
   3448                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
   3449                 //to insert left in row 0
   3450                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
   3451                 //row 0 -row1
   3452                 //separating +ve and and -ve values.
   3453                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
   3454                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
   3455 
   3456                 //creating mask 00 for +ve and -ve values and FF for zero.
   3457                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3458                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3459                 //manipulation for row 1 - row 0
   3460                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
   3461                 //combining the appropriate sign change
   3462                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
   3463                 //row1-row0
   3464                 //separating +ve and and -ve values.
   3465                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   3466                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   3467                 //creating mask 00 for +ve and -ve values and FF for zero.
   3468                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3469                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3470                  // row = 2 right
   3471                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
   3472                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
   3473 
   3474 
   3475                 //row1 -bottom
   3476                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   3477                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   3478                 //creating mask 00 for +ve and -ve values and FF for zero.
   3479                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3480                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3481                 //combining the appropriate sign change
   3482                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3483                 // row = 2
   3484                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   3485 
   3486                 //combining sign-left and sign_right
   3487                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   3488 
   3489                 //storing the row 1 left for next row.
   3490                 signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
   3491 
   3492                 //combining sign-left and sign_right
   3493                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
   3494                 //manipulation for bottom - row 1
   3495                 signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
   3496                 //eliminating old left for row 0 and row 1
   3497                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
   3498                 //bottom - row1
   3499                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
   3500                 cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
   3501                 //creating mask 00 for +ve and -ve values and FF for zero.
   3502                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3503                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3504                 //for the next iteration bottom -row1
   3505                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3506                 //row1  getting it right for left of next iteration
   3507                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
   3508                 //copying the next top
   3509                 src_top_16x8b = src_temp1_16x8b;
   3510                 //row0  getting its right for left of next iteration.
   3511                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
   3512 
   3513 
   3514                 //adding constant 2
   3515                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   3516                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   3517                 //shuffle to get sao index
   3518                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   3519                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   3520                 //using availability mask
   3521                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   3522                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   3523                 //adding chroma offset to access U and V
   3524                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   3525                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
   3526 
   3527 
   3528                 //shuffle to get sao offset
   3529                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   3530                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   3531                 //cnvert to 16 bit then add and then saturated pack
   3532                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   3533                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   3534                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   3535                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   3536                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   3537                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3538                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   3539                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   3540 
   3541                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   3542                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   3543                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   3544                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
   3545                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   3546                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3547                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
   3548                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
   3549 
   3550                 //store left boundary
   3551                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   3552                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   3553                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   3554                 // row = 1
   3555                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
   3556 
   3557                 src_temp0_16x8b = src_bottom_16x8b;
   3558                 pu1_src_cpy += (src_strd << 1);
   3559                 pu1_src_left_cpy += 4;
   3560                 pu1_src_left_str += 4;
   3561             }
   3562             ht_rem = ht & 0x1;
   3563 
   3564             if(ht_rem)
   3565             {
   3566                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   3567                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
   3568                 //current row -next row
   3569                 //separating +ve and and -ve values.
   3570                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
   3571                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
   3572                 //creating mask 00 for +ve and -ve values and FF for zero.
   3573                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3574                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3575                 //combining the appropriate sign change
   3576                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3577                 //adding top and botton and constant 2
   3578                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   3579                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   3580 
   3581                 //eliminating old left for row 0 and row 1
   3582                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   3583                 //copying the next top
   3584                 src_top_16x8b = src_temp0_16x8b;
   3585                 //row0  getting it right for left of next block
   3586                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
   3587 
   3588                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   3589                 //using availability mask
   3590                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   3591                 //adding chroma offset to access U and V
   3592                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   3593 
   3594                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   3595 
   3596                 //cnvert to 16 bit then add and then saturated pack
   3597                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   3598                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   3599                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   3600                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   3601                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   3602                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3603                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   3604                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   3605 
   3606                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   3607 
   3608                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   3609                 pu1_src_cpy += (src_strd);
   3610                 pu1_src_left_cpy += 2;
   3611                 pu1_src_left_str += 2;
   3612             }
   3613             if(0 == pu1_avail[3])
   3614             {
   3615                 src_top_16x8b = src_bottom_16x8b;
   3616                 pu1_src_left_str[1] = pu1_src_cpy[15];
   3617                 pu1_src_left_str[0] = pu1_src_cpy[14];
   3618             }
   3619             if(0 == pu1_avail[2])
   3620             {
   3621                 pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
   3622                 pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
   3623             }
   3624 
   3625             //for the top left of next part of the block
   3626             left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
   3627             //updating top flag
   3628             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   3629             pu1_src += 16;
   3630             au1_mask_cpy += 16;
   3631 
   3632             pu1_left_tmp = pu1_src_left_cpy2;
   3633             pu1_src_left_cpy2 = pu1_src_left_str2;
   3634             pu1_src_left_str2 = pu1_left_tmp;
   3635 
   3636             pu1_src_left_cpy = pu1_src_left_cpy2;
   3637             pu1_src_left_str = pu1_src_left_str2;
   3638         }
   3639         wd_rem = wd & 0xF;
   3640         if(wd_rem)
   3641         {
   3642             pu1_src_left_cpy = pu1_src_left_cpy2;
   3643             pu1_src_left_str = pu1_src_left_str2;
   3644             pu1_src_cpy = pu1_src;
   3645             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
   3646             //row = 0
   3647             src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
   3648             src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
   3649             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
   3650             //separating +ve and and -ve values.
   3651             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   3652             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   3653             //creating mask 00 for +ve and -ve values and FF for zero.
   3654             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3655             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3656             //preparing au1_mask
   3657             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
   3658             //combining the appropriate sign change
   3659             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3660             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   3661 
   3662             for(row = ht; row >= 4; row -= 4)
   3663             {
   3664                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   3665                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   3666                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   3667                 // row = 2
   3668                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   3669                 //right row1
   3670                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
   3671                 //row 0 -row1
   3672                 //separating +ve and and -ve values.
   3673                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   3674                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   3675                 //manipulation for row 1 -row 0
   3676                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
   3677                 //creating mask 00 for +ve and -ve values and FF for zero.
   3678                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3679                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3680                 //row 0 left
   3681                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
   3682                 //combining the appropriate sign change
   3683                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3684                 //row 1 -row0
   3685                 //separating +ve and and -ve values.
   3686                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   3687                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   3688 
   3689                 //creating mask 00 for +ve and -ve values and FF for zero.
   3690                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3691                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3692                 //row1-row0
   3693                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3694 
   3695                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   3696 
   3697                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   3698                 //right row2
   3699                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
   3700                 //packing row 0 n row 1
   3701                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
   3702                 //row1 -row2
   3703                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   3704                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   3705                 //creating mask 00 for +ve and -ve values and FF for zero.
   3706                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3707                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3708                 //combining the appropriate sign change
   3709                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   3710                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   3711                 //manipulation for row 2 -row 1
   3712                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
   3713                 //row 1 left
   3714                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
   3715                 //row = 3
   3716                 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
   3717 
   3718                 // row = 4
   3719                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
   3720 
   3721                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   3722 
   3723                 //separating +ve and and -ve values.(2,1)
   3724                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   3725                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   3726                 //manipulation for row 3 -row 2
   3727                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
   3728                 //creating mask 00 for +ve and -ve values and FF for zero.
   3729                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3730                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3731                 //row 2 left
   3732                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
   3733                 //combining the appropriate sign change
   3734                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
   3735 
   3736                 //separating +ve and and -ve values.(3,2)
   3737                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
   3738                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
   3739                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
   3740                 //creating mask 00 for +ve and -ve values and FF for zero.
   3741                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3742                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3743                 //right row3
   3744                 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
   3745                 //combining the appropriate sign change
   3746                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
   3747 
   3748                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
   3749 
   3750                 //separating +ve and and -ve values.(2,3)
   3751                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   3752                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   3753                 //right row 4
   3754                 signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 2);
   3755                 //creating mask 00 for +ve and -ve values and FF for zero.
   3756                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3757                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3758                 //combining the appropriate sign change
   3759                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
   3760 
   3761                 //separating +ve and and -ve values.(3,bottom)
   3762                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
   3763                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
   3764 
   3765                 //creating mask 00 for +ve and -ve values and FF for zero.
   3766                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3767                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3768                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
   3769                 //combining the appropriate sign change
   3770                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
   3771                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
   3772 
   3773                 //manipulation for bottom -row 3
   3774                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
   3775                 //eliminating old left for row 0,1,2,3
   3776                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
   3777                 //packing row 2 n row 3
   3778                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
   3779                 //row 3 left
   3780                 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
   3781 
   3782                 //adding bottom and top values of row 2 and row 3
   3783                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
   3784                 //separating +ve and and -ve values.(botttom,3)
   3785                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   3786                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   3787 
   3788                 //creating mask 00 for +ve and -ve values and FF for zero.
   3789                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3790                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3791                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
   3792 
   3793                 //to store right of row 2
   3794                 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
   3795                 //loading row 3 right into left
   3796                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
   3797                 //storing right of row 2into left
   3798                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   3799                 //to store right of row 0
   3800                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   3801                 //storing right of row 1 into left
   3802                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
   3803                 //storing right of row 0 into left
   3804                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   3805 
   3806                 //adding constant 2
   3807                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   3808                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   3809                 //shuffle to get sao index
   3810                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   3811                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   3812                 //using availability mask
   3813                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   3814                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   3815 
   3816                 //adding chroma offset to access U and V
   3817                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   3818                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
   3819 
   3820                 //shuffle to get sao offset
   3821                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   3822                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   3823                 //cnvert to 16 bit then add and then saturated pack
   3824                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   3825                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   3826                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   3827                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   3828                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   3829                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3830                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   3831                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   3832 
   3833                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   3834                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
   3835                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   3836                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
   3837                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   3838                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3839                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
   3840                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
   3841 
   3842                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   3843                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
   3844 
   3845 
   3846                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   3847                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   3848                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   3849                 // row = 1
   3850                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   3851                 //row = 2
   3852                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
   3853                 // row = 3
   3854                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
   3855 
   3856                 src_temp0_16x8b = src_temp1_16x8b;
   3857                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   3858                 pu1_src_cpy += (src_strd << 2);
   3859                 pu1_src_left_cpy += 8;
   3860                 pu1_src_left_str += 8;
   3861             }
   3862             ht_rem = ht & 0x2;
   3863             if(ht_rem)
   3864             {
   3865                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   3866                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   3867                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   3868                 // row = 2
   3869                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   3870 
   3871                 //row 0 -row 1
   3872                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
   3873                 //separating +ve and and -ve values.
   3874                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   3875                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   3876                 //manipulation for row 1 -row 0
   3877                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
   3878                 //creating mask 00 for +ve and -ve values and FF for zero.
   3879                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3880                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3881                 //manipulation for row 1 - row 0
   3882                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
   3883                 //combining the appropriate sign change
   3884                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3885 
   3886                 //row1-row0
   3887                 //separating +ve and and -ve values.
   3888                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   3889                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   3890 
   3891                 //creating mask 00 for +ve and -ve values and FF for zero.
   3892                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3893                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3894                 //combining the appropriate sign chang
   3895                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3896                 //row 1 -bottom
   3897                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
   3898 
   3899                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   3900                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   3901                 //row1 -bottom
   3902                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   3903                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   3904 
   3905                 //creating mask 00 for +ve and -ve values and FF for zero.
   3906                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3907                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3908                 //combining the appropriate sign change
   3909                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   3910                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   3911                 //manipulation for bottom -row1
   3912                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
   3913                 //eliminating old left for row 0,1
   3914                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
   3915                 //manipulation for bottom- row 1
   3916                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
   3917                 //adding top and down substraction
   3918                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   3919                 //bottom - row 1
   3920                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   3921                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   3922 
   3923                 //shifting row 1
   3924                 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
   3925                 //creating mask 00 for +ve and -ve values and FF for zero.
   3926                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3927                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3928                 //for the next iteration signup0_16x8b
   3929                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
   3930                 //storing right of row 1 into left
   3931                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
   3932                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   3933                 //the next top  in  src_top_16x8b
   3934                 src_top_16x8b = src_temp1_16x8b;
   3935                 //storing right of row 0 into left
   3936                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   3937 
   3938 
   3939                 //adding constant 2
   3940                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   3941 
   3942                 //shuffle to get sao index
   3943                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   3944                 //using availability mask
   3945                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   3946 
   3947                 //adding chroma offset to access U and V
   3948                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   3949 
   3950                 //shuffle to get sao offset
   3951                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   3952                 //the next top already in  src_top_16x8b
   3953                 //cnvert to 16 bit then add and then saturated pack
   3954                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   3955                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   3956                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   3957                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   3958                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   3959                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   3960                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
   3961                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
   3962 
   3963                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   3964 
   3965                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   3966                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   3967                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   3968                 // row = 1
   3969                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   3970                 src_temp0_16x8b = src_bottom_16x8b;
   3971                 pu1_src_cpy += (src_strd << 1);
   3972                 pu1_src_left_cpy += 4;
   3973                 pu1_src_left_str += 4;
   3974             }
   3975             ht_rem = ht & 0x1;
   3976             if(ht_rem)
   3977             {
   3978                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   3979                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   3980                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   3981 
   3982                 //row 0 -row1
   3983                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
   3984                 //separating +ve and and -ve values.
   3985                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   3986                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   3987                 //creating mask 00 for +ve and -ve values and FF for zero.
   3988                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   3989                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   3990                 //combining the appropriate sign change
   3991                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   3992                 //adding top and down substraction
   3993                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   3994 
   3995                 //for row 0 right to put into left store
   3996                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   3997                 //left store manipulation 1
   3998                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   3999                 src_top_16x8b = src_temp0_16x8b;
   4000                 //filling the left boundary value
   4001                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   4002 
   4003                 //adding constant 2
   4004                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   4005                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
   4006                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
   4007 
   4008 
   4009                 //shuffle to get sao index
   4010                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   4011                 //using availability mask
   4012                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   4013                 //adding chroma offset to access U and V
   4014                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   4015 
   4016                 //shuffle to get sao offset
   4017                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   4018 
   4019                 //cnvert to 16 bit then add and then saturated pack
   4020                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   4021                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   4022                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   4023                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   4024                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
   4025 
   4026                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   4027                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   4028                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   4029                 pu1_src_cpy += (src_strd);
   4030                 pu1_src_left_cpy += 2;
   4031                 pu1_src_left_str += 2;
   4032             }
   4033             if(0 == pu1_avail[3])
   4034             {
   4035                 src_top_16x8b = src_bottom_16x8b;
   4036                 pu1_src_left_str[1] = pu1_src_cpy[7];
   4037                 pu1_src_left_str[0] = pu1_src_cpy[6];
   4038             }
   4039 
   4040             if(0 == pu1_avail[2])
   4041             {
   4042                 pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
   4043                 pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
   4044             }
   4045 
   4046             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   4047             pu1_src += 8;
   4048 
   4049             pu1_left_tmp = pu1_src_left_cpy2;
   4050             pu1_src_left_cpy2 = pu1_src_left_str2;
   4051             pu1_src_left_str2 = pu1_left_tmp;
   4052 
   4053             pu1_src_left_cpy = pu1_src_left_cpy2;
   4054             pu1_src_left_str = pu1_src_left_str2;
   4055         }
   4056         pu1_src_org[0] = u1_pos_0_0_tmp_u;
   4057         pu1_src_org[1] = u1_pos_0_0_tmp_v;
   4058         pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
   4059         pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
   4060         pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
   4061         for(row = 0; row < 2 * ht_tmp; row++)
   4062         {
   4063             pu1_src_left[row] = pu1_src_left_cpy[row];
   4064         }
   4065     }
   4066 
   4067 }
   4068 
   4069 void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
   4070                                         WORD32 src_strd,
   4071                                         UWORD8 *pu1_src_left,
   4072                                         UWORD8 *pu1_src_top,
   4073                                         UWORD8 *pu1_src_top_left,
   4074                                         UWORD8 *pu1_src_top_right,
   4075                                         UWORD8 *pu1_src_bot_left,
   4076                                         UWORD8 *pu1_avail,
   4077                                         WORD8 *pi1_sao_offset,
   4078                                         WORD32 wd,
   4079                                         WORD32 ht)
   4080 {
   4081     WORD32 row, col;
   4082     UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
   4083     UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
   4084     UWORD8 *pu1_src_cpy, *pu1_src_org;
   4085     UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
   4086     UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
   4087     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
   4088     WORD32 wd_rem;
   4089     UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
   4090     WORD32 ht_tmp;
   4091     WORD32 bit_depth;
   4092     UWORD8 u1_avail0, u1_avail1;
   4093 
   4094     __m128i src_top_16x8b, src_bottom_16x8b;
   4095     __m128i src_temp0_16x8b, src_temp1_16x8b;
   4096     __m128i signup0_16x8b, signdwn1_16x8b;
   4097     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
   4098     __m128i edge0_16x8b, edge1_16x8b;
   4099     __m128i au1_mask8x16b;
   4100     __m128i edge_idx_8x16b, sao_offset_8x16b;
   4101     __m128i const2_16x8b, const0_16x8b;
   4102     __m128i left_store_16x8b;
   4103 
   4104     ht_tmp = ht;
   4105     au1_mask8x16b = _mm_set1_epi8(0xff);
   4106 
   4107     au1_src_left_tmp[0] = pu1_src[(wd - 1)];
   4108     //manipulation for bottom left
   4109     for(row = 1; row < ht; row++)
   4110     {
   4111         au1_src_left_tmp[row] = pu1_src_left[row];
   4112     }
   4113     au1_src_left_tmp[ht] = pu1_src_bot_left[0];
   4114 
   4115     *pu1_src_top_left = pu1_src_top[wd - 1];
   4116     //setting availability mask to ff size MAX_CTB_SIZE
   4117     for(col = 0; col < MAX_CTB_SIZE; col += 16)
   4118         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
   4119     bit_depth = BIT_DEPTH_LUMA;
   4120     pu1_src_org = pu1_src;
   4121     pu1_src_top_cpy = pu1_src_top;
   4122     pu1_src_left_cpy2 = au1_src_left_tmp;
   4123     pu1_src_left_cpy = au1_src_left_tmp;
   4124     pu1_src_left_str2 = au1_src_left_tmp1;
   4125     pu1_src_left_str = au1_src_left_tmp1;
   4126     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
   4127     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
   4128 
   4129     /* If top-right is available, process separately */
   4130     if(0 != pu1_avail[5])
   4131     {
   4132         WORD32 edge_idx;
   4133 
   4134         edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
   4135                         SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
   4136 
   4137         edge_idx = gi1_table_edge_idx[edge_idx];
   4138 
   4139         if(0 != edge_idx)
   4140         {
   4141             u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
   4142         }
   4143         else
   4144         {
   4145             u1_pos_wd_0_tmp = pu1_src[wd - 1];
   4146         }
   4147     }
   4148     else
   4149     {
   4150         u1_pos_wd_0_tmp = pu1_src[wd - 1];
   4151     }
   4152 
   4153     /* If bottom-left is available, process separately */
   4154     if(0 != pu1_avail[6])
   4155     {
   4156         WORD32 edge_idx;
   4157 
   4158         edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
   4159                         SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
   4160 
   4161         edge_idx = gi1_table_edge_idx[edge_idx];
   4162 
   4163         if(0 != edge_idx)
   4164         {
   4165             u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
   4166         }
   4167         else
   4168         {
   4169             u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
   4170         }
   4171     }
   4172     else
   4173     {
   4174         u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
   4175     }
   4176 
   4177 
   4178 
   4179     /* Update height and source pointers based on the availability flags */
   4180     if(0 == pu1_avail[2])
   4181     {
   4182         pu1_src_left_cpy2++;
   4183         pu1_src_left_str2++;
   4184         pu1_src_top_cpy = pu1_src;
   4185         pu1_src += src_strd;
   4186         ht--;
   4187     }
   4188     if(0 == pu1_avail[3])
   4189     {
   4190         ht--;
   4191     }
   4192 
   4193 
   4194     const2_16x8b = _mm_set1_epi8(2);
   4195     const0_16x8b = _mm_setzero_si128();
   4196 
   4197 
   4198     //availability mask creation
   4199     u1_avail0 = pu1_avail[0];
   4200     u1_avail1 = pu1_avail[1];
   4201     au1_mask[0] = u1_avail0;
   4202     au1_mask[wd - 1] = u1_avail1;
   4203     {
   4204         WORD32 ht_rem;
   4205 
   4206         pu1_src_left_cpy = pu1_src_left_cpy2;
   4207         pu1_src_left_str = pu1_src_left_str2;
   4208         au1_mask_cpy = au1_mask;
   4209         for(col = wd; col >= 16; col -= 16)
   4210         {
   4211             pu1_src_cpy = pu1_src;
   4212             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
   4213             //row = 0
   4214             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
   4215 
   4216             //loading the mask
   4217             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
   4218             //separating +ve and and -ve values.
   4219             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   4220             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   4221             //creating mask 00 for +ve and -ve values and FF for zero.
   4222             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4223             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4224             //combining the appropriate sign change
   4225             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4226 
   4227             for(row = ht; row >= 2; row -= 2)
   4228             {
   4229                 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
   4230                 //row = 1
   4231                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   4232                 //to insert left in row 1
   4233                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
   4234                 // row = 0 right
   4235                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
   4236 
   4237                 //manipulation for row 1 - row 0
   4238                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
   4239                 //row 0 -row1
   4240                 //separating +ve and and -ve values.
   4241                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   4242                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   4243 
   4244                 //creating mask 00 for +ve and -ve values and FF for zero.
   4245                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4246                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4247 
   4248                 //combining the appropriate sign change
   4249                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
   4250                 //combining sign-left and sign_right
   4251                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   4252 
   4253                 //row1-row0
   4254                 //separating +ve and and -ve values.
   4255                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   4256                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   4257                 //creating mask 00 for +ve and -ve values and FF for zero.
   4258                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4259                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4260 
   4261                 // row = 2
   4262                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   4263                 // row = 1 right
   4264                 signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
   4265                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
   4266 
   4267                 //bottom - row1
   4268                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   4269                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   4270                 //creating mask 00 for +ve and -ve values and FF for zero.
   4271                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4272                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4273                 //for the next iteration bottom -row1
   4274                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4275 
   4276                 //to insert left in row 1
   4277                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
   4278                 //manipulation for row 1 - bottom
   4279                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
   4280 
   4281                 //row1 -bottom
   4282                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   4283                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   4284                 //creating mask 00 for +ve and -ve values and FF for zero.
   4285                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4286                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4287                 //combining the appropriate sign change
   4288                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4289 
   4290                 //combining sign-left and sign_right
   4291                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
   4292 
   4293                 //eliminating old left for row 0 and row 1
   4294                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   4295 
   4296                 //row1  getting it right for left of next block
   4297                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
   4298                 //adding constant 2
   4299                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   4300                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   4301                 //shuffle to get sao index
   4302                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   4303                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   4304                 //using availability mask
   4305                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   4306                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   4307                 //shuffle to get sao offset
   4308                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   4309                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   4310                 //row0  getting it right for left of next block
   4311                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
   4312                 //copying the next top
   4313                 src_top_16x8b = src_temp1_16x8b;
   4314                 //cnvert to 16 bit then add and then saturated pack
   4315                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   4316                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   4317                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   4318                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   4319                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4320                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   4321                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   4322                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   4323 
   4324                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   4325                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   4326                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   4327                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
   4328                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4329                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   4330                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
   4331                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
   4332                 //store left boundary
   4333                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   4334                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   4335                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   4336                 // row = 1
   4337                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
   4338 
   4339                 src_temp0_16x8b = src_bottom_16x8b;
   4340                 pu1_src_cpy += (src_strd << 1);
   4341                 pu1_src_left_cpy += 2;
   4342                 pu1_src_left_str += 2;
   4343             }
   4344             ht_rem = ht & 0x1;
   4345 
   4346             if(ht_rem)
   4347             {
   4348                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   4349                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   4350                 //to insert left in row 1
   4351                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
   4352                 //manipulation for row 1 - row 0
   4353                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
   4354 
   4355                 //current row -next row
   4356                 //separating +ve and and -ve values.
   4357                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   4358                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   4359                 //creating mask 00 for +ve and -ve values and FF for zero.
   4360                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4361                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4362                 //combining the appropriate sign change
   4363                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4364                 //adding top and bottom and constant 2
   4365                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   4366                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   4367                 //eliminating old left for row 0 and row 1
   4368                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
   4369 
   4370                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   4371                 //using availability mask
   4372                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   4373 
   4374                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   4375 
   4376                 //row0  getting it right for left of next block
   4377                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
   4378                 //copying the next top
   4379                 src_top_16x8b = src_temp0_16x8b;
    4380                 //convert to 16 bit then add and then saturated pack
   4381                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   4382                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   4383                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   4384                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   4385                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4386                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   4387                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   4388                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   4389                 //store left boundary
   4390                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   4391 
   4392                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   4393                 pu1_src_cpy += (src_strd);
   4394                 src_temp0_16x8b = src_bottom_16x8b;
   4395                 pu1_src_left_cpy++;
   4396                 pu1_src_left_str++;
   4397             }
   4398             {   //for bottom right
   4399                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   4400                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
   4401                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
   4402                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   4403             }
   4404             if(0 == pu1_avail[3])
   4405             {
   4406                 src_top_16x8b = src_bottom_16x8b;
   4407             }
   4408             //for the top left of next part of the block
   4409             left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
   4410             //updating top flag
   4411             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   4412             pu1_src += 16;
   4413             au1_mask_cpy += 16;
   4414 
   4415             pu1_left_tmp = pu1_src_left_cpy2;
   4416             pu1_src_left_cpy2 = pu1_src_left_str2;
   4417             pu1_src_left_str2 = pu1_left_tmp;
   4418 
   4419             pu1_src_left_cpy = pu1_src_left_cpy2;
   4420             pu1_src_left_str = pu1_src_left_str2;
   4421         }
   4422 
   4423         wd_rem = wd & 0xF;
   4424         if(wd_rem)
   4425         {
   4426             pu1_src_cpy = pu1_src;
   4427             pu1_src_left_cpy = pu1_src_left_cpy2;
   4428             pu1_src_left_str = pu1_src_left_str2;
   4429             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
   4430             //row = 0
   4431             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
   4432             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
   4433             //separating +ve and and -ve values.
   4434             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   4435             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   4436             //creating mask 00 for +ve and -ve values and FF for zero.
   4437             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4438             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4439             //preparing au1_mask
   4440             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
   4441             //combining the appropriate sign change
   4442             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4443             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   4444 
   4445             for(row = ht; row >= 4; row -= 4)
   4446             {
   4447                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   4448                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   4449                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   4450                 // row = 2
   4451                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   4452                 //manipulation for row 0 -row 1
   4453                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
   4454                 //row 1 left
   4455                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
   4456                 //row 0 -row1
   4457                 //separating +ve and and -ve values.
   4458                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   4459                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   4460 
   4461                 //creating mask 00 for +ve and -ve values and FF for zero.
   4462                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4463                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
    4464                 //manipulating for row 1 - row 0
   4465                 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
   4466                 //combining the appropriate sign change
   4467                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4468                 //row 1 -row0
   4469                 //separating +ve and and -ve values.
   4470                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   4471                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   4472 
   4473                 //creating mask 00 for +ve and -ve values and FF for zero.
   4474                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4475                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4476                 //row1-row0
   4477                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4478 
   4479                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   4480 
   4481                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   4482                 //manipulation for row 1 -row 2
   4483                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
   4484                 //row 2 left
   4485                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
   4486                 //packing row 0 n row 1
   4487                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
   4488                 //row1 -row2
   4489                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   4490                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   4491                 //creating mask 00 for +ve and -ve values and FF for zero.
   4492                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4493                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4494                 //combining the appropriate sign change
   4495                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   4496                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   4497 
   4498                 //row 1 right
   4499                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
   4500                 //row = 3
   4501                 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
   4502 
   4503                 // row = 4
   4504                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
   4505 
   4506                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   4507 
   4508                 //separating +ve and and -ve values.(2,1)
   4509                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   4510                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   4511 
   4512                 //creating mask 00 for +ve and -ve values and FF for zero.
   4513                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4514                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4515                 //row 2 right
   4516                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
   4517                 //combining the appropriate sign change
   4518                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
   4519 
   4520                 //separating +ve and and -ve values.(3,2)
   4521                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
   4522                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
   4523                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
   4524                 //creating mask 00 for +ve and -ve values and FF for zero.
   4525                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4526                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4527                 //manipulation for row 2 -row 3
   4528                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
   4529                 //row 3 left
   4530                 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
   4531                 //combining the appropriate sign change
   4532                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
   4533 
   4534                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
   4535 
   4536                 //separating +ve and and -ve values.(2,3)
   4537                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   4538                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   4539 
   4540                 //manipulation for row 3 -bottom
   4541                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 11);
   4542                 //bottom left
   4543                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
   4544 
   4545                 //creating mask 00 for +ve and -ve values and FF for zero.
   4546                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4547                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4548                 //combining the appropriate sign change
   4549                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
   4550 
   4551                 //separating +ve and and -ve values.(3,bottom)
   4552                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
   4553                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
   4554 
   4555                 //creating mask 00 for +ve and -ve values and FF for zero.
   4556                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4557                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4558                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
   4559                 //combining the appropriate sign change
   4560                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
   4561                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
   4562 
   4563 
   4564                 //eliminating old left for row 0,1,2,3
   4565                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
   4566                 //packing row 2 n row 3
   4567                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
   4568                 //row 3 right
   4569                 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
   4570                 //loading row 3 right into left
   4571                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
   4572                 //adding bottom and top values of row 2 and row 3
   4573                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
    4574                 //separating +ve and -ve values.(bottom,3)
   4575                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   4576                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   4577                 //to store right of row 2
   4578                 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
   4579                 //creating mask 00 for +ve and -ve values and FF for zero.
   4580                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4581                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4582                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
   4583 
    4584                 //storing right of row 2 into left
   4585                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
   4586                 //to store right of row 0
   4587                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   4588                 //storing right of row 1 into left
   4589                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
   4590 
   4591                 //adding constant 2
   4592                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   4593                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   4594                 //shuffle to get sao index
   4595                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   4596                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   4597                 //using availability mask
   4598                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   4599                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   4600                 //shuffle to get sao offset
   4601                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   4602                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   4603 
   4604                 //storing right of row 0 into left
   4605                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
    4606                 //convert to 16 bit then add and then saturated pack
   4607                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   4608                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   4609                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   4610                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   4611                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4612                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   4613                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   4614                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   4615 
   4616                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   4617                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
   4618                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   4619                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
   4620                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4621                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   4622                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
   4623                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
   4624 
   4625                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   4626                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
   4627 
   4628                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   4629                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   4630                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   4631                 // row = 1
   4632                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   4633                 //row = 2
   4634                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
   4635                 // row = 3
   4636                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
   4637 
   4638                 src_temp0_16x8b = src_temp1_16x8b;
   4639                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   4640                 pu1_src_cpy += (src_strd << 2);
   4641                 pu1_src_left_cpy += 4;
   4642                 pu1_src_left_str += 4;
   4643             }
   4644             ht_rem = ht & 0x2;
   4645             if(ht_rem)
   4646             {
   4647                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   4648                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   4649                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   4650                 // row = 2
   4651                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   4652 
   4653                 //manipulation for row 0 -row 1
   4654                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
   4655                 //bottom left
   4656                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
   4657                 //separating +ve and and -ve values.
   4658                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   4659                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   4660 
   4661                 //creating mask 00 for +ve and -ve values and FF for zero.
   4662                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4663                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4664                 //manipulation for row 1 - row 0
   4665                 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
   4666                 //combining the appropriate sign change
   4667                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4668 
   4669                 //row1-row0
   4670                 //separating +ve and and -ve values.
   4671                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   4672                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   4673 
   4674                 //creating mask 00 for +ve and -ve values and FF for zero.
   4675                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4676                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
    4677                 //combining the appropriate sign change
   4678                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   4679 
   4680                 //manipulation for row 1 -bottom
   4681                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
   4682                 //bottom left
   4683                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
   4684 
   4685                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   4686                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   4687                 //row1 -bottom
   4688                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   4689                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   4690 
   4691                 //creating mask 00 for +ve and -ve values and FF for zero.
   4692                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4693                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4694                 //combining the appropriate sign change
   4695                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   4696                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   4697                 //manipulation for bottom- row 1 (row 1 right)
   4698                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
    4699                 //adding top and down subtraction
   4700                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   4701                 //bottom - row 1
   4702                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   4703                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   4704 
   4705                 //eliminating old left for row 0,1
   4706                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   4707                 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
   4708                 //creating mask 00 for +ve and -ve values and FF for zero.
   4709                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4710                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4711                 //for the next iteration signup0_16x8b
   4712                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
   4713 
   4714                 //storing right of row 1 into left
   4715                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
   4716                 //for storing right of row 1
   4717                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   4718 
   4719                 src_top_16x8b = src_temp1_16x8b;
   4720                 //storing right of row 0 into left
   4721                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
   4722 
   4723                 //adding constant 2
   4724                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   4725 
   4726                 //shuffle to get sao index
   4727                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   4728                 //using availability mask
   4729                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   4730                 //shuffle to get sao offset
   4731                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   4732 
   4733                 //the next top already in  src_top_16x8b
    4734                 //convert to 16 bit then add and then saturated pack
   4735                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   4736                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   4737                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   4738                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   4739                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   4740                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   4741                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
   4742                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
   4743 
   4744                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   4745 
   4746                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   4747                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   4748                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   4749                 // row = 1
   4750                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   4751                 src_temp0_16x8b = src_bottom_16x8b;
   4752                 pu1_src_cpy += (src_strd << 1);
   4753                 pu1_src_left_cpy += 2;
   4754                 pu1_src_left_str += 2;
   4755             }
   4756             ht_rem = ht & 0x1;
   4757             if(ht_rem)
   4758             {
   4759                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   4760                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   4761                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   4762 
   4763 
   4764                 //manipulation for row 0 -bottom
   4765                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
   4766                 //bottom left
   4767                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
   4768                 //separating +ve and and -ve values.
   4769                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   4770                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   4771                 //creating mask 00 for +ve and -ve values and FF for zero.
   4772                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   4773                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   4774                 //combining the appropriate sign change
   4775                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
    4776                 //adding top and down subtraction
   4777                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   4778                 //for row 0 right to put into left store
   4779                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   4780                 //adding constant 2
   4781                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   4782                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
   4783                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
   4784                 //left store manipulation 1
   4785                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
   4786                 //filling the left boundary value
   4787                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
   4788 
   4789                 //shuffle to get sao index
   4790                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   4791                 //using availability mask
   4792                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   4793                 //shuffle to get sao offset
   4794                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   4795                 src_top_16x8b = src_temp0_16x8b;
    4796                 //convert to 16 bit then add and then saturated pack
   4797                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   4798                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   4799                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   4800                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   4801                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
   4802 
   4803                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   4804                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   4805                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   4806                 pu1_src_cpy += (src_strd);
   4807                 src_temp0_16x8b = src_bottom_16x8b;
   4808                 pu1_src_left_cpy++;
   4809                 pu1_src_left_str++;
   4810             }
   4811             {   //for bottom right
   4812                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   4813                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
   4814                 src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   4815                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
   4816                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
   4817             }
   4818             if(0 == pu1_avail[3])
   4819             {
   4820                 src_top_16x8b = src_bottom_16x8b;
   4821             }
   4822             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   4823             pu1_src += 8;
   4824 
   4825             pu1_left_tmp = pu1_src_left_cpy2;
   4826             pu1_src_left_cpy2 = pu1_src_left_str2;
   4827             pu1_src_left_str2 = pu1_left_tmp;
   4828 
   4829             pu1_src_left_cpy = pu1_src_left_cpy2;
   4830             pu1_src_left_str = pu1_src_left_str2;
   4831 
   4832         }
   4833         pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
   4834         pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
   4835         pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
   4836         pu1_src_left[0] = au1_src_left_tmp[0];
   4837         for(row = 1; row < ht_tmp; row++)
   4838         {
   4839             pu1_src_left[row] = pu1_src_left_cpy[row];
   4840         }
   4841     }
   4842 
   4843 }
   4844 
   4845 void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
   4846                                                WORD32 src_strd,
   4847                                                UWORD8 *pu1_src_left,
   4848                                                UWORD8 *pu1_src_top,
   4849                                                UWORD8 *pu1_src_top_left,
   4850                                                UWORD8 *pu1_src_top_right,
   4851                                                UWORD8 *pu1_src_bot_left,
   4852                                                UWORD8 *pu1_avail,
   4853                                                WORD8 *pi1_sao_offset_u,
   4854                                                WORD8 *pi1_sao_offset_v,
   4855                                                WORD32 wd,
   4856                                                WORD32 ht)
   4857 {
   4858     WORD32 row, col;
   4859     UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
   4860     UWORD8 *pu1_src_cpy, *pu1_src_org;
   4861     UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
   4862     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
   4863     WORD32 wd_rem;
   4864     UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
   4865     WORD32 ht_tmp;
   4866     WORD32 bit_depth;
   4867     UWORD8 u1_avail0, u1_avail1;
   4868 
   4869     __m128i src_top_16x8b, src_bottom_16x8b;
   4870     __m128i src_temp0_16x8b, src_temp1_16x8b;
   4871     __m128i signup0_16x8b, signdwn1_16x8b;
   4872     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
   4873     __m128i edge0_16x8b, edge1_16x8b;
   4874     __m128i au1_mask8x16b;
   4875     __m128i edge_idx_8x16b, sao_offset_8x16b;
   4876     __m128i left_store_16x8b;
   4877     __m128i const0_16x8b, const2_16x8b;
   4878     __m128i chroma_offset_8x16b;
   4879 
   4880     ht_tmp = ht;
   4881     au1_mask8x16b = _mm_set1_epi8(0xff);
   4882 
   4883 
   4884     au1_src_left_tmp[0] = pu1_src[(wd - 2)];
   4885     au1_src_left_tmp[1] = pu1_src[(wd - 1)];
   4886     //manipulation for bottom left
   4887     for(row = 2; row < 2 * ht; row++)
   4888     {
   4889         au1_src_left_tmp[row] = pu1_src_left[row];
   4890     }
   4891     au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
   4892     au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
   4893 
   4894     pu1_src_top_left[0] = pu1_src_top[wd - 2];
   4895     pu1_src_top_left[1] = pu1_src_top[wd - 1];
   4896     //setting availability mask to 0xFF for size MAX_CTB_SIZE
   4897     for(col = 0; col < MAX_CTB_SIZE; col += 16)
   4898         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
   4899     bit_depth = BIT_DEPTH_LUMA;
   4900     pu1_src_org = pu1_src;
   4901     pu1_src_top_cpy = pu1_src_top;
   4902     pu1_src_left_cpy2 = au1_src_left_tmp;
   4903     pu1_src_left_cpy = au1_src_left_tmp;
   4904     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
   4905     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
   4906     const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
   4907     chroma_offset_8x16b = _mm_set1_epi16(0x0800);
   4908     /* If top-right is available, process separately */
   4909     if(0 != pu1_avail[5])
   4910     {
   4911         WORD32 edge_idx;
   4912 
   4913         /* U */
   4914         edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
   4915                         SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
   4916 
   4917         edge_idx = gi1_table_edge_idx[edge_idx];
   4918 
   4919         if(0 != edge_idx)
   4920         {
   4921             u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
   4922         }
   4923         else
   4924         {
   4925             u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
   4926         }
   4927 
   4928         /* V */
   4929         edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
   4930                         SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
   4931 
   4932         edge_idx = gi1_table_edge_idx[edge_idx];
   4933 
   4934         if(0 != edge_idx)
   4935         {
   4936             u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
   4937         }
   4938         else
   4939         {
   4940             u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
   4941         }
   4942     }
   4943     else
   4944     {
   4945         u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
   4946         u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
   4947     }
   4948 
   4949     /* If bottom-left is available, process separately */
   4950     if(0 != pu1_avail[6])
   4951     {
   4952         WORD32 edge_idx;
   4953 
   4954         /* U */
   4955         edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
   4956                         SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
   4957 
   4958         edge_idx = gi1_table_edge_idx[edge_idx];
   4959 
   4960         if(0 != edge_idx)
   4961         {
   4962             u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
   4963         }
   4964         else
   4965         {
   4966             u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
   4967         }
   4968 
   4969         /* V */
   4970         edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
   4971                         SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
   4972 
   4973         edge_idx = gi1_table_edge_idx[edge_idx];
   4974 
   4975         if(0 != edge_idx)
   4976         {
   4977             u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
   4978         }
   4979         else
   4980         {
   4981             u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
   4982         }
   4983     }
   4984     else
   4985     {
   4986         u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
   4987         u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
   4988     }
   4989 
   4990 
   4991 
   4992     /* Update height and source pointers based on the availability flags */
   4993     if(0 == pu1_avail[2])
   4994     {
   4995         pu1_src_left_cpy2 += 2;
   4996         pu1_src_top_cpy = pu1_src;
   4997         pu1_src += src_strd;
   4998         ht--;
   4999     }
   5000     if(0 == pu1_avail[3])
   5001     {
   5002         ht--;
   5003     }
   5004 
   5005     sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
   5006     const2_16x8b = _mm_set1_epi8(2);
   5007     const0_16x8b = _mm_setzero_si128();
   5008 
   5009 
   5010     //availability mask creation
   5011     u1_avail0 = pu1_avail[0];
   5012     u1_avail1 = pu1_avail[1];
   5013     au1_mask[0] = u1_avail0;
   5014     au1_mask[1] = u1_avail0;
   5015     au1_mask[wd - 1] = u1_avail1;
   5016     au1_mask[wd - 2] = u1_avail1;
   5017     {
   5018         WORD32 ht_rem;
   5019         au1_mask_cpy = au1_mask;
   5020         for(col = wd; col >= 16; col -= 16)
   5021         {
   5022             pu1_src_cpy = pu1_src;
   5023             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
   5024             //row = 0
   5025             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
   5026 
   5027             //loading the mask
   5028             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
   5029             //separating +ve and -ve values.
   5030             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   5031             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   5032             //creating mask 00 for +ve and -ve values and FF for zero.
   5033             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5034             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5035             //combining the appropriate sign change
   5036             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5037             pu1_src_left_cpy = pu1_src_left_cpy2;
   5038 
   5039             for(row = ht; row >= 2; row -= 2)
   5040             {
   5041                 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
   5042                 //row = 1
   5043                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   5044                 //to insert left in row 1
   5045                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
   5046                 // row = 0 right
   5047                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
   5048 
   5049                 //manipulation for row 1 - row 0
   5050                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
   5051                 //row 0 -row1
   5052                 //separating +ve and and -ve values.
   5053                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   5054                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   5055 
   5056                 //creating mask 00 for +ve and -ve values and FF for zero.
   5057                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5058                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5059 
   5060                 //combining the appropriate sign change
   5061                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
   5062                 //combining sign-left and sign_right
   5063                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   5064 
   5065                 //row1-row0
   5066                 //separating +ve and and -ve values.
   5067                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
   5068                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
   5069                 //creating mask 00 for +ve and -ve values and FF for zero.
   5070                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5071                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5072 
   5073                 // row = 2
   5074                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   5075                 // row = 1 right
   5076                 signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
   5077                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
   5078 
   5079                 //bottom - row1
   5080                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   5081                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   5082                 //creating mask 00 for +ve and -ve values and FF for zero.
   5083                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5084                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5085                 //for the next iteration bottom -row1
   5086                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5087 
   5088                 //to insert left in row 1
   5089                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
   5090                 //manipulation for row 1 - bottom
   5091                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
   5092 
   5093                 //row1 -bottom
   5094                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   5095                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   5096                 //creating mask 00 for +ve and -ve values and FF for zero.
   5097                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5098                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5099                 //combining the appropriate sign change
   5100                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5101 
   5102                 //combining sign-left and sign_right
   5103                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
   5104 
   5105                 //eliminating old left for row 0 and row 1
   5106                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
   5107                 //row1  getting it right for left of next block
   5108                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
   5109                 //row0  getting it right for left of next block
   5110                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
   5111                 //copying the next top
   5112                 src_top_16x8b = src_temp1_16x8b;
   5113 
   5114 
   5115                 //adding constant 2
   5116                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   5117                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   5118                 //shuffle to get sao index
   5119                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   5120                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   5121                 //using availability mask
   5122                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   5123                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   5124 
   5125                 //adding chroma offset to access U and V
   5126                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   5127                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
   5128 
   5129                 //shuffle to get sao offset
   5130                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   5131                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   5132                 //convert to 16 bit then add and then saturated pack
   5133                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   5134                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   5135                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   5136                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   5137                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   5138                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5139                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   5140                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   5141 
   5142                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   5143                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   5144                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   5145                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
   5146                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   5147                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5148                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
   5149                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
   5150                 //store left boundary
   5151                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
   5152                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   5153                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   5154                 // row = 1
   5155                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
   5156 
   5157                 src_temp0_16x8b = src_bottom_16x8b;
   5158                 pu1_src_cpy += (src_strd << 1);
   5159                 pu1_src_left_cpy += 4;
   5160             }
   5161             ht_rem = ht & 0x1;
   5162 
   5163             if(ht_rem)
   5164             {
   5165                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   5166                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   5167                 //to insert left in row 1
   5168                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
   5169                 //manipulation for row 1 - row 0
   5170                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
   5171 
   5172                 //current row -next row
   5173                 //separating +ve and -ve values.
   5174                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   5175                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   5176                 //creating mask 00 for +ve and -ve values and FF for zero.
   5177                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5178                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5179                 //combining the appropriate sign change
   5180                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5181                 //adding top and bottom and constant 2
   5182                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   5183                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   5184                 //eliminating old left for row 0 and row 1
   5185                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   5186                 //row0  getting it right for left of next block
   5187                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
   5188                 //copying the next top
   5189                 src_top_16x8b = src_temp0_16x8b;
   5190 
   5191                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   5192                 //using availability mask
   5193                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   5194 
   5195                 //adding chroma offset to access U and V
   5196                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   5197 
   5198 
   5199                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   5200 
   5201                 //convert to 16 bit then add and then saturated pack
   5202                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   5203                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   5204                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   5205                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   5206                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   5207                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5208                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   5209                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   5210 
   5211                 //store left boundary
   5212                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
   5213 
   5214                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   5215                 pu1_src_cpy += (src_strd);
   5216                 src_temp0_16x8b = src_bottom_16x8b;
   5217                 pu1_src_left_cpy += 2;
   5218             }
   5219             {   //for bottom right
   5220                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   5221                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   5222                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
   5223                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
   5224             }
   5225             if(0 == pu1_avail[3])
   5226             {
   5227                 src_top_16x8b = src_bottom_16x8b;
   5228             }
   5229             //for the top left of next part of the block
   5230             left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
   5231             //updating top flag
   5232             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   5233             pu1_src += 16;
   5234             au1_mask_cpy += 16;
   5235         }
   5236         pu1_src_left_cpy = pu1_src_left_cpy2;
   5237         wd_rem = wd & 0xF;
   5238         if(wd_rem)
   5239         {
   5240             pu1_src_cpy = pu1_src;
   5241             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
   5242             //row = 0
   5243             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
   5244             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //load low 8 bytes of mask; duplicated into both halves below
   5245             //separating +ve and -ve values.
   5246             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
   5247             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
   5248             //creating mask 00 for +ve and -ve values and FF for zero.
   5249             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5250             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5251             //preparing au1_mask
   5252             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
   5253             //combining the appropriate sign change
   5254             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5255             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   5256             pu1_src_left_cpy = pu1_src_left_cpy2;
   5257             for(row = ht; row >= 4; row -= 4)
   5258             {
   5259                 left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
   5260                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   5261                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   5262                 // row = 2
   5263                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   5264                 //manipulation for row 0 -row 1
   5265                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
   5266                 //row 1 left
   5267                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
   5268                 //row 0 -row1
   5269                 //separating +ve and and -ve values.
   5270                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   5271                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   5272 
   5273                 //creating mask 00 for +ve and -ve values and FF for zero.
   5274                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5275                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5276                 //manipulating for row 1 - row 0
   5277                 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
   5278                 //combining the appropriate sign change
   5279                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5280                 //row 1 -row0
   5281                 //separating +ve and and -ve values.
   5282                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   5283                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   5284 
   5285                 //creating mask 00 for +ve and -ve values and FF for zero.
   5286                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5287                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5288                 //row1-row0
   5289                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5290 
   5291                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   5292 
   5293                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   5294                 //manipulation for row 1 -row 2
   5295                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
   5296                 //row 2 left
   5297                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
   5298                 //packing row 0 n row 1
   5299                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
   5300                 //row1 -row2
   5301                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   5302                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   5303                 //creating mask 00 for +ve and -ve values and FF for zero.
   5304                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5305                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5306                 //combining the appropriate sign change
   5307                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   5308                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   5309 
   5310                 //row 1 right
   5311                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
   5312                 //row = 3
   5313                 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
   5314 
   5315                 // row = 4
   5316                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
   5317 
   5318                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   5319 
   5320                 //separating +ve and and -ve values.(2,1)
   5321                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   5322                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   5323 
   5324                 //creating mask 00 for +ve and -ve values and FF for zero.
   5325                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5326                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5327                 //row 2 right
   5328                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
   5329                 //combining the appropriate sign change
   5330                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
   5331 
   5332                 //separating +ve and and -ve values.(3,2)
   5333                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
   5334                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
   5335                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
   5336                 //creating mask 00 for +ve and -ve values and FF for zero.
   5337                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5338                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5339                 //manipulation for row 2 -row 3
   5340                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
   5341                 //row 3 left
   5342                 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
   5343                 //combining the appropriate sign change
   5344                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
   5345 
   5346                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
   5347 
   5348                 //separating +ve and and -ve values.(2,3)
   5349                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   5350                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   5351 
   5352                 //manipulation for row 3 -bottom
   5353                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 6);
   5354                 //bottom left
   5355                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
   5356 
   5357                 //creating mask 00 for +ve and -ve values and FF for zero.
   5358                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5359                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5360                 //combining the appropriate sign change
   5361                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
   5362 
   5363                 //separating +ve and and -ve values.(3,bottom)
   5364                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
   5365                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
   5366 
   5367                 //creating mask 00 for +ve and -ve values and FF for zero.
   5368                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5369                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5370                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
   5371                 //combining the appropriate sign change
   5372                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
   5373                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
   5374 
   5375 
   5376                 //eliminating old left for row 0,1,2,3
   5377                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
   5378                 //packing row 2 n row 3
   5379                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
   5380                 //row 3 right
   5381                 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
   5382                 //loading row 3 right into left
   5383                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
   5384                 //adding bottom and top values of row 2 and row 3
   5385                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
    5386                 //separating +ve and -ve values. (bottom, 3)
   5387                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   5388                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   5389                 //to store right of row 2
   5390                 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
   5391                 //creating mask 00 for +ve and -ve values and FF for zero.
   5392                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5393                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5394                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
   5395 
    5396                 //storing right of row 2 into left
   5397                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   5398                 //to store right of row 0
   5399                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   5400                 //storing right of row 1 into left
   5401                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
   5402                 //storing right of row 0 into left
   5403                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   5404 
   5405 
   5406                 //adding constant 2
   5407                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   5408                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
   5409                 //shuffle to get sao index
   5410                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   5411                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
   5412                 //using availability mask
   5413                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   5414                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
   5415                 //adding chroma offset to access U and V
   5416                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   5417                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
   5418                 //shuffle to get sao offset
   5419                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   5420                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
   5421 
    5422                 //convert to 16 bit, then add, then saturated pack
   5423                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   5424                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   5425                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   5426                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
   5427                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   5428                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5429                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
   5430                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
   5431 
   5432                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
   5433                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
   5434                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
   5435                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
   5436                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
   5437                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5438                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
   5439                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
   5440 
   5441                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   5442                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
   5443                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
   5444                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   5445                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   5446                 // row = 1
   5447                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   5448                 //row = 2
   5449                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
   5450                 // row = 3
   5451                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
   5452 
   5453                 src_temp0_16x8b = src_temp1_16x8b;
   5454                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
   5455                 pu1_src_cpy += (src_strd << 2);
   5456                 pu1_src_left_cpy += 8;
   5457             }
   5458             ht_rem = ht & 0x2;
   5459             if(ht_rem)
   5460             {
   5461                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   5462                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   5463                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   5464                 // row = 2
   5465                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
   5466 
   5467                 //manipulation for row 0 -row 1
   5468                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
   5469                 //bottom left
   5470                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
   5471                 //separating +ve and and -ve values.
   5472                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   5473                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   5474 
   5475                 //creating mask 00 for +ve and -ve values and FF for zero.
   5476                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5477                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5478                 //manipulation for row 1 - row 0
   5479                 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
   5480                 //combining the appropriate sign change
   5481                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5482 
   5483                 //row1-row0
   5484                 //separating +ve and and -ve values.
   5485                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   5486                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   5487 
   5488                 //creating mask 00 for +ve and -ve values and FF for zero.
   5489                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5490                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
    5491                 //combining the appropriate sign change
   5492                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
   5493 
   5494                 //manipulation for row 1 -bottom
   5495                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
   5496                 //bottom left
   5497                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
   5498 
   5499                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
   5500                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
   5501                 //row1 -bottom
   5502                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
   5503                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
   5504 
   5505                 //creating mask 00 for +ve and -ve values and FF for zero.
   5506                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5507                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5508                 //combining the appropriate sign change
   5509                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
   5510                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
   5511 
   5512                 //manipulation for bottom- row 1 (row 1 right)
   5513                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
    5514                 //adding top and down subtraction
   5515                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
   5516                 //bottom - row 1
   5517                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
   5518                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
   5519 
   5520                 //eliminating old left for row 0,1
   5521                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
   5522                 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
   5523                 //creating mask 00 for +ve and -ve values and FF for zero.
   5524                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5525                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5526                 //for the next iteration signup0_16x8b
   5527                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
   5528 
   5529                 //storing right of row 1 into left
   5530                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   5531                 //for storing right of row 1
   5532                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   5533 
   5534                 src_top_16x8b = src_temp1_16x8b;
   5535                 //storing right of row 0 into left
   5536                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   5537 
   5538                 //adding constant 2
   5539                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   5540 
   5541                 //shuffle to get sao index
   5542                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   5543                 //using availability mask
   5544                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   5545                 //adding chroma offset to access U and V
   5546                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   5547                 //shuffle to get sao offset
   5548                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   5549                 //the next top already in  src_top_16x8b
    5550                 //convert to 16 bit, then add, then saturated pack
   5551                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   5552                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   5553                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   5554                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
   5555                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
   5556                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   5557                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
   5558                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
   5559 
   5560                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
   5561 
   5562                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
   5563                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   5564                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   5565                 // row = 1
   5566                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
   5567                 src_temp0_16x8b = src_bottom_16x8b;
   5568                 pu1_src_cpy += (src_strd << 1);
   5569                 pu1_src_left_cpy += 4;
   5570             }
   5571             ht_rem = ht & 0x1;
   5572             if(ht_rem)
   5573             {
   5574                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   5575                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
   5576                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
   5577 
   5578 
   5579                 //manipulation for row 0 -bottom
   5580                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
   5581                 //bottom left
   5582                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
   5583                 //separating +ve and and -ve values.
   5584                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
   5585                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
   5586                 //creating mask 00 for +ve and -ve values and FF for zero.
   5587                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
   5588                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
   5589                 //combining the appropriate sign change
   5590                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
    5591                 //adding top and down subtraction
   5592                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
   5593                 //for row 0 right to put into left store
   5594                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   5595                 //adding constant 2
   5596                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
   5597                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
   5598                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
   5599                 //left store manipulation 1
   5600                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   5601                 //filling the left boundary value
   5602                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
   5603                 src_top_16x8b = src_temp0_16x8b;
   5604 
   5605                 //shuffle to get sao index
   5606                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
   5607                 //using availability mask
   5608                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
   5609                 //adding chroma offset to access U and V
   5610                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
   5611                 //shuffle to get sao offset
   5612                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
   5613 
    5614                 //convert to 16 bit, then add, then saturated pack
   5615                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
   5616                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
   5617                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
   5618                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
   5619                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
   5620 
   5621                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
   5622                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
   5623                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
   5624                 pu1_src_cpy += (src_strd);
   5625                 src_temp0_16x8b = src_bottom_16x8b;
   5626                 pu1_src_left_cpy += 2;
   5627             }
   5628             {   //for bottom right
   5629                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
   5630                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
   5631                 src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
   5632                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
   5633                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
   5634             }
   5635             if(0 == pu1_avail[3])
   5636             {
   5637                 src_top_16x8b = src_bottom_16x8b;
   5638             }
   5639 
   5640             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
   5641             pu1_src += 8;
   5642         }
   5643         pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
   5644         pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
   5645         pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
   5646         pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
   5647         for(row = 0; row < 2 * ht_tmp; row++)
   5648         {
   5649             pu1_src_left[row] = au1_src_left_tmp[row];
   5650         }
   5651     }
   5652 
   5653 }
   5654