/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
/*********************************************************************************/
/*  Filename:    sad_mb_offset.h                                                 */
/*  Description: Implementation of in-line functions for word-offset SAD        */
/*               (sum of absolute differences) used in motion estimation        */
/*  Modified:                                                                    */
/*********************************************************************************/

#if !defined(PV_ARM_GCC_V4) && !defined(PV_ARM_GCC_V5) /* generic C version (not the ARM GNU compiler) */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    //  x5 = (x4 << 8) - x4;
    x4 = x5 = 0;
    x6 = 0xFFFF00FF;
    x9 = 0x80808080;    /* const. */
    ref -= NUMBER;      /* bic  ref, ref, #3 */
    ref -= lx;
    blk -= 16;
    x8 = 16;

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx));  /* D C B A */
    x11 = *((uint32*)(ref + 4));    /* H G F E */
    x12 = *((uint32*)(ref + 8));    /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);       /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));  /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);       /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));  /* K J I H */

    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                  /* accumulate low bytes  */
    x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00      */
    x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
    x5 = x5 + x11;                  /* accumulate low bytes  */
    x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00      */
    x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));    /* D C B A */
    x11 = *((uint32*)(ref + 12));   /* H G F E */
    x12 = *((uint32*)(ref + 16));   /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);       /* mvn x10, x10, lsr #24  =  0xFF 0xFF 0xFF ~D */
    x10 = x10 | (x11 << (32 - SHIFT));  /* bic x10, x10, x11, lsl #8  =  ~G ~F ~E ~D   */
    x11 = ((uint32)x11 >> SHIFT);       /* 0xFF 0xFF 0xFF ~H */
    x11 = x11 | (x12 << (32 - SHIFT));  /* ~K ~J ~I ~H       */

    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                  /* accumulate low bytes  */
    x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00      */
    x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
    x5 = x5 + x11;                  /* accumulate low bytes  */
    x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00      */
    x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

    /****************/
    x10 = x5 - (x4 << 8);       /* extract low bytes        */
    x10 = x10 + x4;             /* add with high bytes      */
    x10 = x10 + (x10 << 16);    /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin)    /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}

#elif defined(__CC_ARM)  /* only works with ARM v5 */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080;    /* const. */
    x4 = x5 = 0;

    __asm{
        MVN  x6, #0xff0000;
        BIC  ref, ref, #3;

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    }
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    __asm{
        MVN  x10, x10, lsr #SHIFT;
        BIC  x10, x10, x11, lsl #(32-SHIFT);
        MVN  x11, x11, lsr #SHIFT;
        BIC  x11, x11, x12, lsl #(32-SHIFT);

        LDR  x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    __asm{
        /****** process 8 pixels ******/
        LDR  x11, [ref, #4];
        LDR  x12, [ref, #8];
        LDR  x10, [ref], lx;
        LDR  x14, [blk, #4];

        MVN  x10, x10, lsr #SHIFT;
        BIC  x10, x10, x11, lsl #(32-SHIFT);
        MVN  x11, x11, lsr #SHIFT;
        BIC  x11, x11, x12, lsl #(32-SHIFT);

        LDR  x12, [blk], #16;
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8);       /* extract low bytes        */
    x10 = x10 + x4;             /* add with high bytes      */
    x10 = x10 + (x10 << 16);    /* add with lower half word */

    __asm{
        RSBS    x11, dmin, x10, lsr #16
        ADDLSS  x8, x8, #INC_X8
#if (NUMBER==3)
        BLS     LOOP_SAD3;
#elif (NUMBER==2)
        BLS     LOOP_SAD2;
#elif (NUMBER==1)
        BLS     LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}

#elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    //  x5 = (x4 << 8) - x4;
    x4 = x5 = 0;
    x6 = 0xFFFF00FF;
    x9 = 0x80808080;    /* const. */
    ref -= NUMBER;      /* bic  ref, ref, #3 */
    ref -= lx;
    x8 = 16;

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx));  /* D C B A */
    x11 = *((uint32*)(ref + 4));    /* H G F E */
    x12 = *((uint32*)(ref + 8));    /* L K J I */

    int32 shift = SHIFT;
    int32 shift2 = 32 - SHIFT;
    asm volatile("ldr  %3, [%4, #4]\n\t"
                 "mvn  %0, %0, lsr %5\n\t"
                 "bic  %0, %0, %1, lsl %6\n\t"
                 "mvn  %1, %1, lsr %5\n\t"
                 "bic  %1, %1, %2, lsl %6\n\t"
                 "ldr  %2, [%4, #8]"
                 : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
                 : "r"(blk), "r"(shift), "r"(shift2));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    sum_accumulate;

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));    /* D C B A */
    x11 = *((uint32*)(ref + 12));   /* H G F E */
    x12 = *((uint32*)(ref + 16));   /* L K J I */

    asm volatile("ldr  %3, [%4, #4]\n\t"
                 "mvn  %0, %0, lsr %5\n\t"
                 "bic  %0, %0, %1, lsl %6\n\t"
                 "mvn  %1, %1, lsr %5\n\t"
                 "bic  %1, %1, %2, lsl %6\n\t"
                 "ldr  %2, [%4, #8]"
                 : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
                 : "r"(blk), "r"(shift), "r"(shift2));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8);       /* extract low bytes        */
    x10 = x10 + x4;             /* add with high bytes      */
    x10 = x10 + (x10 << 16);    /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin)    /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}

#endif

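/* ------------------------------------------------------------------------------
 * Reference sketch (illustrative only, never compiled): each generated variant
 * computes the SAD between a 16x16 macroblock `blk` (stride 16) and a reference
 * block `ref` that is misaligned by NUMBER bytes (1, 2 or 3) within a 32-bit
 * word, terminating early once the running SAD exceeds `dmin`.  NUMBER, SHIFT,
 * INC_X8, sad_4pixel/sad_4pixelN and sum_accumulate are assumed to be defined
 * by the file that includes this header.  The plain scalar version below, kept
 * under #if 0, restates the same computation; the name sad_mb_offset_ref is
 * hypothetical and is not used anywhere else in the codebase.
 * ------------------------------------------------------------------------------
 */
#if 0
static int sad_mb_offset_ref(const unsigned char *ref, const unsigned char *blk,
                             int lx, int dmin)
{
    int sad = 0;

    for (int row = 0; row < 16; row++)          /* 16 rows of the macroblock */
    {
        for (int col = 0; col < 16; col++)      /* 16 pixels per row         */
        {
            int d = ref[row * lx + col] - blk[(row << 4) + col];
            sad += (d >= 0) ? d : -d;           /* absolute difference       */
        }
        if (sad > dmin)                         /* early exit: already worse */
        {
            return sad;                         /* than the current minimum  */
        }
    }
    return sad;
}
#endif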