/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */

/* Multiple-inclusion template: the including translation unit must
 * #define NUMBER (1, 2, or 3) and SHIFT (the matching bit shift, used
 * as 8*NUMBER below) before each inclusion, selecting which of
 * sad_mb_offset1/sad_mb_offset2/sad_mb_offset3 is emitted.  The
 * includer is also expected to provide the int32/uint32/uint8 types
 * and the sad_4pixel/sad_4pixelN/sum_accumulate/INC_X8 macros used
 * here -- none of them are defined in this file.
 * NOTE(review): inferred from the NUMBER/SHIFT conditionals and the
 * undefined macro references; confirm against the including file. */

/* Intentionally not using the gcc asm version, since it is
 * slightly slower than the plain C version on modern GCC versions. */
#if !defined(__CC_ARM) /* Generic C version */

/*
 * Sum of absolute differences between a 16x16 candidate block in the
 * reference frame and the current 16x16 block, for a reference pointer
 * that is NUMBER bytes past a word-aligned address.  Each loop pass
 * handles one 16-pixel row via four 32-bit word loads that are shifted
 * and recombined to synthesize the unaligned data (the asm-style
 * comments, e.g. "bic ref, ref, #3", mirror the ARM instructions this
 * C code replaces).
 *
 * Parameters:
 *   ref  - reference-frame pointer, NUMBER bytes beyond a word boundary
 *          (it is realigned with "ref -= NUMBER" below);
 *   blk  - current block, read 16 bytes per row with word loads
 *          (presumably word-aligned -- the casts require it; confirm
 *          with the caller);
 *   lx   - reference-frame line stride in bytes;
 *   dmin - current best SAD; once the running SAD exceeds it the loop
 *          exits early.
 * Returns the accumulated SAD (in the high halfword of the packed
 * accumulator, hence the final ">> 16").
 *
 * NOTE(review): the (uint32*) loads from uint8 data violate strict
 * aliasing and rely on the realigned pointers being word-aligned;
 * this is legacy behavior kept as-is.
 */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    //  x5 = (x4<<8) - x4;
    x4 = x5 = 0;               /* packed SAD accumulators: x5 sums all
                                  bytes, x4 separately sums the bytes
                                  masked out below so inter-byte carries
                                  can be corrected at the end */
    x6 = 0xFFFF00FF;           /* (x6 << 8) == 0xFF00FF00 byte mask */
    x9 = 0x80808080;           /* const. used by the sad_4pixel macro */
    ref -= NUMBER;             /* bic ref, ref, #3 -- realign ref down
                                  to the word boundary */
    ref -= lx;                 /* pre-decrement; first loop pass adds lx back */
    blk -= 16;                 /* likewise pre-decrement for "blk += 16" */
    x8 = 16;                   /* 16 rows in a macroblock */

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx));          /* D C B A */
    x11 = *((uint32*)(ref + 4));            /* H G F E */
    x12 = *((uint32*)(ref + 8));            /* L K J I */

    /* shift/merge three aligned words into two words of unaligned data */
    x10 = ((uint32)x10 >> SHIFT);           /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));      /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);           /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));      /* K J I H */

    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                          /* accumulate low bytes */
    x10 = x10 & (x6 << 8);                  /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);           /* accumulate high bytes */
    x5 = x5 + x11;                          /* accumulate low bytes */
    x11 = x11 & (x6 << 8);                  /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);           /* accumulate high bytes */

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));            /* D C B A */
    x11 = *((uint32*)(ref + 12));           /* H G F E */
    x12 = *((uint32*)(ref + 16));           /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);           /* mvn x10, x10, lsr #24 = 0xFF 0xFF 0xFF ~D */
    x10 = x10 | (x11 << (32 - SHIFT));      /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
    x11 = ((uint32)x11 >> SHIFT);           /* 0xFF 0xFF 0xFF ~H */
    x11 = x11 | (x12 << (32 - SHIFT));      /* ~K ~J ~I ~H */

    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                          /* accumulate low bytes */
    x10 = x10 & (x6 << 8);                  /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);           /* accumulate high bytes */
    x5 = x5 + x11;                          /* accumulate low bytes */
    x11 = x11 & (x6 << 8);                  /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);           /* accumulate high bytes */

    /**** collapse the packed accumulators into one SAD value ****/
    x10 = x5 - (x4 << 8);                   /* extract low bytes */
    x10 = x10 + x4;                         /* add with high bytes */
    x10 = x10 + (x10 << 16);                /* add with lower half word;
                                               total now in high halfword */

    if ((int)((uint32)x10 >> 16) <= dmin)   /* compare with dmin: keep
                                               going only while the
                                               running SAD can still win */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto         LOOP_SAD3;
#elif (NUMBER==2)
            goto         LOOP_SAD2;
#elif (NUMBER==1)
            goto         LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}

#elif defined(__CC_ARM)  /* only work with arm v5 */

/*
 * RVCT (__CC_ARM) variant of the same unaligned-SAD kernel, with the
 * realignment, shifting/merging, and loop control done in embedded
 * __asm blocks.  Here the data is kept BYTE-INVERTED between the MVN
 * loads and the sad_4pixelN macro (the "N" presumably marks the
 * negated-input form -- confirm in the macro's definition), and the
 * row counter x8 is passed in as an extra parameter and updated by the
 * ADDLSS/INC_X8 idiom below.
 *
 * NOTE(review): correctness depends on the compiler keeping ref, blk
 * and the x* locals in registers across the separate __asm blocks, and
 * on the LOOP_SADn label defined inside the first __asm block being
 * reachable by the BLS in the last one.  Statement order must not be
 * changed.
 */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080;           /* const. used by sad_4pixelN */
    x4 = x5 = 0;               /* packed SAD accumulators (see C version) */

    __asm{
        MVN x6, #0xff0000;     /* x6 = 0xFF00FFFF mask */
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
        BIC ref, ref, #3;      /* realign ref down to a word boundary */
    }
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    __asm{
        /* MVN+BIC build the byte-inverted unaligned words ~G~F~E~D / ~K~J~I~H */
        MVN x10, x10, lsr #SHIFT;
        BIC x10, x10, x11, lsl #(32-SHIFT);
        MVN x11, x11, lsr #SHIFT;
        BIC x11, x11, x12, lsl #(32-SHIFT);

        LDR x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;            /* macro: fold x10/x11 into x4/x5 (defined by includer) */

    __asm{
        /****** process 8 pixels ******/
        LDR x11, [ref, #4];
        LDR x12, [ref, #8];
        LDR x10, [ref], lx ;   /* load and post-advance ref to the next row */
        LDR x14, [blk, #4];

        MVN x10, x10, lsr #SHIFT;
        BIC x10, x10, x11, lsl #(32-SHIFT);
        MVN x11, x11, lsr #SHIFT;
        BIC x11, x11, x12, lsl #(32-SHIFT);

        LDR x12, [blk], #16;   /* load and post-advance blk to the next row */
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /**** collapse the packed accumulators into one SAD value ****/
    x10 = x5 - (x4 << 8);      /* extract low bytes */
    x10 = x10 + x4;            /* add with high bytes */
    x10 = x10 + (x10 << 16);   /* add with lower half word */

    __asm{
        /* loop while running SAD (x10 >> 16) <= dmin and rows remain:
           RSBS sets flags from (x10>>16) - dmin; ADDLSS both steps the
           counter by INC_X8 and re-sets flags for the BLS */
        RSBS x11, dmin, x10, lsr #16
        ADDLSS x8, x8, #INC_X8
#if (NUMBER==3)
        BLS LOOP_SAD3;
#elif (NUMBER==2)
        BLS LOOP_SAD2;
#elif (NUMBER==1)
        BLS LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}

#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

/*
 * GCC/ARM variant of the same unaligned-SAD kernel using extended
 * inline asm for the realignment (BIC) and the MVN/BIC shift-merge,
 * with one asm body per supported SHIFT value since GCC cannot take
 * the shift amount as an immediate from a macro inside the template.
 * As in the __CC_ARM version the merged words are byte-inverted and
 * fed to sad_4pixelN/sum_accumulate (macros defined by the includer).
 */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080;           /* const. used by sad_4pixelN */
    x4 = x5 = 0;               /* packed SAD accumulators (see C version) */
    x8 = 16;                   /* 16 rows in a macroblock */

    __asm__ volatile("MVN %0, #0xFF0000": "=r"(x6));  /* x6 = 0xFF00FFFF mask */

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    __asm__ volatile("BIC %0, %0, #3": "+r"(ref));  /* realign ref to word boundary */
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    /* build byte-inverted unaligned words; one asm body per SHIFT value */
#if (SHIFT==8)
    __asm__ volatile(
        "MVN %0, %0, lsr #8\n\t"
        "BIC %0, %0, %1, lsl #24\n\t"
        "MVN %1, %1, lsr #8\n\t"
        "BIC %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN %0, %0, lsr #16\n\t"
        "BIC %0, %0, %1, lsl #16\n\t"
        "MVN %1, %1, lsr #16\n\t"
        "BIC %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN %0, %0, lsr #24\n\t"
        "BIC %0, %0, %1, lsl #8\n\t"
        "MVN %1, %1, lsr #24\n\t"
        "BIC %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif

    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;            /* macro: fold x10/x11 into x4/x5 (defined by includer) */

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    x12 = *((int32*)(ref + 8));
    x10 = *((int32*)ref); ref += lx;   /* load row start, then advance to next row */
    x14 = *((int32*)(blk + 4));

#if (SHIFT==8)
    __asm__ volatile(
        "MVN %0, %0, lsr #8\n\t"
        "BIC %0, %0, %1, lsl #24\n\t"
        "MVN %1, %1, lsr #8\n\t"
        "BIC %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN %0, %0, lsr #16\n\t"
        "BIC %0, %0, %1, lsl #16\n\t"
        "MVN %1, %1, lsr #16\n\t"
        "BIC %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN %0, %0, lsr #24\n\t"
        "BIC %0, %0, %1, lsl #8\n\t"
        "MVN %1, %1, lsr #24\n\t"
        "BIC %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif
    /* load *blk and post-advance blk by 16 in one LDR */
    __asm__ volatile("LDR %0, [%1], #16": "=&r"(x12), "+r"(blk));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /**** collapse the packed accumulators into one SAD value ****/
    x10 = x5 - (x4 << 8);      /* extract low bytes */
    x10 = x10 + x4;            /* add with high bytes */
    x10 = x10 + (x10 << 16);   /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin)  /* compare with dmin: keep
                                                 going only while the
                                                 running SAD can still win */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}

#endif
