/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"

using namespace android;
using namespace android::renderscript;

namespace android {
namespace renderscript {


class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
public:
    virtual void populateScript(Script *);
    virtual void invokeFreeChildren();

    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);

    virtual ~RsdCpuScriptIntrinsicConvolve5x5();
    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

protected:
    float mFp[28];
    short mIp[28];
    ObjectBaseRef<Allocation> alloc;


    static void kernelU1(const RsForEachStubParamStruct *p,
                         uint32_t xstart, uint32_t xend,
                         uint32_t instep, uint32_t outstep);
    static void kernelU2(const RsForEachStubParamStruct *p,
                         uint32_t xstart, uint32_t xend,
                         uint32_t instep, uint32_t outstep);
    static void kernelU4(const RsForEachStubParamStruct *p,
                         uint32_t xstart, uint32_t xend,
                         uint32_t instep, uint32_t outstep);
    static void kernelF1(const RsForEachStubParamStruct *p,
                         uint32_t xstart, uint32_t xend,
                         uint32_t instep, uint32_t outstep);
    static void kernelF2(const RsForEachStubParamStruct *p,
                         uint32_t xstart, uint32_t xend,
                         uint32_t instep, uint32_t outstep);
    static void kernelF4(const RsForEachStubParamStruct *p,
                         uint32_t xstart, uint32_t xend,
                         uint32_t instep, uint32_t outstep);


};

}
}

void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
    rsAssert(slot == 1);
    alloc.set(static_cast<Allocation *>(data));
}

void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
                                                    const void *data, size_t dataLength) {
    rsAssert(slot == 0);
    memcpy(&mFp, data, dataLength);
    for (int ct = 0; ct < 25; ct++) {
        if (mFp[ct] >= 0) {
            mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
        } else {
            mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
        }
    }
}
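
// setGlobalVar above copies the 25 float coefficients into mFp and mirrors them
// into Q8.8 fixed point in mIp for the SIMD path. Illustrative arithmetic only:
// a weight of 1/25 (the default box filter) maps to
// (short)(0.04f * 256.f + 0.5f) == 10, i.e. roughly 10/256.
//
// The OneU*/OneF* helpers below are the scalar reference path: each computes one
// output pixel as the weighted sum of its 5x5 neighborhood, clamping the sample
// coordinates to the image edges with rsMax/rsMin. The uchar variants also round
// (+0.5f) and clamp the result to [0, 255] before narrowing back to bytes.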


static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
                  const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
                  const float* coeff) {

    uint32_t x0 = rsMax((int32_t)x-2, 0);
    uint32_t x1 = rsMax((int32_t)x-1, 0);
    uint32_t x2 = x;
    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));

    float4 px = convert_float4(py0[x0]) * coeff[0] +
                convert_float4(py0[x1]) * coeff[1] +
                convert_float4(py0[x2]) * coeff[2] +
                convert_float4(py0[x3]) * coeff[3] +
                convert_float4(py0[x4]) * coeff[4] +

                convert_float4(py1[x0]) * coeff[5] +
                convert_float4(py1[x1]) * coeff[6] +
                convert_float4(py1[x2]) * coeff[7] +
                convert_float4(py1[x3]) * coeff[8] +
                convert_float4(py1[x4]) * coeff[9] +

                convert_float4(py2[x0]) * coeff[10] +
                convert_float4(py2[x1]) * coeff[11] +
                convert_float4(py2[x2]) * coeff[12] +
                convert_float4(py2[x3]) * coeff[13] +
                convert_float4(py2[x4]) * coeff[14] +

                convert_float4(py3[x0]) * coeff[15] +
                convert_float4(py3[x1]) * coeff[16] +
                convert_float4(py3[x2]) * coeff[17] +
                convert_float4(py3[x3]) * coeff[18] +
                convert_float4(py3[x4]) * coeff[19] +

                convert_float4(py4[x0]) * coeff[20] +
                convert_float4(py4[x1]) * coeff[21] +
                convert_float4(py4[x2]) * coeff[22] +
                convert_float4(py4[x3]) * coeff[23] +
                convert_float4(py4[x4]) * coeff[24];
    px = clamp(px + 0.5f, 0.f, 255.f);
    *out = convert_uchar4(px);
}

static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
                  const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
                  const float* coeff) {

    uint32_t x0 = rsMax((int32_t)x-2, 0);
    uint32_t x1 = rsMax((int32_t)x-1, 0);
    uint32_t x2 = x;
    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));

    float2 px = convert_float2(py0[x0]) * coeff[0] +
                convert_float2(py0[x1]) * coeff[1] +
                convert_float2(py0[x2]) * coeff[2] +
                convert_float2(py0[x3]) * coeff[3] +
                convert_float2(py0[x4]) * coeff[4] +

                convert_float2(py1[x0]) * coeff[5] +
                convert_float2(py1[x1]) * coeff[6] +
                convert_float2(py1[x2]) * coeff[7] +
                convert_float2(py1[x3]) * coeff[8] +
                convert_float2(py1[x4]) * coeff[9] +

                convert_float2(py2[x0]) * coeff[10] +
                convert_float2(py2[x1]) * coeff[11] +
                convert_float2(py2[x2]) * coeff[12] +
                convert_float2(py2[x3]) * coeff[13] +
                convert_float2(py2[x4]) * coeff[14] +

                convert_float2(py3[x0]) * coeff[15] +
                convert_float2(py3[x1]) * coeff[16] +
                convert_float2(py3[x2]) * coeff[17] +
                convert_float2(py3[x3]) * coeff[18] +
                convert_float2(py3[x4]) * coeff[19] +

                convert_float2(py4[x0]) * coeff[20] +
                convert_float2(py4[x1]) * coeff[21] +
                convert_float2(py4[x2]) * coeff[22] +
                convert_float2(py4[x3]) * coeff[23] +
                convert_float2(py4[x4]) * coeff[24];
    px = clamp(px + 0.5f, 0.f, 255.f);
    *out = convert_uchar2(px);
}

static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
                  const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
                  const float* coeff) {

    uint32_t x0 = rsMax((int32_t)x-2, 0);
    uint32_t x1 = rsMax((int32_t)x-1, 0);
    uint32_t x2 = x;
    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));

    float px = (float)(py0[x0]) * coeff[0] +
               (float)(py0[x1]) * coeff[1] +
               (float)(py0[x2]) * coeff[2] +
               (float)(py0[x3]) * coeff[3] +
               (float)(py0[x4]) * coeff[4] +

               (float)(py1[x0]) * coeff[5] +
               (float)(py1[x1]) * coeff[6] +
               (float)(py1[x2]) * coeff[7] +
               (float)(py1[x3]) * coeff[8] +
               (float)(py1[x4]) * coeff[9] +

               (float)(py2[x0]) * coeff[10] +
               (float)(py2[x1]) * coeff[11] +
               (float)(py2[x2]) * coeff[12] +
               (float)(py2[x3]) * coeff[13] +
               (float)(py2[x4]) * coeff[14] +

               (float)(py3[x0]) * coeff[15] +
               (float)(py3[x1]) * coeff[16] +
               (float)(py3[x2]) * coeff[17] +
               (float)(py3[x3]) * coeff[18] +
               (float)(py3[x4]) * coeff[19] +

               (float)(py4[x0]) * coeff[20] +
               (float)(py4[x1]) * coeff[21] +
               (float)(py4[x2]) * coeff[22] +
               (float)(py4[x3]) * coeff[23] +
               (float)(py4[x4]) * coeff[24];
    px = clamp(px + 0.5f, 0.f, 255.f);
    *out = px;
}
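
// The float helpers below mirror the uchar ones but keep the result in floating
// point, so no +0.5f rounding or [0, 255] clamp is applied on the way out.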

static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
                  const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
                  const float* coeff) {

    uint32_t x0 = rsMax((int32_t)x-2, 0);
    uint32_t x1 = rsMax((int32_t)x-1, 0);
    uint32_t x2 = x;
    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));

    float4 px = py0[x0] * coeff[0] +
                py0[x1] * coeff[1] +
                py0[x2] * coeff[2] +
                py0[x3] * coeff[3] +
                py0[x4] * coeff[4] +

                py1[x0] * coeff[5] +
                py1[x1] * coeff[6] +
                py1[x2] * coeff[7] +
                py1[x3] * coeff[8] +
                py1[x4] * coeff[9] +

                py2[x0] * coeff[10] +
                py2[x1] * coeff[11] +
                py2[x2] * coeff[12] +
                py2[x3] * coeff[13] +
                py2[x4] * coeff[14] +

                py3[x0] * coeff[15] +
                py3[x1] * coeff[16] +
                py3[x2] * coeff[17] +
                py3[x3] * coeff[18] +
                py3[x4] * coeff[19] +

                py4[x0] * coeff[20] +
                py4[x1] * coeff[21] +
                py4[x2] * coeff[22] +
                py4[x3] * coeff[23] +
                py4[x4] * coeff[24];
    *out = px;
}

static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
                  const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
                  const float* coeff) {

    uint32_t x0 = rsMax((int32_t)x-2, 0);
    uint32_t x1 = rsMax((int32_t)x-1, 0);
    uint32_t x2 = x;
    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));

    float2 px = py0[x0] * coeff[0] +
                py0[x1] * coeff[1] +
                py0[x2] * coeff[2] +
                py0[x3] * coeff[3] +
                py0[x4] * coeff[4] +

                py1[x0] * coeff[5] +
                py1[x1] * coeff[6] +
                py1[x2] * coeff[7] +
                py1[x3] * coeff[8] +
                py1[x4] * coeff[9] +

                py2[x0] * coeff[10] +
                py2[x1] * coeff[11] +
                py2[x2] * coeff[12] +
                py2[x3] * coeff[13] +
                py2[x4] * coeff[14] +

                py3[x0] * coeff[15] +
                py3[x1] * coeff[16] +
                py3[x2] * coeff[17] +
                py3[x3] * coeff[18] +
                py3[x4] * coeff[19] +

                py4[x0] * coeff[20] +
                py4[x1] * coeff[21] +
                py4[x2] * coeff[22] +
                py4[x3] * coeff[23] +
                py4[x4] * coeff[24];
    *out = px;
}

static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
                  const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
                  const float* coeff) {

    uint32_t x0 = rsMax((int32_t)x-2, 0);
    uint32_t x1 = rsMax((int32_t)x-1, 0);
    uint32_t x2 = x;
    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));

    float px = py0[x0] * coeff[0] +
               py0[x1] * coeff[1] +
               py0[x2] * coeff[2] +
               py0[x3] * coeff[3] +
               py0[x4] * coeff[4] +

               py1[x0] * coeff[5] +
               py1[x1] * coeff[6] +
               py1[x2] * coeff[7] +
               py1[x3] * coeff[8] +
               py1[x4] * coeff[9] +

               py2[x0] * coeff[10] +
               py2[x1] * coeff[11] +
               py2[x2] * coeff[12] +
               py2[x3] * coeff[13] +
               py2[x4] * coeff[14] +

               py3[x0] * coeff[15] +
               py3[x1] * coeff[16] +
               py3[x2] * coeff[17] +
               py3[x3] * coeff[18] +
               py3[x4] * coeff[19] +

               py4[x0] * coeff[20] +
               py4[x1] * coeff[21] +
               py4[x2] * coeff[22] +
               py4[x3] * coeff[23] +
               py4[x4] * coeff[24];
    *out = px;
}
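
// rsdIntrinsicConvolve5x5_K is the SIMD fast path implemented outside this file
// (used on ARM via ARCH_ARM_USE_INTRINSICS and on x86 via ARCH_X86_HAVE_SSSE3).
// It consumes the Q8.8 coefficients in mIp, while the scalar helpers use the
// float copies in mFp. Judging from the pointer advances in kernelU4 below, the
// 'count' argument appears to be in units of four output pixels on the x86 path
// and two on the ARM path; the scalar loops before and after the call handle
// the left and right borders.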


extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
                                          const void *y2, const void *y3, const void *y4,
                                          const short *coef, uint32_t count);

void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t instep, uint32_t outstep) {
    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
    if (!cp->alloc.get()) {
        ALOGE("Convolve5x5 executed without input, skipping");
        return;
    }
    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;

    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
    uint32_t y2 = p->y;
    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));

    const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
    const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
    const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
    const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
    const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);

    uchar4 *out = (uchar4 *)p->out;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

    while ((x1 < x2) && (x1 < 2)) {
        OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }
#if defined(ARCH_X86_HAVE_SSSE3)
    // The x86 SIMD path needs at least 7 remaining elements: 4 for one SIMD
    // batch plus 3 so x never reaches the right-edge boundary.
    if (gArchUseSIMD && ((x1 + 6) < x2)) {
        // subtract 3 for the end boundary
        uint32_t len = (x2 - x1 - 3) >> 2;
        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
        out += len << 2;
        x1 += len << 2;
    }
#endif

#if defined(ARCH_ARM_USE_INTRINSICS)
    if (gArchUseSIMD && ((x1 + 3) < x2)) {
        uint32_t len = (x2 - x1 - 3) >> 1;
        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
        out += len << 1;
        x1 += len << 1;
    }
#endif

    while (x1 < x2) {
        OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }
}
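
// The remaining kernels (U2, U1 and the float variants) currently run entirely
// through the scalar helpers. Their "#if 0" NEON blocks are dead code kept for
// reference; note they still refer to cp->ip, which would need to become
// cp->mIp (and gain the x1 - 2 offsets used in kernelU4) if ever re-enabled.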

void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t instep, uint32_t outstep) {
    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
    if (!cp->alloc.get()) {
        ALOGE("Convolve5x5 executed without input, skipping");
        return;
    }
    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;

    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
    uint32_t y2 = p->y;
    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));

    const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
    const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
    const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
    const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
    const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);

    uchar2 *out = (uchar2 *)p->out;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

    while ((x1 < x2) && (x1 < 2)) {
        OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }

#if 0 //defined(ARCH_ARM_HAVE_NEON)
    if ((x1 + 3) < x2) {
        uint32_t len = (x2 - x1 - 3) >> 1;
        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
        out += len << 1;
        x1 += len << 1;
    }
#endif

    while (x1 < x2) {
        OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }
}

void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t instep, uint32_t outstep) {
    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
    if (!cp->alloc.get()) {
        ALOGE("Convolve5x5 executed without input, skipping");
        return;
    }
    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;

    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
    uint32_t y2 = p->y;
    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));

    const uchar *py0 = (const uchar *)(pin + stride * y0);
    const uchar *py1 = (const uchar *)(pin + stride * y1);
    const uchar *py2 = (const uchar *)(pin + stride * y2);
    const uchar *py3 = (const uchar *)(pin + stride * y3);
    const uchar *py4 = (const uchar *)(pin + stride * y4);

    uchar *out = (uchar *)p->out;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

    while ((x1 < x2) && (x1 < 2)) {
        OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }

#if 0 //defined(ARCH_ARM_HAVE_NEON)
    if ((x1 + 3) < x2) {
        uint32_t len = (x2 - x1 - 3) >> 1;
        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
        out += len << 1;
        x1 += len << 1;
    }
#endif

    while (x1 < x2) {
        OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }
}

void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t instep, uint32_t outstep) {
    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
    if (!cp->alloc.get()) {
        ALOGE("Convolve5x5 executed without input, skipping");
        return;
    }
    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;

    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
    uint32_t y2 = p->y;
    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));

    const float4 *py0 = (const float4 *)(pin + stride * y0);
    const float4 *py1 = (const float4 *)(pin + stride * y1);
    const float4 *py2 = (const float4 *)(pin + stride * y2);
    const float4 *py3 = (const float4 *)(pin + stride * y3);
    const float4 *py4 = (const float4 *)(pin + stride * y4);

    float4 *out = (float4 *)p->out;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

    while ((x1 < x2) && (x1 < 2)) {
        OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }

#if 0 //defined(ARCH_ARM_HAVE_NEON)
    if ((x1 + 3) < x2) {
        uint32_t len = (x2 - x1 - 3) >> 1;
        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
        out += len << 1;
        x1 += len << 1;
    }
#endif

    while (x1 < x2) {
        OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }
}

void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t instep, uint32_t outstep) {
    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
    if (!cp->alloc.get()) {
        ALOGE("Convolve5x5 executed without input, skipping");
        return;
    }
    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;

    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
    uint32_t y2 = p->y;
    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));

    const float2 *py0 = (const float2 *)(pin + stride * y0);
    const float2 *py1 = (const float2 *)(pin + stride * y1);
    const float2 *py2 = (const float2 *)(pin + stride * y2);
    const float2 *py3 = (const float2 *)(pin + stride * y3);
    const float2 *py4 = (const float2 *)(pin + stride * y4);

    float2 *out = (float2 *)p->out;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

    while ((x1 < x2) && (x1 < 2)) {
        OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }

#if 0 //defined(ARCH_ARM_HAVE_NEON)
    if ((x1 + 3) < x2) {
        uint32_t len = (x2 - x1 - 3) >> 1;
        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
        out += len << 1;
        x1 += len << 1;
    }
#endif

    while (x1 < x2) {
        OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }
}

void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t instep, uint32_t outstep) {
    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
    if (!cp->alloc.get()) {
        ALOGE("Convolve5x5 executed without input, skipping");
        return;
    }
    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;

    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
    uint32_t y2 = p->y;
    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));

    const float *py0 = (const float *)(pin + stride * y0);
    const float *py1 = (const float *)(pin + stride * y1);
    const float *py2 = (const float *)(pin + stride * y2);
    const float *py3 = (const float *)(pin + stride * y3);
    const float *py4 = (const float *)(pin + stride * y4);

    float *out = (float *)p->out;
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

    while ((x1 < x2) && (x1 < 2)) {
        OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }

#if 0 //defined(ARCH_ARM_HAVE_NEON)
    if ((x1 + 3) < x2) {
        uint32_t len = (x2 - x1 - 3) >> 1;
        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
        out += len << 1;
        x1 += len << 1;
    }
#endif

    while (x1 < x2) {
        OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
        out++;
        x1++;
    }
}
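
// The constructor selects a kernel from the Element: float vs. unsigned 8-bit,
// by vector size, with 3-component vectors routed to the 4-component kernels
// (RenderScript stores 3-vectors with a 4-element stride). The coefficients
// default to a 5x5 box filter (1/25 each) until setGlobalVar overrides them.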

RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {

    if (e->getType() == RS_TYPE_FLOAT_32) {
        switch (e->getVectorSize()) {
        case 1:
            mRootPtr = &kernelF1;
            break;
        case 2:
            mRootPtr = &kernelF2;
            break;
        case 3:
        case 4:
            mRootPtr = &kernelF4;
            break;
        }
    } else {
        switch (e->getVectorSize()) {
        case 1:
            mRootPtr = &kernelU1;
            break;
        case 2:
            mRootPtr = &kernelU2;
            break;
        case 3:
        case 4:
            mRootPtr = &kernelU4;
            break;
        }
    }
    for (int ct = 0; ct < 25; ct++) {
        mFp[ct] = 1.f / 25.f;
        mIp[ct] = (short)(mFp[ct] * 256.f);
    }
}

RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
}

void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
    s->mHal.info.exportedVariableCount = 2;
}

void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
    alloc.clear();
}


RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e) {

    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
}