1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 18 #include "rsCpuIntrinsic.h" 19 #include "rsCpuIntrinsicInlines.h" 20 21 using namespace android; 22 using namespace android::renderscript; 23 24 namespace android { 25 namespace renderscript { 26 27 28 class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic { 29 public: 30 virtual void populateScript(Script *); 31 virtual void invokeFreeChildren(); 32 33 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength); 34 virtual void setGlobalObj(uint32_t slot, ObjectBase *data); 35 36 virtual ~RsdCpuScriptIntrinsicConvolve5x5(); 37 RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 38 39 protected: 40 float mFp[28]; 41 short mIp[28]; 42 ObjectBaseRef<Allocation> alloc; 43 44 45 static void kernelU1(const RsForEachStubParamStruct *p, 46 uint32_t xstart, uint32_t xend, 47 uint32_t instep, uint32_t outstep); 48 static void kernelU2(const RsForEachStubParamStruct *p, 49 uint32_t xstart, uint32_t xend, 50 uint32_t instep, uint32_t outstep); 51 static void kernelU4(const RsForEachStubParamStruct *p, 52 uint32_t xstart, uint32_t xend, 53 uint32_t instep, uint32_t outstep); 54 static void kernelF1(const RsForEachStubParamStruct *p, 55 uint32_t xstart, uint32_t xend, 56 uint32_t instep, uint32_t outstep); 57 static void kernelF2(const RsForEachStubParamStruct *p, 58 uint32_t xstart, uint32_t xend, 59 uint32_t instep, uint32_t outstep); 60 static void kernelF4(const RsForEachStubParamStruct *p, 61 uint32_t xstart, uint32_t xend, 62 uint32_t instep, uint32_t outstep); 63 64 65 }; 66 67 } 68 } 69 70 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) { 71 rsAssert(slot == 1); 72 alloc.set(static_cast<Allocation *>(data)); 73 } 74 75 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot, 76 const void *data, size_t dataLength) { 77 rsAssert(slot == 0); 78 memcpy (&mFp, data, dataLength); 79 for(int ct=0; ct < 25; ct++) { 80 if (mFp[ct] >= 0) { 81 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 82 } else { 83 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f); 84 } 85 } 86 } 87 88 89 static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out, 90 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4, 91 const float* coeff) { 92 93 uint32_t x0 = rsMax((int32_t)x-2, 0); 94 uint32_t x1 = rsMax((int32_t)x-1, 0); 95 uint32_t x2 = x; 96 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 97 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 98 99 float4 px = convert_float4(py0[x0]) * coeff[0] + 100 convert_float4(py0[x1]) * coeff[1] + 101 convert_float4(py0[x2]) * coeff[2] + 102 convert_float4(py0[x3]) * coeff[3] + 103 convert_float4(py0[x4]) * coeff[4] + 104 105 convert_float4(py1[x0]) * coeff[5] + 106 convert_float4(py1[x1]) * coeff[6] + 107 convert_float4(py1[x2]) * coeff[7] + 108 convert_float4(py1[x3]) * coeff[8] + 109 convert_float4(py1[x4]) * coeff[9] + 110 111 convert_float4(py2[x0]) * coeff[10] + 112 convert_float4(py2[x1]) * coeff[11] + 113 convert_float4(py2[x2]) * coeff[12] + 114 convert_float4(py2[x3]) * coeff[13] + 115 convert_float4(py2[x4]) * coeff[14] + 116 117 convert_float4(py3[x0]) * coeff[15] + 118 convert_float4(py3[x1]) * coeff[16] + 119 convert_float4(py3[x2]) * coeff[17] + 120 convert_float4(py3[x3]) * coeff[18] + 121 convert_float4(py3[x4]) * coeff[19] + 122 123 convert_float4(py4[x0]) * coeff[20] + 124 convert_float4(py4[x1]) * coeff[21] + 125 convert_float4(py4[x2]) * coeff[22] + 126 convert_float4(py4[x3]) * coeff[23] + 127 convert_float4(py4[x4]) * coeff[24]; 128 px = clamp(px, 0.f, 255.f); 129 *out = convert_uchar4(px); 130 } 131 132 static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out, 133 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4, 134 const float* coeff) { 135 136 uint32_t x0 = rsMax((int32_t)x-2, 0); 137 uint32_t x1 = rsMax((int32_t)x-1, 0); 138 uint32_t x2 = x; 139 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 140 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 141 142 float2 px = convert_float2(py0[x0]) * coeff[0] + 143 convert_float2(py0[x1]) * coeff[1] + 144 convert_float2(py0[x2]) * coeff[2] + 145 convert_float2(py0[x3]) * coeff[3] + 146 convert_float2(py0[x4]) * coeff[4] + 147 148 convert_float2(py1[x0]) * coeff[5] + 149 convert_float2(py1[x1]) * coeff[6] + 150 convert_float2(py1[x2]) * coeff[7] + 151 convert_float2(py1[x3]) * coeff[8] + 152 convert_float2(py1[x4]) * coeff[9] + 153 154 convert_float2(py2[x0]) * coeff[10] + 155 convert_float2(py2[x1]) * coeff[11] + 156 convert_float2(py2[x2]) * coeff[12] + 157 convert_float2(py2[x3]) * coeff[13] + 158 convert_float2(py2[x4]) * coeff[14] + 159 160 convert_float2(py3[x0]) * coeff[15] + 161 convert_float2(py3[x1]) * coeff[16] + 162 convert_float2(py3[x2]) * coeff[17] + 163 convert_float2(py3[x3]) * coeff[18] + 164 convert_float2(py3[x4]) * coeff[19] + 165 166 convert_float2(py4[x0]) * coeff[20] + 167 convert_float2(py4[x1]) * coeff[21] + 168 convert_float2(py4[x2]) * coeff[22] + 169 convert_float2(py4[x3]) * coeff[23] + 170 convert_float2(py4[x4]) * coeff[24]; 171 px = clamp(px, 0.f, 255.f); 172 *out = convert_uchar2(px); 173 } 174 175 static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out, 176 const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4, 177 const float* coeff) { 178 179 uint32_t x0 = rsMax((int32_t)x-2, 0); 180 uint32_t x1 = rsMax((int32_t)x-1, 0); 181 uint32_t x2 = x; 182 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 183 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 184 185 float px = (float)(py0[x0]) * coeff[0] + 186 (float)(py0[x1]) * coeff[1] + 187 (float)(py0[x2]) * coeff[2] + 188 (float)(py0[x3]) * coeff[3] + 189 (float)(py0[x4]) * coeff[4] + 190 191 (float)(py1[x0]) * coeff[5] + 192 (float)(py1[x1]) * coeff[6] + 193 (float)(py1[x2]) * coeff[7] + 194 (float)(py1[x3]) * coeff[8] + 195 (float)(py1[x4]) * coeff[9] + 196 197 (float)(py2[x0]) * coeff[10] + 198 (float)(py2[x1]) * coeff[11] + 199 (float)(py2[x2]) * coeff[12] + 200 (float)(py2[x3]) * coeff[13] + 201 (float)(py2[x4]) * coeff[14] + 202 203 (float)(py3[x0]) * coeff[15] + 204 (float)(py3[x1]) * coeff[16] + 205 (float)(py3[x2]) * coeff[17] + 206 (float)(py3[x3]) * coeff[18] + 207 (float)(py3[x4]) * coeff[19] + 208 209 (float)(py4[x0]) * coeff[20] + 210 (float)(py4[x1]) * coeff[21] + 211 (float)(py4[x2]) * coeff[22] + 212 (float)(py4[x3]) * coeff[23] + 213 (float)(py4[x4]) * coeff[24]; 214 px = clamp(px, 0.f, 255.f); 215 *out = px; 216 } 217 218 static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out, 219 const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4, 220 const float* coeff) { 221 222 uint32_t x0 = rsMax((int32_t)x-2, 0); 223 uint32_t x1 = rsMax((int32_t)x-1, 0); 224 uint32_t x2 = x; 225 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 226 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 227 228 float4 px = py0[x0] * coeff[0] + 229 py0[x1] * coeff[1] + 230 py0[x2] * coeff[2] + 231 py0[x3] * coeff[3] + 232 py0[x4] * coeff[4] + 233 234 py1[x0] * coeff[5] + 235 py1[x1] * coeff[6] + 236 py1[x2] * coeff[7] + 237 py1[x3] * coeff[8] + 238 py1[x4] * coeff[9] + 239 240 py2[x0] * coeff[10] + 241 py2[x1] * coeff[11] + 242 py2[x2] * coeff[12] + 243 py2[x3] * coeff[13] + 244 py2[x4] * coeff[14] + 245 246 py3[x0] * coeff[15] + 247 py3[x1] * coeff[16] + 248 py3[x2] * coeff[17] + 249 py3[x3] * coeff[18] + 250 py3[x4] * coeff[19] + 251 252 py4[x0] * coeff[20] + 253 py4[x1] * coeff[21] + 254 py4[x2] * coeff[22] + 255 py4[x3] * coeff[23] + 256 py4[x4] * coeff[24]; 257 *out = px; 258 } 259 260 static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out, 261 const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4, 262 const float* coeff) { 263 264 uint32_t x0 = rsMax((int32_t)x-2, 0); 265 uint32_t x1 = rsMax((int32_t)x-1, 0); 266 uint32_t x2 = x; 267 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 268 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 269 270 float2 px = py0[x0] * coeff[0] + 271 py0[x1] * coeff[1] + 272 py0[x2] * coeff[2] + 273 py0[x3] * coeff[3] + 274 py0[x4] * coeff[4] + 275 276 py1[x0] * coeff[5] + 277 py1[x1] * coeff[6] + 278 py1[x2] * coeff[7] + 279 py1[x3] * coeff[8] + 280 py1[x4] * coeff[9] + 281 282 py2[x0] * coeff[10] + 283 py2[x1] * coeff[11] + 284 py2[x2] * coeff[12] + 285 py2[x3] * coeff[13] + 286 py2[x4] * coeff[14] + 287 288 py3[x0] * coeff[15] + 289 py3[x1] * coeff[16] + 290 py3[x2] * coeff[17] + 291 py3[x3] * coeff[18] + 292 py3[x4] * coeff[19] + 293 294 py4[x0] * coeff[20] + 295 py4[x1] * coeff[21] + 296 py4[x2] * coeff[22] + 297 py4[x3] * coeff[23] + 298 py4[x4] * coeff[24]; 299 *out = px; 300 } 301 302 static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out, 303 const float *py0, const float *py1, const float *py2, const float *py3, const float *py4, 304 const float* coeff) { 305 306 uint32_t x0 = rsMax((int32_t)x-2, 0); 307 uint32_t x1 = rsMax((int32_t)x-1, 0); 308 uint32_t x2 = x; 309 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 310 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 311 312 float px = py0[x0] * coeff[0] + 313 py0[x1] * coeff[1] + 314 py0[x2] * coeff[2] + 315 py0[x3] * coeff[3] + 316 py0[x4] * coeff[4] + 317 318 py1[x0] * coeff[5] + 319 py1[x1] * coeff[6] + 320 py1[x2] * coeff[7] + 321 py1[x3] * coeff[8] + 322 py1[x4] * coeff[9] + 323 324 py2[x0] * coeff[10] + 325 py2[x1] * coeff[11] + 326 py2[x2] * coeff[12] + 327 py2[x3] * coeff[13] + 328 py2[x4] * coeff[14] + 329 330 py3[x0] * coeff[15] + 331 py3[x1] * coeff[16] + 332 py3[x2] * coeff[17] + 333 py3[x3] * coeff[18] + 334 py3[x4] * coeff[19] + 335 336 py4[x0] * coeff[20] + 337 py4[x1] * coeff[21] + 338 py4[x2] * coeff[22] + 339 py4[x3] * coeff[23] + 340 py4[x4] * coeff[24]; 341 *out = px; 342 } 343 344 345 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1, 346 const void *y2, const void *y3, const void *y4, 347 const short *coef, uint32_t count); 348 349 void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p, 350 uint32_t xstart, uint32_t xend, 351 uint32_t instep, uint32_t outstep) { 352 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 353 if (!cp->alloc.get()) { 354 ALOGE("Convolve5x5 executed without input, skipping"); 355 return; 356 } 357 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 358 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 359 360 uint32_t y0 = rsMax((int32_t)p->y-2, 0); 361 uint32_t y1 = rsMax((int32_t)p->y-1, 0); 362 uint32_t y2 = p->y; 363 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 364 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 365 366 const uchar4 *py0 = (const uchar4 *)(pin + stride * y0); 367 const uchar4 *py1 = (const uchar4 *)(pin + stride * y1); 368 const uchar4 *py2 = (const uchar4 *)(pin + stride * y2); 369 const uchar4 *py3 = (const uchar4 *)(pin + stride * y3); 370 const uchar4 *py4 = (const uchar4 *)(pin + stride * y4); 371 372 uchar4 *out = (uchar4 *)p->out; 373 uint32_t x1 = xstart; 374 uint32_t x2 = xend; 375 376 while((x1 < x2) && (x1 < 2)) { 377 OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 378 out++; 379 x1++; 380 } 381 382 #if defined(ARCH_ARM_HAVE_VFP) 383 if(gArchUseSIMD && ((x1 + 3) < x2)) { 384 uint32_t len = (x2 - x1 - 3) >> 1; 385 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->mIp, len); 386 out += len << 1; 387 x1 += len << 1; 388 } 389 #endif 390 391 while(x1 < x2) { 392 OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 393 out++; 394 x1++; 395 } 396 } 397 398 void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p, 399 uint32_t xstart, uint32_t xend, 400 uint32_t instep, uint32_t outstep) { 401 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 402 if (!cp->alloc.get()) { 403 ALOGE("Convolve5x5 executed without input, skipping"); 404 return; 405 } 406 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 407 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 408 409 uint32_t y0 = rsMax((int32_t)p->y-2, 0); 410 uint32_t y1 = rsMax((int32_t)p->y-1, 0); 411 uint32_t y2 = p->y; 412 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 413 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 414 415 const uchar2 *py0 = (const uchar2 *)(pin + stride * y0); 416 const uchar2 *py1 = (const uchar2 *)(pin + stride * y1); 417 const uchar2 *py2 = (const uchar2 *)(pin + stride * y2); 418 const uchar2 *py3 = (const uchar2 *)(pin + stride * y3); 419 const uchar2 *py4 = (const uchar2 *)(pin + stride * y4); 420 421 uchar2 *out = (uchar2 *)p->out; 422 uint32_t x1 = xstart; 423 uint32_t x2 = xend; 424 425 while((x1 < x2) && (x1 < 2)) { 426 OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 427 out++; 428 x1++; 429 } 430 431 #if 0//defined(ARCH_ARM_HAVE_NEON) 432 if((x1 + 3) < x2) { 433 uint32_t len = (x2 - x1 - 3) >> 1; 434 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 435 out += len << 1; 436 x1 += len << 1; 437 } 438 #endif 439 440 while(x1 < x2) { 441 OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 442 out++; 443 x1++; 444 } 445 } 446 447 void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p, 448 uint32_t xstart, uint32_t xend, 449 uint32_t instep, uint32_t outstep) { 450 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 451 if (!cp->alloc.get()) { 452 ALOGE("Convolve5x5 executed without input, skipping"); 453 return; 454 } 455 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 456 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 457 458 uint32_t y0 = rsMax((int32_t)p->y-2, 0); 459 uint32_t y1 = rsMax((int32_t)p->y-1, 0); 460 uint32_t y2 = p->y; 461 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 462 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 463 464 const uchar *py0 = (const uchar *)(pin + stride * y0); 465 const uchar *py1 = (const uchar *)(pin + stride * y1); 466 const uchar *py2 = (const uchar *)(pin + stride * y2); 467 const uchar *py3 = (const uchar *)(pin + stride * y3); 468 const uchar *py4 = (const uchar *)(pin + stride * y4); 469 470 uchar *out = (uchar *)p->out; 471 uint32_t x1 = xstart; 472 uint32_t x2 = xend; 473 474 while((x1 < x2) && (x1 < 2)) { 475 OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 476 out++; 477 x1++; 478 } 479 480 #if 0//defined(ARCH_ARM_HAVE_NEON) 481 if((x1 + 3) < x2) { 482 uint32_t len = (x2 - x1 - 3) >> 1; 483 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 484 out += len << 1; 485 x1 += len << 1; 486 } 487 #endif 488 489 while(x1 < x2) { 490 OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 491 out++; 492 x1++; 493 } 494 } 495 496 void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p, 497 uint32_t xstart, uint32_t xend, 498 uint32_t instep, uint32_t outstep) { 499 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 500 if (!cp->alloc.get()) { 501 ALOGE("Convolve5x5 executed without input, skipping"); 502 return; 503 } 504 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 505 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 506 507 uint32_t y0 = rsMax((int32_t)p->y-2, 0); 508 uint32_t y1 = rsMax((int32_t)p->y-1, 0); 509 uint32_t y2 = p->y; 510 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 511 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 512 513 const float4 *py0 = (const float4 *)(pin + stride * y0); 514 const float4 *py1 = (const float4 *)(pin + stride * y1); 515 const float4 *py2 = (const float4 *)(pin + stride * y2); 516 const float4 *py3 = (const float4 *)(pin + stride * y3); 517 const float4 *py4 = (const float4 *)(pin + stride * y4); 518 519 float4 *out = (float4 *)p->out; 520 uint32_t x1 = xstart; 521 uint32_t x2 = xend; 522 523 while((x1 < x2) && (x1 < 2)) { 524 OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 525 out++; 526 x1++; 527 } 528 529 #if 0//defined(ARCH_ARM_HAVE_NEON) 530 if((x1 + 3) < x2) { 531 uint32_t len = (x2 - x1 - 3) >> 1; 532 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 533 out += len << 1; 534 x1 += len << 1; 535 } 536 #endif 537 538 while(x1 < x2) { 539 OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 540 out++; 541 x1++; 542 } 543 } 544 545 void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p, 546 uint32_t xstart, uint32_t xend, 547 uint32_t instep, uint32_t outstep) { 548 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 549 if (!cp->alloc.get()) { 550 ALOGE("Convolve5x5 executed without input, skipping"); 551 return; 552 } 553 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 554 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 555 556 uint32_t y0 = rsMax((int32_t)p->y-2, 0); 557 uint32_t y1 = rsMax((int32_t)p->y-1, 0); 558 uint32_t y2 = p->y; 559 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 560 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 561 562 const float2 *py0 = (const float2 *)(pin + stride * y0); 563 const float2 *py1 = (const float2 *)(pin + stride * y1); 564 const float2 *py2 = (const float2 *)(pin + stride * y2); 565 const float2 *py3 = (const float2 *)(pin + stride * y3); 566 const float2 *py4 = (const float2 *)(pin + stride * y4); 567 568 float2 *out = (float2 *)p->out; 569 uint32_t x1 = xstart; 570 uint32_t x2 = xend; 571 572 while((x1 < x2) && (x1 < 2)) { 573 OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 574 out++; 575 x1++; 576 } 577 578 #if 0//defined(ARCH_ARM_HAVE_NEON) 579 if((x1 + 3) < x2) { 580 uint32_t len = (x2 - x1 - 3) >> 1; 581 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 582 out += len << 1; 583 x1 += len << 1; 584 } 585 #endif 586 587 while(x1 < x2) { 588 OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 589 out++; 590 x1++; 591 } 592 } 593 594 void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p, 595 uint32_t xstart, uint32_t xend, 596 uint32_t instep, uint32_t outstep) { 597 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 598 if (!cp->alloc.get()) { 599 ALOGE("Convolve5x5 executed without input, skipping"); 600 return; 601 } 602 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 603 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 604 605 uint32_t y0 = rsMax((int32_t)p->y-2, 0); 606 uint32_t y1 = rsMax((int32_t)p->y-1, 0); 607 uint32_t y2 = p->y; 608 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 609 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 610 611 const float *py0 = (const float *)(pin + stride * y0); 612 const float *py1 = (const float *)(pin + stride * y1); 613 const float *py2 = (const float *)(pin + stride * y2); 614 const float *py3 = (const float *)(pin + stride * y3); 615 const float *py4 = (const float *)(pin + stride * y4); 616 617 float *out = (float *)p->out; 618 uint32_t x1 = xstart; 619 uint32_t x2 = xend; 620 621 while((x1 < x2) && (x1 < 2)) { 622 OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 623 out++; 624 x1++; 625 } 626 627 #if 0//defined(ARCH_ARM_HAVE_NEON) 628 if((x1 + 3) < x2) { 629 uint32_t len = (x2 - x1 - 3) >> 1; 630 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 631 out += len << 1; 632 x1 += len << 1; 633 } 634 #endif 635 636 while(x1 < x2) { 637 OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 638 out++; 639 x1++; 640 } 641 } 642 643 RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5( 644 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 645 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) { 646 647 if (e->getType() == RS_TYPE_FLOAT_32) { 648 switch(e->getVectorSize()) { 649 case 1: 650 mRootPtr = &kernelF1; 651 break; 652 case 2: 653 mRootPtr = &kernelF2; 654 break; 655 case 3: 656 case 4: 657 mRootPtr = &kernelF4; 658 break; 659 } 660 } else { 661 switch(e->getVectorSize()) { 662 case 1: 663 mRootPtr = &kernelU1; 664 break; 665 case 2: 666 mRootPtr = &kernelU2; 667 break; 668 case 3: 669 case 4: 670 mRootPtr = &kernelU4; 671 break; 672 } 673 } 674 for(int ct=0; ct < 25; ct++) { 675 mFp[ct] = 1.f / 25.f; 676 mIp[ct] = (short)(mFp[ct] * 256.f); 677 } 678 } 679 680 RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() { 681 } 682 683 void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) { 684 s->mHal.info.exportedVariableCount = 2; 685 } 686 687 void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() { 688 alloc.clear(); 689 } 690 691 692 RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, 693 const Script *s, const Element *e) { 694 695 return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e); 696 } 697 698 699 700