1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 18 #include "rsCpuIntrinsic.h" 19 #include "rsCpuIntrinsicInlines.h" 20 21 namespace android { 22 namespace renderscript { 23 24 25 class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic { 26 public: 27 void populateScript(Script *) override; 28 void invokeFreeChildren() override; 29 30 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override; 31 void setGlobalObj(uint32_t slot, ObjectBase *data) override; 32 33 ~RsdCpuScriptIntrinsicConvolve5x5() override; 34 RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 35 36 protected: 37 float mFp[28]; 38 short mIp[28]; 39 ObjectBaseRef<Allocation> alloc; 40 41 42 static void kernelU1(const RsExpandKernelDriverInfo *info, 43 uint32_t xstart, uint32_t xend, 44 uint32_t outstep); 45 static void kernelU2(const RsExpandKernelDriverInfo *info, 46 uint32_t xstart, uint32_t xend, 47 uint32_t outstep); 48 static void kernelU4(const RsExpandKernelDriverInfo *info, 49 uint32_t xstart, uint32_t xend, 50 uint32_t outstep); 51 static void kernelF1(const RsExpandKernelDriverInfo *info, 52 uint32_t xstart, uint32_t xend, 53 uint32_t outstep); 54 static void kernelF2(const RsExpandKernelDriverInfo *info, 55 uint32_t xstart, uint32_t xend, 56 uint32_t outstep); 57 static void kernelF4(const RsExpandKernelDriverInfo *info, 58 uint32_t xstart, uint32_t xend, 59 uint32_t outstep); 60 61 62 }; 63 64 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) { 65 rsAssert(slot == 1); 66 alloc.set(static_cast<Allocation *>(data)); 67 } 68 69 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot, 70 const void *data, size_t dataLength) { 71 rsAssert(slot == 0); 72 memcpy (&mFp, data, dataLength); 73 for(int ct=0; ct < 25; ct++) { 74 if (mFp[ct] >= 0) { 75 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 76 } else { 77 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f); 78 } 79 } 80 } 81 82 83 static void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out, 84 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4, 85 const float* coeff) { 86 87 uint32_t x0 = rsMax((int32_t)x-2, 0); 88 uint32_t x1 = rsMax((int32_t)x-1, 0); 89 uint32_t x2 = x; 90 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1)); 91 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1)); 92 93 float4 px = convert_float4(py0[x0]) * coeff[0] + 94 convert_float4(py0[x1]) * coeff[1] + 95 convert_float4(py0[x2]) * coeff[2] + 96 convert_float4(py0[x3]) * coeff[3] + 97 convert_float4(py0[x4]) * coeff[4] + 98 99 convert_float4(py1[x0]) * coeff[5] + 100 convert_float4(py1[x1]) * coeff[6] + 101 convert_float4(py1[x2]) * coeff[7] + 102 convert_float4(py1[x3]) * coeff[8] + 103 convert_float4(py1[x4]) * coeff[9] + 104 105 convert_float4(py2[x0]) * coeff[10] + 106 convert_float4(py2[x1]) * coeff[11] + 107 convert_float4(py2[x2]) * coeff[12] + 108 convert_float4(py2[x3]) * coeff[13] + 109 convert_float4(py2[x4]) * coeff[14] + 110 111 convert_float4(py3[x0]) * coeff[15] + 112 convert_float4(py3[x1]) * coeff[16] + 113 convert_float4(py3[x2]) * coeff[17] + 114 convert_float4(py3[x3]) * coeff[18] + 115 convert_float4(py3[x4]) * coeff[19] + 116 117 convert_float4(py4[x0]) * coeff[20] + 118 convert_float4(py4[x1]) * coeff[21] + 119 convert_float4(py4[x2]) * coeff[22] + 120 convert_float4(py4[x3]) * coeff[23] + 121 convert_float4(py4[x4]) * coeff[24]; 122 px = clamp(px + 0.5f, 0.f, 255.f); 123 *out = convert_uchar4(px); 124 } 125 126 static void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out, 127 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4, 128 const float* coeff) { 129 130 uint32_t x0 = rsMax((int32_t)x-2, 0); 131 uint32_t x1 = rsMax((int32_t)x-1, 0); 132 uint32_t x2 = x; 133 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1)); 134 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1)); 135 136 float2 px = convert_float2(py0[x0]) * coeff[0] + 137 convert_float2(py0[x1]) * coeff[1] + 138 convert_float2(py0[x2]) * coeff[2] + 139 convert_float2(py0[x3]) * coeff[3] + 140 convert_float2(py0[x4]) * coeff[4] + 141 142 convert_float2(py1[x0]) * coeff[5] + 143 convert_float2(py1[x1]) * coeff[6] + 144 convert_float2(py1[x2]) * coeff[7] + 145 convert_float2(py1[x3]) * coeff[8] + 146 convert_float2(py1[x4]) * coeff[9] + 147 148 convert_float2(py2[x0]) * coeff[10] + 149 convert_float2(py2[x1]) * coeff[11] + 150 convert_float2(py2[x2]) * coeff[12] + 151 convert_float2(py2[x3]) * coeff[13] + 152 convert_float2(py2[x4]) * coeff[14] + 153 154 convert_float2(py3[x0]) * coeff[15] + 155 convert_float2(py3[x1]) * coeff[16] + 156 convert_float2(py3[x2]) * coeff[17] + 157 convert_float2(py3[x3]) * coeff[18] + 158 convert_float2(py3[x4]) * coeff[19] + 159 160 convert_float2(py4[x0]) * coeff[20] + 161 convert_float2(py4[x1]) * coeff[21] + 162 convert_float2(py4[x2]) * coeff[22] + 163 convert_float2(py4[x3]) * coeff[23] + 164 convert_float2(py4[x4]) * coeff[24]; 165 px = clamp(px + 0.5f, 0.f, 255.f); 166 *out = convert_uchar2(px); 167 } 168 169 static void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out, 170 const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4, 171 const float* coeff) { 172 173 uint32_t x0 = rsMax((int32_t)x-2, 0); 174 uint32_t x1 = rsMax((int32_t)x-1, 0); 175 uint32_t x2 = x; 176 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1)); 177 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1)); 178 179 float px = (float)(py0[x0]) * coeff[0] + 180 (float)(py0[x1]) * coeff[1] + 181 (float)(py0[x2]) * coeff[2] + 182 (float)(py0[x3]) * coeff[3] + 183 (float)(py0[x4]) * coeff[4] + 184 185 (float)(py1[x0]) * coeff[5] + 186 (float)(py1[x1]) * coeff[6] + 187 (float)(py1[x2]) * coeff[7] + 188 (float)(py1[x3]) * coeff[8] + 189 (float)(py1[x4]) * coeff[9] + 190 191 (float)(py2[x0]) * coeff[10] + 192 (float)(py2[x1]) * coeff[11] + 193 (float)(py2[x2]) * coeff[12] + 194 (float)(py2[x3]) * coeff[13] + 195 (float)(py2[x4]) * coeff[14] + 196 197 (float)(py3[x0]) * coeff[15] + 198 (float)(py3[x1]) * coeff[16] + 199 (float)(py3[x2]) * coeff[17] + 200 (float)(py3[x3]) * coeff[18] + 201 (float)(py3[x4]) * coeff[19] + 202 203 (float)(py4[x0]) * coeff[20] + 204 (float)(py4[x1]) * coeff[21] + 205 (float)(py4[x2]) * coeff[22] + 206 (float)(py4[x3]) * coeff[23] + 207 (float)(py4[x4]) * coeff[24]; 208 px = clamp(px + 0.5f, 0.f, 255.f); 209 *out = px; 210 } 211 212 static void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out, 213 const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4, 214 const float* coeff) { 215 216 uint32_t x0 = rsMax((int32_t)x-2, 0); 217 uint32_t x1 = rsMax((int32_t)x-1, 0); 218 uint32_t x2 = x; 219 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1)); 220 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1)); 221 222 float4 px = py0[x0] * coeff[0] + 223 py0[x1] * coeff[1] + 224 py0[x2] * coeff[2] + 225 py0[x3] * coeff[3] + 226 py0[x4] * coeff[4] + 227 228 py1[x0] * coeff[5] + 229 py1[x1] * coeff[6] + 230 py1[x2] * coeff[7] + 231 py1[x3] * coeff[8] + 232 py1[x4] * coeff[9] + 233 234 py2[x0] * coeff[10] + 235 py2[x1] * coeff[11] + 236 py2[x2] * coeff[12] + 237 py2[x3] * coeff[13] + 238 py2[x4] * coeff[14] + 239 240 py3[x0] * coeff[15] + 241 py3[x1] * coeff[16] + 242 py3[x2] * coeff[17] + 243 py3[x3] * coeff[18] + 244 py3[x4] * coeff[19] + 245 246 py4[x0] * coeff[20] + 247 py4[x1] * coeff[21] + 248 py4[x2] * coeff[22] + 249 py4[x3] * coeff[23] + 250 py4[x4] * coeff[24]; 251 *out = px; 252 } 253 254 static void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out, 255 const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4, 256 const float* coeff) { 257 258 uint32_t x0 = rsMax((int32_t)x-2, 0); 259 uint32_t x1 = rsMax((int32_t)x-1, 0); 260 uint32_t x2 = x; 261 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1)); 262 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1)); 263 264 float2 px = py0[x0] * coeff[0] + 265 py0[x1] * coeff[1] + 266 py0[x2] * coeff[2] + 267 py0[x3] * coeff[3] + 268 py0[x4] * coeff[4] + 269 270 py1[x0] * coeff[5] + 271 py1[x1] * coeff[6] + 272 py1[x2] * coeff[7] + 273 py1[x3] * coeff[8] + 274 py1[x4] * coeff[9] + 275 276 py2[x0] * coeff[10] + 277 py2[x1] * coeff[11] + 278 py2[x2] * coeff[12] + 279 py2[x3] * coeff[13] + 280 py2[x4] * coeff[14] + 281 282 py3[x0] * coeff[15] + 283 py3[x1] * coeff[16] + 284 py3[x2] * coeff[17] + 285 py3[x3] * coeff[18] + 286 py3[x4] * coeff[19] + 287 288 py4[x0] * coeff[20] + 289 py4[x1] * coeff[21] + 290 py4[x2] * coeff[22] + 291 py4[x3] * coeff[23] + 292 py4[x4] * coeff[24]; 293 *out = px; 294 } 295 296 static void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out, 297 const float *py0, const float *py1, const float *py2, const float *py3, const float *py4, 298 const float* coeff) { 299 300 uint32_t x0 = rsMax((int32_t)x-2, 0); 301 uint32_t x1 = rsMax((int32_t)x-1, 0); 302 uint32_t x2 = x; 303 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1)); 304 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1)); 305 306 float px = py0[x0] * coeff[0] + 307 py0[x1] * coeff[1] + 308 py0[x2] * coeff[2] + 309 py0[x3] * coeff[3] + 310 py0[x4] * coeff[4] + 311 312 py1[x0] * coeff[5] + 313 py1[x1] * coeff[6] + 314 py1[x2] * coeff[7] + 315 py1[x3] * coeff[8] + 316 py1[x4] * coeff[9] + 317 318 py2[x0] * coeff[10] + 319 py2[x1] * coeff[11] + 320 py2[x2] * coeff[12] + 321 py2[x3] * coeff[13] + 322 py2[x4] * coeff[14] + 323 324 py3[x0] * coeff[15] + 325 py3[x1] * coeff[16] + 326 py3[x2] * coeff[17] + 327 py3[x3] * coeff[18] + 328 py3[x4] * coeff[19] + 329 330 py4[x0] * coeff[20] + 331 py4[x1] * coeff[21] + 332 py4[x2] * coeff[22] + 333 py4[x3] * coeff[23] + 334 py4[x4] * coeff[24]; 335 *out = px; 336 } 337 338 339 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1, 340 const void *y2, const void *y3, const void *y4, 341 const short *coef, uint32_t count); 342 343 void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info, 344 uint32_t xstart, uint32_t xend, 345 uint32_t outstep) { 346 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr; 347 if (!cp->alloc.get()) { 348 ALOGE("Convolve5x5 executed without input, skipping"); 349 return; 350 } 351 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 352 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 353 354 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0); 355 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0); 356 uint32_t y2 = info->current.y; 357 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1)); 358 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1)); 359 360 const uchar4 *py0 = (const uchar4 *)(pin + stride * y0); 361 const uchar4 *py1 = (const uchar4 *)(pin + stride * y1); 362 const uchar4 *py2 = (const uchar4 *)(pin + stride * y2); 363 const uchar4 *py3 = (const uchar4 *)(pin + stride * y3); 364 const uchar4 *py4 = (const uchar4 *)(pin + stride * y4); 365 366 uchar4 *out = (uchar4 *)info->outPtr[0]; 367 uint32_t x1 = xstart; 368 uint32_t x2 = xend; 369 370 while((x1 < x2) && (x1 < 2)) { 371 OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 372 out++; 373 x1++; 374 } 375 #if defined(ARCH_X86_HAVE_SSSE3) 376 // for x86 SIMD, require minimum of 7 elements (4 for SIMD, 377 // 3 for end boundary where x may hit the end boundary) 378 if (gArchUseSIMD &&((x1 + 6) < x2)) { 379 // subtract 3 for end boundary 380 uint32_t len = (x2 - x1 - 3) >> 2; 381 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len); 382 out += len << 2; 383 x1 += len << 2; 384 } 385 #endif 386 387 #if defined(ARCH_ARM_USE_INTRINSICS) 388 if(gArchUseSIMD && ((x1 + 3) < x2)) { 389 uint32_t len = (x2 - x1 - 3) >> 1; 390 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len); 391 out += len << 1; 392 x1 += len << 1; 393 } 394 #endif 395 396 while(x1 < x2) { 397 OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 398 out++; 399 x1++; 400 } 401 } 402 403 void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info, 404 uint32_t xstart, uint32_t xend, 405 uint32_t outstep) { 406 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr; 407 if (!cp->alloc.get()) { 408 ALOGE("Convolve5x5 executed without input, skipping"); 409 return; 410 } 411 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 412 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 413 414 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0); 415 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0); 416 uint32_t y2 = info->current.y; 417 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1)); 418 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1)); 419 420 const uchar2 *py0 = (const uchar2 *)(pin + stride * y0); 421 const uchar2 *py1 = (const uchar2 *)(pin + stride * y1); 422 const uchar2 *py2 = (const uchar2 *)(pin + stride * y2); 423 const uchar2 *py3 = (const uchar2 *)(pin + stride * y3); 424 const uchar2 *py4 = (const uchar2 *)(pin + stride * y4); 425 426 uchar2 *out = (uchar2 *)info->outPtr[0]; 427 uint32_t x1 = xstart; 428 uint32_t x2 = xend; 429 430 while((x1 < x2) && (x1 < 2)) { 431 OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 432 out++; 433 x1++; 434 } 435 436 #if 0//defined(ARCH_ARM_HAVE_NEON) 437 if((x1 + 3) < x2) { 438 uint32_t len = (x2 - x1 - 3) >> 1; 439 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 440 out += len << 1; 441 x1 += len << 1; 442 } 443 #endif 444 445 while(x1 < x2) { 446 OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 447 out++; 448 x1++; 449 } 450 } 451 452 void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info, 453 uint32_t xstart, uint32_t xend, 454 uint32_t outstep) { 455 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr; 456 if (!cp->alloc.get()) { 457 ALOGE("Convolve5x5 executed without input, skipping"); 458 return; 459 } 460 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 461 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 462 463 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0); 464 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0); 465 uint32_t y2 = info->current.y; 466 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1)); 467 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1)); 468 469 const uchar *py0 = (const uchar *)(pin + stride * y0); 470 const uchar *py1 = (const uchar *)(pin + stride * y1); 471 const uchar *py2 = (const uchar *)(pin + stride * y2); 472 const uchar *py3 = (const uchar *)(pin + stride * y3); 473 const uchar *py4 = (const uchar *)(pin + stride * y4); 474 475 uchar *out = (uchar *)info->outPtr[0]; 476 uint32_t x1 = xstart; 477 uint32_t x2 = xend; 478 479 while((x1 < x2) && (x1 < 2)) { 480 OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 481 out++; 482 x1++; 483 } 484 485 #if 0//defined(ARCH_ARM_HAVE_NEON) 486 if((x1 + 3) < x2) { 487 uint32_t len = (x2 - x1 - 3) >> 1; 488 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 489 out += len << 1; 490 x1 += len << 1; 491 } 492 #endif 493 494 while(x1 < x2) { 495 OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 496 out++; 497 x1++; 498 } 499 } 500 501 void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info, 502 uint32_t xstart, uint32_t xend, 503 uint32_t outstep) { 504 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr; 505 if (!cp->alloc.get()) { 506 ALOGE("Convolve5x5 executed without input, skipping"); 507 return; 508 } 509 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 510 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 511 512 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0); 513 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0); 514 uint32_t y2 = info->current.y; 515 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1)); 516 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1)); 517 518 const float4 *py0 = (const float4 *)(pin + stride * y0); 519 const float4 *py1 = (const float4 *)(pin + stride * y1); 520 const float4 *py2 = (const float4 *)(pin + stride * y2); 521 const float4 *py3 = (const float4 *)(pin + stride * y3); 522 const float4 *py4 = (const float4 *)(pin + stride * y4); 523 524 float4 *out = (float4 *)info->outPtr[0]; 525 uint32_t x1 = xstart; 526 uint32_t x2 = xend; 527 528 while((x1 < x2) && (x1 < 2)) { 529 OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 530 out++; 531 x1++; 532 } 533 534 #if 0//defined(ARCH_ARM_HAVE_NEON) 535 if((x1 + 3) < x2) { 536 uint32_t len = (x2 - x1 - 3) >> 1; 537 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 538 out += len << 1; 539 x1 += len << 1; 540 } 541 #endif 542 543 while(x1 < x2) { 544 OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 545 out++; 546 x1++; 547 } 548 } 549 550 void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info, 551 uint32_t xstart, uint32_t xend, 552 uint32_t outstep) { 553 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr; 554 if (!cp->alloc.get()) { 555 ALOGE("Convolve5x5 executed without input, skipping"); 556 return; 557 } 558 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 559 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 560 561 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0); 562 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0); 563 uint32_t y2 = info->current.y; 564 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1)); 565 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1)); 566 567 const float2 *py0 = (const float2 *)(pin + stride * y0); 568 const float2 *py1 = (const float2 *)(pin + stride * y1); 569 const float2 *py2 = (const float2 *)(pin + stride * y2); 570 const float2 *py3 = (const float2 *)(pin + stride * y3); 571 const float2 *py4 = (const float2 *)(pin + stride * y4); 572 573 float2 *out = (float2 *)info->outPtr[0]; 574 uint32_t x1 = xstart; 575 uint32_t x2 = xend; 576 577 while((x1 < x2) && (x1 < 2)) { 578 OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 579 out++; 580 x1++; 581 } 582 583 #if 0//defined(ARCH_ARM_HAVE_NEON) 584 if((x1 + 3) < x2) { 585 uint32_t len = (x2 - x1 - 3) >> 1; 586 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 587 out += len << 1; 588 x1 += len << 1; 589 } 590 #endif 591 592 while(x1 < x2) { 593 OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 594 out++; 595 x1++; 596 } 597 } 598 599 void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info, 600 uint32_t xstart, uint32_t xend, 601 uint32_t outstep) { 602 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr; 603 if (!cp->alloc.get()) { 604 ALOGE("Convolve5x5 executed without input, skipping"); 605 return; 606 } 607 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 608 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 609 610 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0); 611 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0); 612 uint32_t y2 = info->current.y; 613 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1)); 614 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1)); 615 616 const float *py0 = (const float *)(pin + stride * y0); 617 const float *py1 = (const float *)(pin + stride * y1); 618 const float *py2 = (const float *)(pin + stride * y2); 619 const float *py3 = (const float *)(pin + stride * y3); 620 const float *py4 = (const float *)(pin + stride * y4); 621 622 float *out = (float *)info->outPtr[0]; 623 uint32_t x1 = xstart; 624 uint32_t x2 = xend; 625 626 while((x1 < x2) && (x1 < 2)) { 627 OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 628 out++; 629 x1++; 630 } 631 632 #if 0//defined(ARCH_ARM_HAVE_NEON) 633 if((x1 + 3) < x2) { 634 uint32_t len = (x2 - x1 - 3) >> 1; 635 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 636 out += len << 1; 637 x1 += len << 1; 638 } 639 #endif 640 641 while(x1 < x2) { 642 OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp); 643 out++; 644 x1++; 645 } 646 } 647 648 RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5( 649 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 650 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) { 651 652 if (e->getType() == RS_TYPE_FLOAT_32) { 653 switch(e->getVectorSize()) { 654 case 1: 655 mRootPtr = &kernelF1; 656 break; 657 case 2: 658 mRootPtr = &kernelF2; 659 break; 660 case 3: 661 case 4: 662 mRootPtr = &kernelF4; 663 break; 664 } 665 } else { 666 switch(e->getVectorSize()) { 667 case 1: 668 mRootPtr = &kernelU1; 669 break; 670 case 2: 671 mRootPtr = &kernelU2; 672 break; 673 case 3: 674 case 4: 675 mRootPtr = &kernelU4; 676 break; 677 } 678 } 679 for(int ct=0; ct < 25; ct++) { 680 mFp[ct] = 1.f / 25.f; 681 mIp[ct] = (short)(mFp[ct] * 256.f); 682 } 683 } 684 685 RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() { 686 } 687 688 void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) { 689 s->mHal.info.exportedVariableCount = 2; 690 } 691 692 void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() { 693 alloc.clear(); 694 } 695 696 RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, 697 const Script *s, const Element *e) { 698 699 return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e); 700 } 701 702 } // namespace renderscript 703 } // namespace android 704