1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 18 #include "rsCpuIntrinsic.h" 19 #include "rsCpuIntrinsicInlines.h" 20 21 using namespace android; 22 using namespace android::renderscript; 23 24 namespace android { 25 namespace renderscript { 26 27 28 class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic { 29 public: 30 void populateScript(Script *) override; 31 void invokeFreeChildren() override; 32 33 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override; 34 void setGlobalObj(uint32_t slot, ObjectBase *data) override; 35 36 ~RsdCpuScriptIntrinsicConvolve3x3() override; 37 RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *); 38 39 protected: 40 float mFp[16]; 41 short mIp[16]; 42 ObjectBaseRef<const Allocation> mAlloc; 43 ObjectBaseRef<const Element> mElement; 44 45 static void kernelU1(const RsExpandKernelDriverInfo *info, 46 uint32_t xstart, uint32_t xend, 47 uint32_t outstep); 48 static void kernelU2(const RsExpandKernelDriverInfo *info, 49 uint32_t xstart, uint32_t xend, 50 uint32_t outstep); 51 static void kernelU4(const RsExpandKernelDriverInfo *info, 52 uint32_t xstart, uint32_t xend, 53 uint32_t outstep); 54 static void kernelF1(const RsExpandKernelDriverInfo *info, 55 uint32_t xstart, uint32_t xend, 56 uint32_t outstep); 57 static void kernelF2(const RsExpandKernelDriverInfo *info, 58 uint32_t xstart, uint32_t xend, 59 uint32_t outstep); 60 static void kernelF4(const RsExpandKernelDriverInfo *info, 61 uint32_t xstart, uint32_t xend, 62 uint32_t outstep); 63 }; 64 65 } 66 } 67 68 69 void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) { 70 rsAssert(slot == 1); 71 mAlloc.set(static_cast<Allocation *>(data)); 72 } 73 74 void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data, 75 size_t dataLength) { 76 rsAssert(slot == 0); 77 memcpy (&mFp, data, dataLength); 78 for(int ct=0; ct < 9; ct++) { 79 if (mFp[ct] >= 0) { 80 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 81 } else { 82 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f); 83 } 84 } 85 } 86 87 extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1, 88 const void *y2, const short *coef, uint32_t count); 89 90 91 static void ConvolveOneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out, 92 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, 93 const float* coeff) { 94 95 uint32_t x1 = rsMax((int32_t)x-1, 0); 96 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 97 98 float4 px = convert_float4(py0[x1]) * coeff[0] + 99 convert_float4(py0[x]) * coeff[1] + 100 convert_float4(py0[x2]) * coeff[2] + 101 convert_float4(py1[x1]) * coeff[3] + 102 convert_float4(py1[x]) * coeff[4] + 103 convert_float4(py1[x2]) * coeff[5] + 104 convert_float4(py2[x1]) * coeff[6] + 105 convert_float4(py2[x]) * coeff[7] + 106 convert_float4(py2[x2]) * coeff[8]; 107 108 px = clamp(px + 0.5f, 0.f, 255.f); 109 uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w}; 110 *out = o; 111 } 112 113 static void ConvolveOneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out, 114 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, 115 const float* coeff) { 116 117 uint32_t x1 = rsMax((int32_t)x-1, 0); 118 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 119 120 float2 px = convert_float2(py0[x1]) * coeff[0] + 121 convert_float2(py0[x]) * coeff[1] + 122 convert_float2(py0[x2]) * coeff[2] + 123 convert_float2(py1[x1]) * coeff[3] + 124 convert_float2(py1[x]) * coeff[4] + 125 convert_float2(py1[x2]) * coeff[5] + 126 convert_float2(py2[x1]) * coeff[6] + 127 convert_float2(py2[x]) * coeff[7] + 128 convert_float2(py2[x2]) * coeff[8]; 129 130 px = clamp(px + 0.5f, 0.f, 255.f); 131 *out = convert_uchar2(px); 132 } 133 134 static void ConvolveOneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out, 135 const uchar *py0, const uchar *py1, const uchar *py2, 136 const float* coeff) { 137 138 uint32_t x1 = rsMax((int32_t)x-1, 0); 139 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 140 141 float px = ((float)py0[x1]) * coeff[0] + 142 ((float)py0[x]) * coeff[1] + 143 ((float)py0[x2]) * coeff[2] + 144 ((float)py1[x1]) * coeff[3] + 145 ((float)py1[x]) * coeff[4] + 146 ((float)py1[x2]) * coeff[5] + 147 ((float)py2[x1]) * coeff[6] + 148 ((float)py2[x]) * coeff[7] + 149 ((float)py2[x2]) * coeff[8]; 150 *out = clamp(px + 0.5f, 0.f, 255.f); 151 } 152 153 static void ConvolveOneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out, 154 const float4 *py0, const float4 *py1, const float4 *py2, 155 const float* coeff) { 156 157 uint32_t x1 = rsMax((int32_t)x-1, 0); 158 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 159 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 160 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 161 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 162 } 163 164 static void ConvolveOneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out, 165 const float2 *py0, const float2 *py1, const float2 *py2, 166 const float* coeff) { 167 168 uint32_t x1 = rsMax((int32_t)x-1, 0); 169 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 170 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 171 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 172 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 173 } 174 175 static void ConvolveOneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out, 176 const float *py0, const float *py1, const float *py2, 177 const float* coeff) { 178 179 uint32_t x1 = rsMax((int32_t)x-1, 0); 180 uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1); 181 *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 182 (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 183 (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 184 } 185 186 void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelDriverInfo *info, 187 uint32_t xstart, uint32_t xend, 188 uint32_t outstep) { 189 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 190 191 if (!cp->mAlloc.get()) { 192 ALOGE("Convolve3x3 executed without input, skipping"); 193 return; 194 } 195 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 196 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 197 198 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 199 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 200 const uchar4 *py0 = (const uchar4 *)(pin + stride * y2); 201 const uchar4 *py1 = (const uchar4 *)(pin + stride * info->current.y); 202 const uchar4 *py2 = (const uchar4 *)(pin + stride * y1); 203 204 uchar4 *out = (uchar4 *)info->outPtr[0]; 205 uint32_t x1 = xstart; 206 uint32_t x2 = xend; 207 if(x1 == 0) { 208 ConvolveOneU4(info, 0, out, py0, py1, py2, cp->mFp); 209 x1 ++; 210 out++; 211 } 212 213 if(x2 > x1) { 214 #if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3) 215 if (gArchUseSIMD) { 216 int32_t len = (x2 - x1 - 1) >> 1; 217 if(len > 0) { 218 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 219 x1 += len << 1; 220 out += len << 1; 221 } 222 } 223 #endif 224 225 while(x1 != x2) { 226 ConvolveOneU4(info, x1, out, py0, py1, py2, cp->mFp); 227 out++; 228 x1++; 229 } 230 } 231 } 232 233 void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelDriverInfo *info, 234 uint32_t xstart, uint32_t xend, 235 uint32_t outstep) { 236 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 237 238 if (!cp->mAlloc.get()) { 239 ALOGE("Convolve3x3 executed without input, skipping"); 240 return; 241 } 242 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 243 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 244 245 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 246 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 247 const uchar2 *py0 = (const uchar2 *)(pin + stride * y2); 248 const uchar2 *py1 = (const uchar2 *)(pin + stride * info->current.y); 249 const uchar2 *py2 = (const uchar2 *)(pin + stride * y1); 250 251 uchar2 *out = (uchar2 *)info->outPtr[0]; 252 uint32_t x1 = xstart; 253 uint32_t x2 = xend; 254 if(x1 == 0) { 255 ConvolveOneU2(info, 0, out, py0, py1, py2, cp->mFp); 256 x1 ++; 257 out++; 258 } 259 260 if(x2 > x1) { 261 #if 0//defined(ARCH_ARM_HAVE_NEON) 262 int32_t len = (x2 - x1 - 1) >> 1; 263 if(len > 0) { 264 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 265 x1 += len << 1; 266 out += len << 1; 267 } 268 #endif 269 270 while(x1 != x2) { 271 ConvolveOneU2(info, x1, out, py0, py1, py2, cp->mFp); 272 out++; 273 x1++; 274 } 275 } 276 } 277 278 void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelDriverInfo *info, 279 uint32_t xstart, uint32_t xend, 280 uint32_t outstep) { 281 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 282 283 if (!cp->mAlloc.get()) { 284 ALOGE("Convolve3x3 executed without input, skipping"); 285 return; 286 } 287 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 288 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 289 290 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 291 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 292 const uchar *py0 = (const uchar *)(pin + stride * y2); 293 const uchar *py1 = (const uchar *)(pin + stride * info->current.y); 294 const uchar *py2 = (const uchar *)(pin + stride * y1); 295 296 uchar *out = (uchar *)info->outPtr[0]; 297 uint32_t x1 = xstart; 298 uint32_t x2 = xend; 299 if(x1 == 0) { 300 ConvolveOneU1(info, 0, out, py0, py1, py2, cp->mFp); 301 x1 ++; 302 out++; 303 } 304 305 if(x2 > x1) { 306 #if 0//defined(ARCH_ARM_HAVE_NEON) 307 int32_t len = (x2 - x1 - 1) >> 1; 308 if(len > 0) { 309 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 310 x1 += len << 1; 311 out += len << 1; 312 } 313 #endif 314 315 while(x1 != x2) { 316 ConvolveOneU1(info, x1, out, py0, py1, py2, cp->mFp); 317 out++; 318 x1++; 319 } 320 } 321 } 322 323 void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelDriverInfo *info, 324 uint32_t xstart, uint32_t xend, 325 uint32_t outstep) { 326 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 327 328 if (!cp->mAlloc.get()) { 329 ALOGE("Convolve3x3 executed without input, skipping"); 330 return; 331 } 332 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 333 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 334 335 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 336 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 337 const float4 *py0 = (const float4 *)(pin + stride * y2); 338 const float4 *py1 = (const float4 *)(pin + stride * info->current.y); 339 const float4 *py2 = (const float4 *)(pin + stride * y1); 340 341 float4 *out = (float4 *)info->outPtr[0]; 342 uint32_t x1 = xstart; 343 uint32_t x2 = xend; 344 if(x1 == 0) { 345 ConvolveOneF4(info, 0, out, py0, py1, py2, cp->mFp); 346 x1 ++; 347 out++; 348 } 349 350 if(x2 > x1) { 351 #if 0//defined(ARCH_ARM_HAVE_NEON) 352 int32_t len = (x2 - x1 - 1) >> 1; 353 if(len > 0) { 354 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 355 x1 += len << 1; 356 out += len << 1; 357 } 358 #endif 359 360 while(x1 != x2) { 361 ConvolveOneF4(info, x1, out, py0, py1, py2, cp->mFp); 362 out++; 363 x1++; 364 } 365 } 366 } 367 368 void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelDriverInfo *info, 369 uint32_t xstart, uint32_t xend, 370 uint32_t outstep) { 371 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 372 373 if (!cp->mAlloc.get()) { 374 ALOGE("Convolve3x3 executed without input, skipping"); 375 return; 376 } 377 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 378 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 379 380 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 381 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 382 const float2 *py0 = (const float2 *)(pin + stride * y2); 383 const float2 *py1 = (const float2 *)(pin + stride * info->current.y); 384 const float2 *py2 = (const float2 *)(pin + stride * y1); 385 386 float2 *out = (float2 *)info->outPtr[0]; 387 uint32_t x1 = xstart; 388 uint32_t x2 = xend; 389 if(x1 == 0) { 390 ConvolveOneF2(info, 0, out, py0, py1, py2, cp->mFp); 391 x1 ++; 392 out++; 393 } 394 395 if(x2 > x1) { 396 #if 0//defined(ARCH_ARM_HAVE_NEON) 397 int32_t len = (x2 - x1 - 1) >> 1; 398 if(len > 0) { 399 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 400 x1 += len << 1; 401 out += len << 1; 402 } 403 #endif 404 405 while(x1 != x2) { 406 ConvolveOneF2(info, x1, out, py0, py1, py2, cp->mFp); 407 out++; 408 x1++; 409 } 410 } 411 } 412 void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelDriverInfo *info, 413 uint32_t xstart, uint32_t xend, 414 uint32_t outstep) { 415 RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr; 416 417 if (!cp->mAlloc.get()) { 418 ALOGE("Convolve3x3 executed without input, skipping"); 419 return; 420 } 421 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 422 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 423 424 uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1)); 425 uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0); 426 const float *py0 = (const float *)(pin + stride * y2); 427 const float *py1 = (const float *)(pin + stride * info->current.y); 428 const float *py2 = (const float *)(pin + stride * y1); 429 430 float *out = (float *)info->outPtr[0]; 431 uint32_t x1 = xstart; 432 uint32_t x2 = xend; 433 if(x1 == 0) { 434 ConvolveOneF1(info, 0, out, py0, py1, py2, cp->mFp); 435 x1 ++; 436 out++; 437 } 438 439 if(x2 > x1) { 440 #if 0//defined(ARCH_ARM_HAVE_NEON) 441 int32_t len = (x2 - x1 - 1) >> 1; 442 if(len > 0) { 443 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 444 x1 += len << 1; 445 out += len << 1; 446 } 447 #endif 448 449 while(x1 != x2) { 450 ConvolveOneF1(info, x1, out, py0, py1, py2, cp->mFp); 451 out++; 452 x1++; 453 } 454 } 455 } 456 457 RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3( 458 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 459 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) { 460 461 if (e->getType() == RS_TYPE_FLOAT_32) { 462 switch(e->getVectorSize()) { 463 case 1: 464 mRootPtr = &kernelF1; 465 break; 466 case 2: 467 mRootPtr = &kernelF2; 468 break; 469 case 3: 470 case 4: 471 mRootPtr = &kernelF4; 472 break; 473 } 474 } else { 475 switch(e->getVectorSize()) { 476 case 1: 477 mRootPtr = &kernelU1; 478 break; 479 case 2: 480 mRootPtr = &kernelU2; 481 break; 482 case 3: 483 case 4: 484 mRootPtr = &kernelU4; 485 break; 486 } 487 } 488 for(int ct=0; ct < 9; ct++) { 489 mFp[ct] = 1.f / 9.f; 490 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 491 } 492 } 493 494 RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() { 495 } 496 497 void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) { 498 s->mHal.info.exportedVariableCount = 2; 499 } 500 501 void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() { 502 mAlloc.clear(); 503 } 504 505 506 RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) { 507 508 return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e); 509 } 510