1 /* 2 * Copyright 2009 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "SkBitmapFilter_opts_SSE2.h" 9 #include "SkBitmapProcState_opts_SSE2.h" 10 #include "SkBitmapProcState_opts_SSSE3.h" 11 #include "SkBitmapScaler.h" 12 #include "SkBlitMask.h" 13 #include "SkBlitRect_opts_SSE2.h" 14 #include "SkBlitRow.h" 15 #include "SkBlitRow_opts_SSE2.h" 16 #include "SkBlitRow_opts_SSE4.h" 17 #include "SkBlurImage_opts_SSE2.h" 18 #include "SkBlurImage_opts_SSE4.h" 19 #include "SkMorphology_opts.h" 20 #include "SkMorphology_opts_SSE2.h" 21 #include "SkRTConf.h" 22 #include "SkUtils.h" 23 #include "SkUtils_opts_SSE2.h" 24 #include "SkXfermode.h" 25 #include "SkXfermode_proccoeff.h" 26 27 #if defined(_MSC_VER) && defined(_WIN64) 28 #include <intrin.h> 29 #endif 30 31 /* This file must *not* be compiled with -msse or any other optional SIMD 32 extension, otherwise gcc may generate SIMD instructions even for scalar ops 33 (and thus give an invalid instruction on Pentium3 on the code below). 34 For example, only files named *_SSE2.cpp in this directory should be 35 compiled with -msse2 or higher. */ 36 37 38 /* Function to get the CPU SSE-level in runtime, for different compilers. */ 39 #ifdef _MSC_VER 40 static inline void getcpuid(int info_type, int info[4]) { 41 #if defined(_WIN64) 42 __cpuid(info, info_type); 43 #else 44 __asm { 45 mov eax, [info_type] 46 cpuid 47 mov edi, [info] 48 mov [edi], eax 49 mov [edi+4], ebx 50 mov [edi+8], ecx 51 mov [edi+12], edx 52 } 53 #endif 54 } 55 #elif defined(__x86_64__) 56 static inline void getcpuid(int info_type, int info[4]) { 57 asm volatile ( 58 "cpuid \n\t" 59 : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) 60 : "a"(info_type) 61 ); 62 } 63 #else 64 static inline void getcpuid(int info_type, int info[4]) { 65 // We save and restore ebx, so this code can be compatible with -fPIC 66 asm volatile ( 67 "pushl %%ebx \n\t" 68 "cpuid \n\t" 69 "movl %%ebx, %1 \n\t" 70 "popl %%ebx \n\t" 71 : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3]) 72 : "a"(info_type) 73 ); 74 } 75 #endif 76 77 //////////////////////////////////////////////////////////////////////////////// 78 79 /* Fetch the SIMD level directly from the CPU, at run-time. 80 * Only checks the levels needed by the optimizations in this file. 81 */ 82 static int get_SIMD_level() { 83 int cpu_info[4] = { 0 }; 84 85 getcpuid(1, cpu_info); 86 if ((cpu_info[2] & (1<<20)) != 0) { 87 return SK_CPU_SSE_LEVEL_SSE42; 88 } else if ((cpu_info[2] & (1<<19)) != 0) { 89 return SK_CPU_SSE_LEVEL_SSE41; 90 } else if ((cpu_info[2] & (1<<9)) != 0) { 91 return SK_CPU_SSE_LEVEL_SSSE3; 92 } else if ((cpu_info[3] & (1<<26)) != 0) { 93 return SK_CPU_SSE_LEVEL_SSE2; 94 } else { 95 return 0; 96 } 97 } 98 99 /* Verify that the requested SIMD level is supported in the build. 100 * If not, check if the platform supports it. 101 */ 102 static inline bool supports_simd(int minLevel) { 103 #if defined(SK_CPU_SSE_LEVEL) 104 if (minLevel <= SK_CPU_SSE_LEVEL) { 105 return true; 106 } else 107 #endif 108 { 109 #if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK) 110 /* For the Android framework we should always know at compile time if the device 111 * we are building for supports SSSE3. The one exception to this rule is on the 112 * emulator where we are compiled without the -mssse3 option (so we have no 113 * SSSE3 procs) but can be run on a host machine that supports SSSE3 114 * instructions. So for that particular case we disable our SSSE3 options. 115 */ 116 return false; 117 #else 118 static int gSIMDLevel = get_SIMD_level(); 119 return (minLevel <= gSIMDLevel); 120 #endif 121 } 122 } 123 124 //////////////////////////////////////////////////////////////////////////////// 125 126 SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", true, "Use SSE optimized version of high quality image filters"); 127 128 void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) { 129 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 130 procs->fExtraHorizontalReads = 3; 131 procs->fConvolveVertically = &convolveVertically_SSE2; 132 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2; 133 procs->fConvolveHorizontally = &convolveHorizontally_SSE2; 134 procs->fApplySIMDPadding = &applySIMDPadding_SSE2; 135 } 136 } 137 138 //////////////////////////////////////////////////////////////////////////////// 139 140 void SkBitmapProcState::platformProcs() { 141 /* Every optimization in the function requires at least SSE2 */ 142 if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 143 return; 144 } 145 146 /* Check fSampleProc32 */ 147 if (fSampleProc32 == S32_opaque_D32_filter_DX) { 148 if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { 149 fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3; 150 } else { 151 fSampleProc32 = S32_opaque_D32_filter_DX_SSE2; 152 } 153 } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) { 154 if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { 155 fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3; 156 } 157 } else if (fSampleProc32 == S32_alpha_D32_filter_DX) { 158 if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { 159 fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3; 160 } else { 161 fSampleProc32 = S32_alpha_D32_filter_DX_SSE2; 162 } 163 } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) { 164 if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { 165 fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3; 166 } 167 } 168 169 /* Check fSampleProc16 */ 170 if (fSampleProc16 == S32_D16_filter_DX) { 171 fSampleProc16 = S32_D16_filter_DX_SSE2; 172 } 173 174 /* Check fMatrixProc */ 175 if (fMatrixProc == ClampX_ClampY_filter_scale) { 176 fMatrixProc = ClampX_ClampY_filter_scale_SSE2; 177 } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) { 178 fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2; 179 } else if (fMatrixProc == ClampX_ClampY_filter_affine) { 180 fMatrixProc = ClampX_ClampY_filter_affine_SSE2; 181 } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) { 182 fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2; 183 } 184 185 /* Check fShaderProc32 */ 186 if (c_hqfilter_sse) { 187 if (fShaderProc32 == highQualityFilter32) { 188 fShaderProc32 = highQualityFilter_SSE2; 189 } 190 } 191 } 192 193 //////////////////////////////////////////////////////////////////////////////// 194 195 static SkBlitRow::Proc platform_16_procs[] = { 196 S32_D565_Opaque_SSE2, // S32_D565_Opaque 197 NULL, // S32_D565_Blend 198 S32A_D565_Opaque_SSE2, // S32A_D565_Opaque 199 NULL, // S32A_D565_Blend 200 S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither 201 NULL, // S32_D565_Blend_Dither 202 S32A_D565_Opaque_Dither_SSE2, // S32A_D565_Opaque_Dither 203 NULL, // S32A_D565_Blend_Dither 204 }; 205 206 SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { 207 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 208 return platform_16_procs[flags]; 209 } else { 210 return NULL; 211 } 212 } 213 214 static SkBlitRow::Proc32 platform_32_procs_SSE2[] = { 215 NULL, // S32_Opaque, 216 S32_Blend_BlitRow32_SSE2, // S32_Blend, 217 S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque 218 S32A_Blend_BlitRow32_SSE2, // S32A_Blend, 219 }; 220 221 #if defined(SK_ATT_ASM_SUPPORTED) 222 static SkBlitRow::Proc32 platform_32_procs_SSE4[] = { 223 NULL, // S32_Opaque, 224 S32_Blend_BlitRow32_SSE2, // S32_Blend, 225 S32A_Opaque_BlitRow32_SSE4_asm, // S32A_Opaque 226 S32A_Blend_BlitRow32_SSE2, // S32A_Blend, 227 }; 228 #endif 229 230 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { 231 #if defined(SK_ATT_ASM_SUPPORTED) 232 if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) { 233 return platform_32_procs_SSE4[flags]; 234 } else 235 #endif 236 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 237 return platform_32_procs_SSE2[flags]; 238 } else { 239 return NULL; 240 } 241 } 242 243 SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { 244 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 245 return Color32_SSE2; 246 } else { 247 return NULL; 248 } 249 } 250 251 SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning 252 253 SkBlitRow::ColorRectProc PlatformColorRectProcFactory() { 254 /* Return NULL for now, since the optimized path in ColorRect32_SSE2 is disabled. 255 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 256 return ColorRect32_SSE2; 257 } else { 258 return NULL; 259 } 260 */ 261 return NULL; 262 } 263 264 //////////////////////////////////////////////////////////////////////////////// 265 266 SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT, 267 SkMask::Format maskFormat, 268 SkColor color) { 269 if (SkMask::kA8_Format != maskFormat) { 270 return NULL; 271 } 272 273 ColorProc proc = NULL; 274 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 275 switch (dstCT) { 276 case kN32_SkColorType: 277 // The SSE2 version is not (yet) faster for black, so we check 278 // for that. 279 if (SK_ColorBLACK != color) { 280 proc = SkARGB32_A8_BlitMask_SSE2; 281 } 282 break; 283 default: 284 break; 285 } 286 } 287 return proc; 288 } 289 290 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { 291 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 292 if (isOpaque) { 293 return SkBlitLCD16OpaqueRow_SSE2; 294 } else { 295 return SkBlitLCD16Row_SSE2; 296 } 297 } else { 298 return NULL; 299 } 300 301 } 302 303 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) { 304 return NULL; 305 } 306 307 //////////////////////////////////////////////////////////////////////////////// 308 309 SkMemset16Proc SkMemset16GetPlatformProc() { 310 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 311 return sk_memset16_SSE2; 312 } else { 313 return NULL; 314 } 315 } 316 317 SkMemset32Proc SkMemset32GetPlatformProc() { 318 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 319 return sk_memset32_SSE2; 320 } else { 321 return NULL; 322 } 323 } 324 325 SkMemcpy32Proc SkMemcpy32GetPlatformProc() { 326 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 327 return sk_memcpy32_SSE2; 328 } else { 329 return NULL; 330 } 331 } 332 333 //////////////////////////////////////////////////////////////////////////////// 334 335 SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) { 336 if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 337 return NULL; 338 } 339 switch (type) { 340 case kDilateX_SkMorphologyProcType: 341 return SkDilateX_SSE2; 342 case kDilateY_SkMorphologyProcType: 343 return SkDilateY_SSE2; 344 case kErodeX_SkMorphologyProcType: 345 return SkErodeX_SSE2; 346 case kErodeY_SkMorphologyProcType: 347 return SkErodeY_SSE2; 348 default: 349 return NULL; 350 } 351 } 352 353 //////////////////////////////////////////////////////////////////////////////// 354 355 bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX, 356 SkBoxBlurProc* boxBlurY, 357 SkBoxBlurProc* boxBlurXY, 358 SkBoxBlurProc* boxBlurYX) { 359 #ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION 360 return false; 361 #else 362 if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) { 363 return SkBoxBlurGetPlatformProcs_SSE4(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX); 364 } 365 else if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 366 return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX); 367 } 368 return false; 369 #endif 370 } 371 372 //////////////////////////////////////////////////////////////////////////////// 373 374 extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec, 375 SkXfermode::Mode mode); 376 377 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec, 378 SkXfermode::Mode mode); 379 380 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec, 381 SkXfermode::Mode mode) { 382 return NULL; 383 } 384 385 SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, 386 SkXfermode::Mode mode); 387 388 SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec, 389 SkXfermode::Mode mode) { 390 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { 391 return SkPlatformXfermodeFactory_impl_SSE2(rec, mode); 392 } else { 393 return SkPlatformXfermodeFactory_impl(rec, mode); 394 } 395 } 396 397 SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode); 398 399 SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) { 400 return NULL; 401 } 402